@@ -131,6 +131,10 @@ def __post_init__(self):
131
131
class UnknownColType (ColType ):
132
132
text : str
133
133
134
def __post_init__(self):
    """Warn that this column type has no dedicated compatibility handling.

    Runs when the instance is initialised and only emits a log record; the
    instance itself is left untouched.
    """
    # Logger.warn() is a deprecated alias of Logger.warning(), so use the
    # supported spelling. The type name is passed as a lazy %-argument and is
    # quoted without padding so the message shows it exactly as stored.
    logger.warning(
        "Column of type '%s' has no compatibility handling. "
        "If encoding/formatting differs between databases, it may result in false positives.",
        self.text,
    )
137
+
134
138
135
139
class AbstractDatabase (ABC ):
136
140
@abstractmethod
@@ -173,16 +177,24 @@ def close(self):
173
177
"Close connection(s) to the database instance. Querying will stop functioning."
174
178
...
175
179
180
+
176
181
@abstractmethod
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression that renders 'value' as a normalized timestamp string.

    The expression produced must be applicable to any SQL datetime/timestamp
    value, and must evaluate to a string.

    Expected date format: "YYYY-MM-DD HH:mm:SS.FFFFFF"

    Whether the precision of dates is rounded up/down is governed by
    coltype.rounds.
    """
    ...
181
192
182
- - Dates are expected in the format:
183
- "YYYY-MM-DD HH:mm:SS.FFFFFF"
193
+ @abstractmethod
194
+ def normalize_number (self , value : str , coltype : ColType ) -> str :
195
+ """Creates an SQL expression, that converts 'value' to a normalized number.
184
196
185
- Rounded up/down according to coltype.rounds
197
+ The returned expression must accept any SQL int/numeric/float, and return a string.
186
198
187
199
- Floats/Decimals are expected in the format
188
200
"I.P"
@@ -191,14 +203,31 @@ def normalize_value_by_type(value: str, coltype: ColType) -> str:
191
203
and must be at least one digit (0).
192
204
P is the fractional digits, the amount of which is specified with
193
205
coltype.precision. Trailing zeroes may be necessary.
206
+ If P is 0, the dot is omitted.
194
207
195
208
Note: This precision is different than the one used by databases. For decimals,
196
- it's the same as "numeric_scale", and for floats, who use binary precision,
197
- it can be calculated as log10(2**p)
209
+ it's the same as ``numeric_scale``, and for floats, who use binary precision,
210
+ it can be calculated as ``log10(2**numeric_precision)``.
211
+ """
212
+ ...
213
+
214
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
    """Build an SQL expression that renders 'value' in its normalized string form.

    The expression produced must be applicable to any SQL value, and must
    evaluate to a string.

    This default implementation picks a handler from the class of ``coltype``:

        TemporalType    -> normalize_timestamp()
        NumericType     -> normalize_number()
        anything else   -> to_string()
    """
    if isinstance(coltype, TemporalType):
        handler = self.normalize_timestamp
    elif isinstance(coltype, NumericType):
        handler = self.normalize_number
    else:
        # No type-specific handling: only stringify the expression.
        return self.to_string(f"{value}")
    return handler(value, coltype)
202
231
203
232
204
233
class Database (AbstractDatabase ):
@@ -410,27 +439,16 @@ def md5_to_int(self, s: str) -> str:
410
439
def to_string(self, s: str):
    """Return the SQL expression ``s`` followed by a ``::varchar`` cast."""
    cast_suffix = " ::varchar"
    return f"{s}{cast_suffix}"
412
441
413
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
414
- if isinstance (coltype , TemporalType ):
415
- # if coltype.precision == 0:
416
- # return f"to_char({value}::timestamp(0), 'YYYY-mm-dd HH24:MI:SS')"
417
- # if coltype.precision == 3:
418
- # return f"to_char({value}, 'YYYY-mm-dd HH24:MI:SS.US')"
419
- # elif coltype.precision == 6:
420
- # return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
421
- # else:
422
- # # Postgres/Redshift doesn't support arbitrary precision
423
- # raise TypeError(f"Bad precision for {type(self).__name__}: {coltype})")
424
- if coltype .rounds :
425
- return f"to_char({ value } ::timestamp({ coltype .precision } ), 'YYYY-mm-dd HH24:MI:SS.US')"
426
- else :
427
- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
428
- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
429
442
430
- elif isinstance (coltype , NumericType ):
431
- value = f"{ value } ::decimal(38, { coltype .precision } )"
443
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    When ``coltype.rounds`` is truthy, the value is cast to
    ``timestamp(coltype.precision)`` before ``to_char`` formats it.
    Otherwise the value is formatted at ``timestamp(6)``, cut down to
    ``TIMESTAMP_PRECISION_POS + coltype.precision`` characters with ``LEFT``
    and padded back with '0' up to ``TIMESTAMP_PRECISION_POS + 6`` characters.
    """
    date_format = "'YYYY-mm-dd HH24:MI:SS.US'"
    if coltype.rounds:
        return f"to_char({value} ::timestamp({coltype.precision} ), {date_format})"

    full_precision = f"to_char({value} ::timestamp(6), {date_format})"
    kept_chars = TIMESTAMP_PRECISION_POS + coltype.precision
    total_chars = TIMESTAMP_PRECISION_POS + 6
    return f"RPAD(LEFT({full_precision} , {kept_chars} ), {total_chars} , '0')"
449
+
450
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a fixed-scale decimal string.

    The value is cast to ``decimal(38, coltype.precision)`` and the result is
    handed to ``self.to_string``.
    """
    as_decimal = f"{value} ::decimal(38, {coltype.precision} )"
    return self.to_string(as_decimal)
434
452
435
453
436
454
class Presto (Database ):
@@ -470,25 +488,19 @@ def _query(self, sql_code: str) -> list:
470
488
def close(self):
    """Close the connection object held in ``self._conn``."""
    connection = self._conn
    connection.close()
472
490
473
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
474
- if isinstance (coltype , TemporalType ):
475
- if coltype .rounds :
476
- if coltype .precision > 3 :
477
- pass
478
- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
479
- else :
480
- s = f"date_format(cast({ value } as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
481
- # datetime = f"date_format(cast({value} as timestamp(6), '%Y-%m-%d %H:%i:%S.%f'))"
482
- # datetime = self.to_string(f"cast({value} as datetime(6))")
483
-
484
- return (
485
- f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
486
- )
491
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    The value is cast to ``timestamp(6)`` and formatted with ``date_format``;
    the text is then fitted to ``TIMESTAMP_PRECISION_POS + coltype.precision``
    characters (padding with '.') and finally to
    ``TIMESTAMP_PRECISION_POS + 6`` characters (padding with '0').
    """
    # TODO: the rounding and non-rounding cases currently produce the very
    # same expression, so ``coltype.rounds`` makes no difference here yet.
    formatted = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"

    kept_chars = TIMESTAMP_PRECISION_POS + coltype.precision
    total_chars = TIMESTAMP_PRECISION_POS + 6
    return f"RPAD(RPAD({formatted} , {kept_chars} , '.'), {total_chars} , '0')"
490
501
491
- return self .to_string (value )
502
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a fixed-scale decimal string.

    The value is cast to ``decimal(38,coltype.precision)`` and the result is
    handed to ``self.to_string``.
    """
    as_decimal = f"cast({value} as decimal(38,{coltype.precision} ))"
    return self.to_string(as_decimal)
492
504
493
505
def select_table_schema (self , path : DbPath ) -> str :
494
506
schema , table = self ._normalize_table_path (path )
@@ -577,18 +589,16 @@ def md5_to_int(self, s: str) -> str:
577
589
def to_string(self, s: str):
    """Return the SQL expression ``s`` wrapped in ``cast(... as char)``."""
    return "cast({} as char)".format(s)
579
591
580
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
581
- if isinstance (coltype , TemporalType ):
582
- if coltype .rounds :
583
- return self .to_string (f"cast( cast({ value } as datetime({ coltype .precision } )) as datetime(6))" )
584
- else :
585
- s = self .to_string (f"cast({ value } as datetime(6))" )
586
- return f"RPAD(RPAD({ s } , { TIMESTAMP_PRECISION_POS + coltype .precision } , '.'), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
592
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    When ``coltype.rounds`` is truthy, the value is cast to
    ``datetime(coltype.precision)``, then to ``datetime(6)``, and stringified
    through ``self.to_string``. Otherwise the ``datetime(6)`` string is fitted
    to ``TIMESTAMP_PRECISION_POS + coltype.precision`` characters (padding
    with '.') and then to ``TIMESTAMP_PRECISION_POS + 6`` characters (padding
    with '0').
    """
    if coltype.rounds:
        rounded = f"cast( cast({value} as datetime({coltype.precision} )) as datetime(6))"
        return self.to_string(rounded)

    as_text = self.to_string(f"cast({value} as datetime(6))")
    kept_chars = TIMESTAMP_PRECISION_POS + coltype.precision
    total_chars = TIMESTAMP_PRECISION_POS + 6
    return f"RPAD(RPAD({as_text} , {kept_chars} , '.'), {total_chars} , '0')"
598
+
599
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a fixed-scale decimal string.

    The value is cast to ``decimal(38, coltype.precision)`` and the result is
    handed to ``self.to_string``.
    """
    as_decimal = f"cast({value} as decimal(38, {coltype.precision} ))"
    return self.to_string(as_decimal)
590
601
591
- return self .to_string (f"{ value } " )
592
602
593
603
594
604
class Oracle (ThreadedDatabase ):
@@ -633,16 +643,15 @@ def select_table_schema(self, path: DbPath) -> str:
633
643
f" FROM USER_TAB_COLUMNS WHERE table_name = '{ table .upper ()} '"
634
644
)
635
645
636
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
637
- if isinstance (coltype , TemporalType ):
638
- return f"to_char(cast({ value } as timestamp({ coltype .precision } )), 'YYYY-MM-DD HH24:MI:SS.FF6')"
639
- elif isinstance (coltype , NumericType ):
640
- # FM999.9990
641
- format_str = "FM" + "9" * (38 - coltype .precision )
642
- if coltype .precision :
643
- format_str += "0." + "9" * (coltype .precision - 1 ) + "0"
644
- return f"to_char({ value } , '{ format_str } ')"
645
- return self .to_string (f"{ value } " )
646
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    The value is cast to ``timestamp(coltype.precision)`` and formatted with
    ``to_char``. ``coltype.rounds`` is not consulted by this implementation.
    """
    narrowed = f"cast({value} as timestamp({coltype.precision} ))"
    return f"to_char({narrowed}, 'YYYY-MM-DD HH24:MI:SS.FF6')"
648
+
649
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a fixed-scale number string.

    A ``to_char`` number format model is assembled from ``coltype.precision``:
    "FM" followed by ``38 - precision`` nines, and, when the precision is
    non-zero, "0." plus ``precision`` fractional positions whose last one is
    a forced "0" (shape: FM999.9990).
    """
    precision = coltype.precision
    format_str = "FM" + "9" * (38 - precision)
    if precision:
        format_str += "0." + "9" * (precision - 1) + "0"
    # The closing quote must follow the model directly: a blank inside the
    # quotes would become part of the format model itself.
    return f"to_char({value} , '{format_str}')"
646
655
647
656
def _parse_type (
648
657
self , type_repr : str , datetime_precision : int = None , numeric_precision : int = None , numeric_scale : int = None
@@ -693,27 +702,25 @@ class Redshift(Postgres):
693
702
def md5_to_int(self, s: str) -> str:
    """Build the SQL expression that turns the tail of ``md5(s)`` into a ``decimal(38)``.

    The hex digest is cut with ``substring`` starting at position
    ``1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS`` and parsed as base 16 with
    ``strtol``.
    """
    first_kept_digit = 1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS
    return f"strtol(substring(md5({s} ), {first_kept_digit} ), 16)::decimal(38)"
695
704
696
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
697
- if isinstance (coltype , TemporalType ):
698
- if coltype .rounds :
699
- timestamp = f"{ value } ::timestamp(6)"
700
- # Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
701
- secs = f"timestamp 'epoch' + round(extract(epoch from { timestamp } )::decimal(38)"
702
- # Get the milliseconds from timestamp.
703
- ms = f"extract(ms from { timestamp } )"
704
- # Get the microseconds from timestamp, without the milliseconds!
705
- us = f"extract(us from { timestamp } )"
706
- # epoch = Total time since epoch in microseconds.
707
- epoch = f"{ secs } *1000000 + { ms } *1000 + { us } "
708
- timestamp6 = f"to_char({ epoch } , -6+{ coltype .precision } ) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
709
- else :
710
- timestamp6 = f"to_char({ value } ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
711
- return f"RPAD(LEFT({ timestamp6 } , { TIMESTAMP_PRECISION_POS + coltype .precision } ), { TIMESTAMP_PRECISION_POS + 6 } , '0')"
712
-
713
- elif isinstance (coltype , NumericType ):
714
- value = f"{ value } ::decimal(38,{ coltype .precision } )"
705
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    When ``coltype.rounds`` is truthy, the timestamp is rebuilt from a
    microsecond count that is rounded with ``round(..., -6 + precision)``.
    Otherwise the value is formatted directly at ``timestamp(6)``. In both
    cases the text is cut to ``TIMESTAMP_PRECISION_POS + coltype.precision``
    characters with ``LEFT`` and padded with '0' up to
    ``TIMESTAMP_PRECISION_POS + 6`` characters.
    """
    if not coltype.rounds:
        timestamp6 = f"to_char({value} ::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
    else:
        ts = f"{value} ::timestamp(6)"
        # Seconds since epoch. Redshift doesn't support milli- or micro-seconds here.
        # Note: this fragment opens a round( call that is only closed below.
        whole_seconds = f"timestamp 'epoch' + round(extract(epoch from {ts} )::decimal(38)"
        # The milliseconds of the timestamp.
        millis = f"extract(ms from {ts} )"
        # The microseconds of the timestamp, without the milliseconds.
        micros = f"extract(us from {ts} )"
        # Total time since epoch, in microseconds.
        total_micros = f"{whole_seconds} *1000000 + {millis} *1000 + {micros} "
        timestamp6 = (
            f"to_char({total_micros} , -6+{coltype.precision} ) * interval '0.000001 seconds', "
            "'YYYY-mm-dd HH24:MI:SS.US')"
        )

    kept_chars = TIMESTAMP_PRECISION_POS + coltype.precision
    total_chars = TIMESTAMP_PRECISION_POS + 6
    return f"RPAD(LEFT({timestamp6} , {kept_chars} ), {total_chars} , '0')"
720
+
721
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a fixed-scale decimal string.

    The value is cast to ``decimal(38,coltype.precision)`` and the result is
    handed to ``self.to_string``.
    """
    as_decimal = f"{value} ::decimal(38,{coltype.precision} )"
    return self.to_string(as_decimal)
715
723
716
- return self .to_string (f"{ value } " )
717
724
718
725
def select_table_schema (self , path : DbPath ) -> str :
719
726
schema , table = self ._normalize_table_path (path )
@@ -813,27 +820,23 @@ def select_table_schema(self, path: DbPath) -> str:
813
820
f"WHERE table_name = '{ table } ' AND table_schema = '{ schema } '"
814
821
)
815
822
816
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
817
- if isinstance (coltype , TemporalType ):
818
- if coltype .rounds :
819
- timestamp = f"timestamp_micros(cast(round(unix_micros(cast({ value } as timestamp))/1000000, { coltype .precision } )*1000000 as int))"
820
- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { timestamp } )"
821
- else :
822
- if coltype .precision == 0 :
823
- return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, { value } )"
824
- elif coltype .precision == 6 :
825
- return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', { value } )"
823
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    When ``coltype.rounds`` is truthy, the value is converted to microseconds
    since the epoch, rounded to ``coltype.precision`` fractional digits of a
    second, and formatted with ``FORMAT_TIMESTAMP``. Otherwise:

    * precision 0 formats whole seconds and appends a literal ``.000000``;
    * precision 6 formats with ``%E6S`` as-is;
    * any other precision formats with ``%E6S``, keeps
      ``TIMESTAMP_PRECISION_POS + coltype.precision`` characters with ``LEFT``
      and pads with '0' up to ``TIMESTAMP_PRECISION_POS + 6`` characters.
    """
    if coltype.rounds:
        timestamp = (
            f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, "
            f"{coltype.precision} )*1000000 as int))"
        )
        return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp} )"

    if coltype.precision == 0:
        # The format string literal has to be closed before the comma;
        # without the closing quote the generated SQL is malformed.
        return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000', {value} )"
    elif coltype.precision == 6:
        return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value} )"

    timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value} )"
    return f"RPAD(LEFT({timestamp6} , {TIMESTAMP_PRECISION_POS + coltype.precision} ), {TIMESTAMP_PRECISION_POS + 6} , '0')"
835
835
836
- return self .to_string (f"{ value } " )
836
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a normalized number string.

    ``Integer`` columns are only stringified through ``self.to_string``. Every
    other numeric column is rendered with ``format`` and a ``%.<precision>f``
    specifier, where the precision comes from ``coltype.precision``.
    """
    if isinstance(coltype, Integer):
        return self.to_string(value)
    # The conversion letter must directly follow the precision ('%.3f');
    # a blank between them would not form a valid float specifier.
    return f"format('%.{coltype.precision}f', {value} )"
837
840
838
841
def parse_table_name (self , name : str ) -> DbPath :
839
842
path = parse_table_name (name )
@@ -907,19 +910,16 @@ def select_table_schema(self, path: DbPath) -> str:
907
910
schema , table = self ._normalize_table_path (path )
908
911
return super ().select_table_schema ((schema , table ))
909
912
910
- def normalize_value_by_type (self , value : str , coltype : ColType ) -> str :
911
- if isinstance (coltype , TemporalType ):
912
- if coltype .rounds :
913
- timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, { value } ::timestamp(9))/1000000000, { coltype .precision } ))"
914
- else :
915
- timestamp = f"cast({ value } as timestamp({ coltype .precision } ))"
916
-
917
- return f"to_char({ timestamp } , 'YYYY-MM-DD HH24:MI:SS.FF6')"
913
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that formats ``value`` as a 6-fraction-digit timestamp string.

    When ``coltype.rounds`` is truthy, the value's epoch nanoseconds are
    divided down to seconds, rounded to ``coltype.precision`` digits and
    turned back into a timestamp. Otherwise the value is cast to
    ``timestamp(coltype.precision)``. Either result is formatted with
    ``to_char``.
    """
    precision = coltype.precision
    timestamp = (
        f"to_timestamp(round(date_part(epoch_nanosecond, {value} ::timestamp(9))/1000000000, {precision} ))"
        if coltype.rounds
        else f"cast({value} as timestamp({precision} ))"
    )
    return f"to_char({timestamp} , 'YYYY-MM-DD HH24:MI:SS.FF6')"
921
920
922
- return self .to_string (f"{ value } " )
921
def normalize_number(self, value: str, coltype: ColType) -> str:
    """Build the SQL expression that renders ``value`` as a fixed-scale decimal string.

    The value is cast to ``decimal(38, coltype.precision)`` and the result is
    handed to ``self.to_string``.
    """
    as_decimal = f"cast({value} as decimal(38, {coltype.precision} ))"
    return self.to_string(as_decimal)
923
923
924
924
925
925
@dataclass
0 commit comments