Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 469f142

Browse files
committed
Refactor into normalize_timestamp() normalize_number()
1 parent b3e1d8a commit 469f142

File tree

1 file changed

+114
-114
lines changed

1 file changed

+114
-114
lines changed

data_diff/database.py

Lines changed: 114 additions & 114 deletions
Original file line number | Diff line number | Diff line change
@@ -131,6 +131,10 @@ def __post_init__(self):
131131
class UnknownColType(ColType):
132132
text: str
133133

134+
def __post_init__(self):
    """Warn that this column type has no normalization support.

    Values of such a column are compared without any type-specific
    normalization, so differences in encoding/formatting between
    databases can be reported as mismatches.
    """
    # Logger.warn is a deprecated alias; Logger.warning is the supported name.
    logger.warning(
        f"Column of type '{self.text}' has no compatibility handling. "
        "If encoding/formatting differs between databases, it may result in false positives."
    )
137+
134138

135139
class AbstractDatabase(ABC):
136140
@abstractmethod
@@ -173,16 +177,24 @@ def close(self):
173177
"Close connection(s) to the database instance. Querying will stop functioning."
174178
...
175179

180+
176181
@abstractmethod
177-
def normalize_value_by_type(value: str, coltype: ColType) -> str:
178-
"""Creates an SQL expression, that converts 'value' to a normalized representation.
182+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
183+
"""Creates an SQL expression, that converts 'value' to a normalized timestamp.
179184
180-
The returned expression must accept any SQL value, and return a string.
185+
The returned expression must accept any SQL datetime/timestamp, and return a string.
186+
187+
Date format: "YYYY-MM-DD HH:mm:SS.FFFFFF"
188+
189+
Precision of dates should be rounded up/down according to coltype.rounds
190+
"""
191+
...
181192

182-
- Dates are expected in the format:
183-
"YYYY-MM-DD HH:mm:SS.FFFFFF"
193+
@abstractmethod
194+
def normalize_number(self, value: str, coltype: ColType) -> str:
195+
"""Creates an SQL expression, that converts 'value' to a normalized number.
184196
185-
Rounded up/down according to coltype.rounds
197+
The returned expression must accept any SQL int/numeric/float, and return a string.
186198
187199
- Floats/Decimals are expected in the format
188200
"I.P"
@@ -191,14 +203,31 @@ def normalize_value_by_type(value: str, coltype: ColType) -> str:
191203
and must be at least one digit (0).
192204
P is the fractional digits, the amount of which is specified with
193205
coltype.precision. Trailing zeroes may be necessary.
206+
If P is 0, the dot is omitted.
194207
195208
Note: This precision is different than the one used by databases. For decimals,
196-
it's the same as "numeric_scale", and for floats, who use binary precision,
197-
it can be calculated as log10(2**p)
209+
it's the same as ``numeric_scale``, and for floats, which use binary precision,
210+
it can be calculated as ``log10(2**numeric_precision)``.
211+
"""
212+
...
213+
214+
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
215+
"""Creates an SQL expression, that converts 'value' to a normalized representation.
216+
217+
The returned expression must accept any SQL value, and return a string.
218+
219+
The default implementation dispatches to a method according to ``coltype``:
198220
221+
TemporalType -> normalize_timestamp()
222+
NumericType -> normalize_number()
223+
-else- -> to_string()
199224
200225
"""
201-
...
226+
if isinstance(coltype, TemporalType):
227+
return self.normalize_timestamp(value, coltype)
228+
elif isinstance(coltype, NumericType):
229+
return self.normalize_number(value, coltype)
230+
return self.to_string(f"{value}")
202231

203232

204233
class Database(AbstractDatabase):
@@ -410,27 +439,16 @@ def md5_to_int(self, s: str) -> str:
410439
def to_string(self, s: str):
411440
return f"{s}::varchar"
412441

413-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
414-
if isinstance(coltype, TemporalType):
415-
# if coltype.precision == 0:
416-
# return f"to_char({value}::timestamp(0), 'YYYY-mm-dd HH24:MI:SS')"
417-
# if coltype.precision == 3:
418-
# return f"to_char({value}, 'YYYY-mm-dd HH24:MI:SS.US')"
419-
# elif coltype.precision == 6:
420-
# return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
421-
# else:
422-
# # Postgres/Redshift doesn't support arbitrary precision
423-
# raise TypeError(f"Bad precision for {type(self).__name__}: {coltype})")
424-
if coltype.rounds:
425-
return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
426-
else:
427-
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
428-
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
429442

430-
elif isinstance(coltype, NumericType):
431-
value = f"{value}::decimal(38, {coltype.precision})"
443+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
444+
if coltype.rounds:
445+
return f"to_char({value}::timestamp({coltype.precision}), 'YYYY-mm-dd HH24:MI:SS.US')"
432446

433-
return self.to_string(f"{value}")
447+
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
448+
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
449+
450+
def normalize_number(self, value: str, coltype: ColType) -> str:
451+
return self.to_string(f"{value}::decimal(38, {coltype.precision})")
434452

435453

436454
class Presto(Database):
@@ -470,25 +488,19 @@ def _query(self, sql_code: str) -> list:
470488
def close(self):
471489
self._conn.close()
472490

473-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
474-
if isinstance(coltype, TemporalType):
475-
if coltype.rounds:
476-
if coltype.precision > 3:
477-
pass
478-
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
479-
else:
480-
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
481-
# datetime = f"date_format(cast({value} as timestamp(6), '%Y-%m-%d %H:%i:%S.%f'))"
482-
# datetime = self.to_string(f"cast({value} as datetime(6))")
483-
484-
return (
485-
f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
486-
)
491+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
492+
# TODO
493+
if coltype.rounds:
494+
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
495+
else:
496+
s = f"date_format(cast({value} as timestamp(6)), '%Y-%m-%d %H:%i:%S.%f')"
487497

488-
elif isinstance(coltype, NumericType):
489-
value = f"cast({value} as decimal(38,{coltype.precision}))"
498+
return (
499+
f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
500+
)
490501

491-
return self.to_string(value)
502+
def normalize_number(self, value: str, coltype: ColType) -> str:
503+
return self.to_string(f"cast({value} as decimal(38,{coltype.precision}))")
492504

493505
def select_table_schema(self, path: DbPath) -> str:
494506
schema, table = self._normalize_table_path(path)
@@ -577,18 +589,16 @@ def md5_to_int(self, s: str) -> str:
577589
def to_string(self, s: str):
578590
return f"cast({s} as char)"
579591

580-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
581-
if isinstance(coltype, TemporalType):
582-
if coltype.rounds:
583-
return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))")
584-
else:
585-
s = self.to_string(f"cast({value} as datetime(6))")
586-
return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
592+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
593+
if coltype.rounds:
594+
return self.to_string(f"cast( cast({value} as datetime({coltype.precision})) as datetime(6))")
587595

588-
elif isinstance(coltype, NumericType):
589-
value = f"cast({value} as decimal(38,{coltype.precision}))"
596+
s = self.to_string(f"cast({value} as datetime(6))")
597+
return f"RPAD(RPAD({s}, {TIMESTAMP_PRECISION_POS+coltype.precision}, '.'), {TIMESTAMP_PRECISION_POS+6}, '0')"
598+
599+
def normalize_number(self, value: str, coltype: ColType) -> str:
600+
return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")
590601

591-
return self.to_string(f"{value}")
592602

593603

594604
class Oracle(ThreadedDatabase):
@@ -633,16 +643,15 @@ def select_table_schema(self, path: DbPath) -> str:
633643
f" FROM USER_TAB_COLUMNS WHERE table_name = '{table.upper()}'"
634644
)
635645

636-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
637-
if isinstance(coltype, TemporalType):
638-
return f"to_char(cast({value} as timestamp({coltype.precision})), 'YYYY-MM-DD HH24:MI:SS.FF6')"
639-
elif isinstance(coltype, NumericType):
640-
# FM999.9990
641-
format_str = "FM" + "9" * (38 - coltype.precision)
642-
if coltype.precision:
643-
format_str += "0." + "9" * (coltype.precision - 1) + "0"
644-
return f"to_char({value}, '{format_str}')"
645-
return self.to_string(f"{value}")
646+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
647+
return f"to_char(cast({value} as timestamp({coltype.precision})), 'YYYY-MM-DD HH24:MI:SS.FF6')"
648+
649+
def normalize_number(self, value: str, coltype: ColType) -> str:
650+
# FM999.9990
651+
format_str = "FM" + "9" * (38 - coltype.precision)
652+
if coltype.precision:
653+
format_str += "0." + "9" * (coltype.precision - 1) + "0"
654+
return f"to_char({value}, '{format_str}')"
646655

647656
def _parse_type(
648657
self, type_repr: str, datetime_precision: int = None, numeric_precision: int = None, numeric_scale: int = None
@@ -693,27 +702,25 @@ class Redshift(Postgres):
693702
def md5_to_int(self, s: str) -> str:
694703
return f"strtol(substring(md5({s}), {1+MD5_HEXDIGITS-CHECKSUM_HEXDIGITS}), 16)::decimal(38)"
695704

696-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
697-
if isinstance(coltype, TemporalType):
698-
if coltype.rounds:
699-
timestamp = f"{value}::timestamp(6)"
700-
# Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
701-
secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
702-
# Get the milliseconds from timestamp.
703-
ms = f"extract(ms from {timestamp})"
704-
# Get the microseconds from timestamp, without the milliseconds!
705-
us = f"extract(us from {timestamp})"
706-
# epoch = Total time since epoch in microseconds.
707-
epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
708-
timestamp6 = f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
709-
else:
710-
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
711-
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
712-
713-
elif isinstance(coltype, NumericType):
714-
value = f"{value}::decimal(38,{coltype.precision})"
705+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
706+
if coltype.rounds:
707+
timestamp = f"{value}::timestamp(6)"
708+
# Get seconds since epoch. Redshift doesn't support milli- or micro-seconds.
709+
secs = f"timestamp 'epoch' + round(extract(epoch from {timestamp})::decimal(38)"
710+
# Get the milliseconds from timestamp.
711+
ms = f"extract(ms from {timestamp})"
712+
# Get the microseconds from timestamp, without the milliseconds!
713+
us = f"extract(us from {timestamp})"
714+
# epoch = Total time since epoch in microseconds.
715+
epoch = f"{secs}*1000000 + {ms}*1000 + {us}"
716+
timestamp6 = f"to_char({epoch}, -6+{coltype.precision}) * interval '0.000001 seconds', 'YYYY-mm-dd HH24:MI:SS.US')"
717+
else:
718+
timestamp6 = f"to_char({value}::timestamp(6), 'YYYY-mm-dd HH24:MI:SS.US')"
719+
return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
720+
721+
def normalize_number(self, value: str, coltype: ColType) -> str:
722+
return self.to_string(f"{value}::decimal(38,{coltype.precision})")
715723

716-
return self.to_string(f"{value}")
717724

718725
def select_table_schema(self, path: DbPath) -> str:
719726
schema, table = self._normalize_table_path(path)
@@ -813,27 +820,23 @@ def select_table_schema(self, path: DbPath) -> str:
813820
f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
814821
)
815822

816-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
817-
if isinstance(coltype, TemporalType):
818-
if coltype.rounds:
819-
timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
820-
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"
821-
else:
822-
if coltype.precision == 0:
823-
return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000, {value})"
824-
elif coltype.precision == 6:
825-
return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
823+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
    """Build a SQL expression that renders ``value`` as a normalized timestamp string.

    The target format is "YYYY-MM-DD HH:mm:SS.FFFFFF" (always six
    fractional digits).

    Args:
        value: SQL expression (e.g. a column reference) yielding a timestamp.
        coltype: Column type; ``coltype.rounds`` selects rounding vs.
            truncation, and ``coltype.precision`` is the number of
            significant fractional-second digits.

    Returns:
        A SQL expression, as a string, that evaluates to the formatted text.
    """
    if coltype.rounds:
        # Round the epoch (in seconds) to the requested number of fractional
        # digits, then convert back to a microsecond timestamp for formatting.
        timestamp = f"timestamp_micros(cast(round(unix_micros(cast({value} as timestamp))/1000000, {coltype.precision})*1000000 as int))"
        return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {timestamp})"

    if coltype.precision == 0:
        # Whole seconds only: emit a literal all-zero fraction.
        # The format string must be closed before the comma, otherwise the
        # generated SQL contains an unterminated string literal.
        return f"FORMAT_TIMESTAMP('%F %H:%M:%S.000000', {value})"
    if coltype.precision == 6:
        # Already at full microsecond precision; no truncation needed.
        return f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"

    # Truncate to the requested precision, then right-pad with zeros back to
    # six fractional digits.
    # NOTE(review): TIMESTAMP_PRECISION_POS is presumably the length of the
    # formatted string up to and including the decimal point - confirm.
    timestamp6 = f"FORMAT_TIMESTAMP('%F %H:%M:%E6S', {value})"
    return f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
835835

836-
return self.to_string(f"{value}")
836+
def normalize_number(self, value: str, coltype: ColType) -> str:
837+
if isinstance(coltype, Integer):
838+
return self.to_string(value)
839+
return f"format('%.{coltype.precision}f', {value})"
837840

838841
def parse_table_name(self, name: str) -> DbPath:
839842
path = parse_table_name(name)
@@ -907,19 +910,16 @@ def select_table_schema(self, path: DbPath) -> str:
907910
schema, table = self._normalize_table_path(path)
908911
return super().select_table_schema((schema, table))
909912

910-
def normalize_value_by_type(self, value: str, coltype: ColType) -> str:
911-
if isinstance(coltype, TemporalType):
912-
if coltype.rounds:
913-
timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, {value}::timestamp(9))/1000000000, {coltype.precision}))"
914-
else:
915-
timestamp = f"cast({value} as timestamp({coltype.precision}))"
916-
917-
return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"
913+
def normalize_timestamp(self, value: str, coltype: ColType) -> str:
914+
if coltype.rounds:
915+
timestamp = f"to_timestamp(round(date_part(epoch_nanosecond, {value}::timestamp(9))/1000000000, {coltype.precision}))"
916+
else:
917+
timestamp = f"cast({value} as timestamp({coltype.precision}))"
918918

919-
elif isinstance(coltype, NumericType):
920-
value = f"cast({value} as decimal(38, {coltype.precision}))"
919+
return f"to_char({timestamp}, 'YYYY-MM-DD HH24:MI:SS.FF6')"
921920

922-
return self.to_string(f"{value}")
921+
def normalize_number(self, value: str, coltype: ColType) -> str:
922+
return self.to_string(f"cast({value} as decimal(38, {coltype.precision}))")
923923

924924

925925
@dataclass

0 commit comments

Comments
 (0)