Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit f02e9e6

Browse files
Leo Folsom
authored and committed
Merge branch 'master' into readme-ideas
2 parents aba16b1 + ba223d1 commit f02e9e6

File tree

13 files changed

+337
-108
lines changed

13 files changed

+337
-108
lines changed

CONTRIBUTING.md

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ Contributions are very welcome! We'll be happy to help you in the process.
44

55
## What should I know before I get started?
66

7-
Go through the README and the documentation, and make sure that you understand how data-diff works.
7+
Go through the README and the documentation, and make sure that you understand how data-diff works.
88

99
## How to contribute?
1010

@@ -13,12 +13,12 @@ Go through the README and the documentation, and make sure that you understand h
1313
Please report the bug with as many details as you can.
1414

1515
1. Include the exact command that you used. Make sure to run data-diff with the `-d` flag for debug output.
16-
2. Provide the entire output of the command. (stdout, logs, exception)
16+
2. Provide the entire output of the command. (stdout, logs, exception)
1717
3. If possible, show us how we could reproduce the bug. i.e. how to set up an environment in which it occurs.
1818

1919
(When pasting, always make sure to redact sensitive information, like passwords.)
2020

21-
If data-diff returns incorrect results, i.e. false-positive or false-negative, please also include the original values.
21+
If data-diff returns incorrect results, i.e. false-positive or false-negative, please also include the original values.
2222

2323
Before you report a bug, make sure it doesn't already exist.
2424

@@ -66,7 +66,7 @@ Make sure to update the appropriate `TEST_*_CONN_STRING`, so that it will be inc
6666

6767
You can run the tests with `unittest`.
6868

69-
When running against multiple databases, the tests can take a long while.
69+
When running against multiple databases, the tests can take a long while.
7070

7171
To save time, we recommend running them with `unittest-parallel`.
7272

@@ -76,6 +76,6 @@ When debugging, we recommend using the `-f` flag, to stop on error. Also, use th
7676

7777
New databases should be added as a new module in the `data-diff/databases/` folder.
7878

79-
Make sure to update the `DATABASE_TYPES` dictionary in `tests/test_database_types.py`, so that it will be included in the tests.
80-
8179
If possible, please also add the database setup to `docker-compose.yml`, so that we can run and test it for ourselves. If you do, also update the CI (`ci.yml`).
80+
81+
Guide to implementing a new database driver: https://data-diff.readthedocs.io/en/latest/new-database-driver-guide.html

data_diff/__main__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,6 @@ def _main(
197197
bisection_threshold=bisection_threshold,
198198
threaded=threaded,
199199
max_threadpool_size=threads and threads * 2,
200-
debug=debug,
201200
)
202201

203202
if database1 is None or database2 is None:

data_diff/config.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,20 @@ def _apply_config(config: Dict[str, Any], run_name: str, kw: Dict[str, Any]):
2626
else:
2727
run_name = "default"
2828

29+
if 'database1' in kw:
30+
for attr in ('table1', 'database2', 'table2'):
31+
if kw[attr] is None:
32+
raise ValueError(f"Specified database1 but not {attr}. Must specify all 4 arguments, or neither.")
33+
34+
for index in "12":
35+
run_args[index] = {attr: kw.pop(f"{attr}{index}") for attr in ('database', 'table')}
36+
2937
# Process databases + tables
3038
for index in "12":
3139
args = run_args.pop(index, {})
3240
for attr in ("database", "table"):
3341
if attr not in args:
34-
raise ConfigParseError(f"Running 'run.{run_name}': Connection #{index} in missing attribute '{attr}'.")
42+
raise ConfigParseError(f"Running 'run.{run_name}': Connection #{index} is missing attribute '{attr}'.")
3543

3644
database = args.pop("database")
3745
table = args.pop("table")

data_diff/databases/base.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
Float,
1717
ColType_UUID,
1818
Native_UUID,
19-
String_Alphanum,
2019
String_UUID,
20+
String_Alphanum,
21+
String_FixedAlphanum,
22+
String_VaryingAlphanum,
2123
TemporalType,
2224
UnknownColType,
2325
Text,
@@ -79,6 +81,7 @@ class Database(AbstractDatabase):
7981

8082
TYPE_CLASSES: Dict[str, type] = {}
8183
default_schema: str = None
84+
SUPPORTS_ALPHANUMS = True
8285

8386
@property
8487
def name(self):
@@ -229,23 +232,22 @@ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], whe
229232
col_dict[col_name] = String_UUID()
230233
continue
231234

232-
alphanum_samples = [s for s in samples if s and String_Alphanum.test_value(s)]
233-
if alphanum_samples:
234-
if len(alphanum_samples) != len(samples):
235-
logger.warning(
236-
f"Mixed Alphanum/Non-Alphanum values detected in column {'.'.join(table_path)}.{col_name}, disabling Alphanum support."
237-
)
238-
else:
239-
assert col_name in col_dict
240-
lens = set(map(len, alphanum_samples))
241-
if len(lens) > 1:
235+
if self.SUPPORTS_ALPHANUMS: # Anything but MySQL (so far)
236+
alphanum_samples = [s for s in samples if String_Alphanum.test_value(s)]
237+
if alphanum_samples:
238+
if len(alphanum_samples) != len(samples):
242239
logger.warning(
243-
f"Mixed Alphanum lengths detected in column {'.'.join(table_path)}.{col_name}, disabling Alphanum support."
240+
f"Mixed Alphanum/Non-Alphanum values detected in column {'.'.join(table_path)}.{col_name}. It cannot be used as a key."
244241
)
245242
else:
246-
(length,) = lens
247-
col_dict[col_name] = String_Alphanum(length=length)
248-
continue
243+
assert col_name in col_dict
244+
lens = set(map(len, alphanum_samples))
245+
if len(lens) > 1:
246+
col_dict[col_name] = String_VaryingAlphanum()
247+
else:
248+
(length,) = lens
249+
col_dict[col_name] = String_FixedAlphanum(length=length)
250+
continue
249251

250252
# @lru_cache()
251253
# def get_table_schema(self, path: DbPath) -> Dict[str, ColType]:

data_diff/databases/database_types.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,7 @@ class String_UUID(StringType, ColType_UUID):
9292
pass
9393

9494

95-
@dataclass
9695
class String_Alphanum(StringType, ColType_Alphanum):
97-
length: int
98-
9996
@staticmethod
10097
def test_value(value: str) -> bool:
10198
try:
@@ -104,6 +101,18 @@ def test_value(value: str) -> bool:
104101
except ValueError:
105102
return False
106103

104+
def make_value(self, value):
105+
return self.python_type(value)
106+
107+
108+
class String_VaryingAlphanum(String_Alphanum):
109+
pass
110+
111+
112+
@dataclass
113+
class String_FixedAlphanum(String_Alphanum):
114+
length: int
115+
107116
def make_value(self, value):
108117
if len(value) != self.length:
109118
raise ValueError(f"Expected alphanumeric value of length {self.length}, but got '{value}'.")

data_diff/databases/mysql.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class MySQL(ThreadedDatabase):
2828
"binary": Text,
2929
}
3030
ROUNDS_ON_PREC_LOSS = True
31+
SUPPORTS_ALPHANUMS = False
3132

3233
def __init__(self, *, thread_count, **kw):
3334
self._args = kw

data_diff/diff_tables.py

Lines changed: 44 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@
1313
from runtype import dataclass
1414

1515
from .utils import safezip, run_as_daemon
16-
from .databases.database_types import IKey, NumericType, PrecisionType, StringType
16+
from .thread_utils import ThreadedYielder
17+
from .databases.database_types import IKey, NumericType, PrecisionType, StringType, ColType_UUID
1718
from .table_segment import TableSegment
1819
from .tracking import create_end_event_json, create_start_event_json, send_event_json, is_tracking_enabled
1920

@@ -121,22 +122,25 @@ def diff_tables(self, table1: TableSegment, table2: TableSegment) -> DiffResult:
121122
logger.info(
122123
f"Diffing tables | segments: {self.bisection_factor}, bisection threshold: {self.bisection_threshold}. "
123124
f"key-range: {table1.min_key}..{table2.max_key}, "
124-
f"size: {table1.approximate_size()}"
125+
f"size: table1 <= {table1.approximate_size()}, table2 <= {table2.approximate_size()}"
125126
)
126127

128+
ti = ThreadedYielder(self.max_threadpool_size)
127129
# Bisect (split) the table into segments, and diff them recursively.
128-
yield from self._bisect_and_diff_tables(table1, table2)
130+
ti.submit(self._bisect_and_diff_tables, ti, table1, table2)
129131

130132
# Now we check for the second min-max, to diff the portions we "missed".
131133
min_key2, max_key2 = self._parse_key_range_result(key_type, next(key_ranges))
132134

133135
if min_key2 < min_key1:
134136
pre_tables = [t.new(min_key=min_key2, max_key=min_key1) for t in (table1, table2)]
135-
yield from self._bisect_and_diff_tables(*pre_tables)
137+
ti.submit(self._bisect_and_diff_tables, ti, *pre_tables)
136138

137139
if max_key2 > max_key1:
138140
post_tables = [t.new(min_key=max_key1, max_key=max_key2) for t in (table1, table2)]
139-
yield from self._bisect_and_diff_tables(*post_tables)
141+
ti.submit(self._bisect_and_diff_tables, ti, *post_tables)
142+
143+
yield from ti
140144

141145
except BaseException as e: # Catch KeyboardInterrupt too
142146
error = e
@@ -205,6 +209,10 @@ def _validate_and_adjust_columns(self, table1, table2):
205209
table1._schema[c1] = col1.replace(precision=lowest.precision)
206210
table2._schema[c2] = col2.replace(precision=lowest.precision)
207211

212+
elif isinstance(col1, ColType_UUID):
213+
if not isinstance(col2, ColType_UUID):
214+
raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
215+
208216
elif isinstance(col1, StringType):
209217
if not isinstance(col2, StringType):
210218
raise TypeError(f"Incompatible types for column '{c1}': {col1} <-> {col2}")
@@ -218,16 +226,19 @@ def _validate_and_adjust_columns(self, table1, table2):
218226
"If encoding/formatting differs between databases, it may result in false positives."
219227
)
220228

221-
def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
229+
def _bisect_and_diff_tables(
230+
self, ti: ThreadedYielder, table1: TableSegment, table2: TableSegment, level=0, max_rows=None
231+
):
222232
assert table1.is_bounded and table2.is_bounded
223233

234+
max_space_size = max(table1.approximate_size(), table2.approximate_size())
224235
if max_rows is None:
225-
# We can be sure that row_count <= max_rows
226-
max_rows = table1.max_key - table1.min_key
236+
# We can be sure that row_count <= max_rows iff the table key is unique
237+
max_rows = max_space_size
227238

228239
# If count is below the threshold, just download and compare the columns locally
229240
# This saves time, as bisection speed is limited by ping and query performance.
230-
if max_rows < self.bisection_threshold:
241+
if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
231242
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
232243
diff = list(diff_sets(rows1, rows2))
233244

@@ -242,49 +253,51 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
242253

243254
logger.info(". " * level + f"Diff found {len(diff)} different rows.")
244255
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
245-
yield from diff
246-
return
256+
return diff
247257

248258
# Choose evenly spaced checkpoints (according to min_key and max_key)
249-
checkpoints = table1.choose_checkpoints(self.bisection_factor - 1)
259+
biggest_table = max(table1, table2, key=methodcaller('approximate_size'))
260+
checkpoints = biggest_table.choose_checkpoints(self.bisection_factor - 1)
250261

251262
# Create new instances of TableSegment between each checkpoint
252263
segmented1 = table1.segment_by_checkpoints(checkpoints)
253264
segmented2 = table2.segment_by_checkpoints(checkpoints)
254265

255266
# Recursively compare each pair of corresponding segments between table1 and table2
256-
diff_iters = [
257-
self._diff_tables(t1, t2, level + 1, i + 1, len(segmented1))
258-
for i, (t1, t2) in enumerate(safezip(segmented1, segmented2))
259-
]
260-
261-
for res in self._thread_map(list, diff_iters):
262-
yield from res
263-
264-
def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_count=None):
267+
for i, (t1, t2) in enumerate(safezip(segmented1, segmented2)):
268+
ti.submit(self._diff_tables, ti, t1, t2, max_rows, level + 1, i + 1, len(segmented1), priority=level)
269+
270+
def _diff_tables(
271+
self,
272+
ti: ThreadedYielder,
273+
table1: TableSegment,
274+
table2: TableSegment,
275+
max_rows: int,
276+
level=0,
277+
segment_index=None,
278+
segment_count=None,
279+
):
265280
logger.info(
266281
". " * level + f"Diffing segment {segment_index}/{segment_count}, "
267282
f"key-range: {table1.min_key}..{table2.max_key}, "
268-
f"size: {table2.max_key-table1.min_key}"
283+
f"size <= {max_rows}"
269284
)
270285

271286
# When benchmarking, we want the ability to skip checksumming. This
272287
# allows us to download all rows for comparison in performance. By
273288
# default, data-diff will checksum the section first (when it's below
274289
# the threshold) and _then_ download it.
275290
if BENCHMARK:
276-
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
277-
if max_rows_from_keys < self.bisection_threshold:
278-
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
279-
return
291+
if max_rows < self.bisection_threshold:
292+
return self._bisect_and_diff_tables(ti, table1, table2, level=level, max_rows=max_rows)
280293

281294
(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
282295

283296
if count1 == 0 and count2 == 0:
284-
logger.warning(
285-
"Uneven distribution of keys detected. (big gaps in the key column). "
286-
"For better performance, we recommend to increase the bisection-threshold."
287-
)
297+
# logger.warning(
298+
# f"Uneven distribution of keys detected in segment {table1.min_key}..{table2.max_key}. (big gaps in the key column). "
299+
# "For better performance, we recommend to increase the bisection-threshold."
300+
# )
288301
assert checksum1 is None and checksum2 is None
289302
return
290303

@@ -293,7 +306,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
293306
self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
294307

295308
if checksum1 != checksum2:
296-
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
309+
return self._bisect_and_diff_tables(ti, table1, table2, level=level, max_rows=max(count1, count2))
297310

298311
def _thread_map(self, func, iterable):
299312
if not self.threaded:

data_diff/table_segment.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
from runtype import dataclass
66

7-
from .utils import ArithString, split_space
7+
from .utils import ArithString, split_space, ArithAlphanumeric
88

99
from .databases.base import Database
1010
from .databases.database_types import DbPath, DbKey, DbTime, Native_UUID, Schema, create_schema
@@ -149,8 +149,9 @@ def choose_checkpoints(self, count: int) -> List[DbKey]:
149149
assert self.is_bounded
150150
if isinstance(self.min_key, ArithString):
151151
assert type(self.min_key) is type(self.max_key)
152-
checkpoints = split_space(self.min_key.int, self.max_key.int, count)
153-
return [self.min_key.new(int=i) for i in checkpoints]
152+
checkpoints = self.min_key.range(self.max_key, count)
153+
assert all(self.min_key <= x <= self.max_key for x in checkpoints)
154+
return checkpoints
154155

155156
return split_space(self.min_key, self.max_key, count)
156157

0 commit comments

Comments
 (0)