datafold · sirupsen · Jun 22, 2022 · Jun 21, 2022 · Jun 21, 2022 · Jun 22, 2022
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -1,5 +1,6 @@
 import sys
 import time
+import json
 import logging
 from itertools import islice
 
@@ -146,15 +147,24 @@ def main(
  unique_diff_count = len({i[0] for _, i in diff})
  table1_count = differ.stats.get("table1_count")
  percent = 100 * unique_diff_count / (table1_count or 1)
- print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
- print(f"Diff-Percent: {percent:.4f}%")
  plus = len([1 for op, _ in diff if op == "+"])
  minus = len([1 for op, _ in diff if op == "-"])
- print(f"Diff-Split: +{plus} -{minus}")
+
+ count = differ.stats["table_count"]
+ diff = {
+ "different_rows": len(diff),
+ "different_percent": percent,
+ "different_+": plus,
+ "different_-": minus,
+ "total": count,
+ }
+
+ print(json.dumps(diff, indent=2))
  else:
  for op, key in diff_iter:
  color = COLOR_SCHEME[op]
- rich.print(f"[{color}]{op} {key!r}[/{color}]")
+ jsonl = json.dumps([op, list(key)])
+ rich.print(f"[{color}]{jsonl}[/{color}]")
  sys.stdout.flush()
 
  end = time.time()

diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -381,6 +381,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
  if max_rows < self.bisection_threshold:
  rows1, rows2 = self._threaded_call("get_values", [table1, table2])
  diff = list(diff_sets(rows1, rows2))
+
+ # If the table is smaller than the initial bisection threshold, it is downloaded
+ # at level 0 and "table_count" has not been recorded yet, so record it here.
+ if level == 0 and not self.stats.get("table_count", False):
+ self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2))
+
  logger.info(". " * level + f"Diff found {len(diff)} different rows.")
  self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
  yield from diff
@@ -420,7 +426,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
  return
 
  if level == 1:
- self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
+ self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2)
 
  if checksum1 != checksum2:
  yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))