datafold · sirupsen · Jun 22, 2022 · Jun 21, 2022 · Jun 21, 2022 · Jun 22, 2022
diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -1,5 +1,6 @@
 import sys
 import time
+import json
 import logging
 from itertools import islice
 
@@ -50,6 +51,7 @@
 @click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
 @click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
 @click.option("-d", "--debug", is_flag=True, help="Print debug info")
+@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats")
 @click.option("-v", "--verbose", is_flag=True, help="Print extra info")
 @click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
 @click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
@@ -80,6 +82,7 @@ def main(
  interactive,
  threads,
  keep_column_case,
+ json_output,
 ):
  if limit and stats:
  print("Error: cannot specify a limit when using the -s/--stats switch")
@@ -144,17 +147,29 @@ def main(
  if stats:
  diff = list(diff_iter)
  unique_diff_count = len({i[0] for _, i in diff})
- table1_count = differ.stats.get("table1_count")
- percent = 100 * unique_diff_count / (table1_count or 1)
- print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
- print(f"Diff-Percent: {percent:.4f}%")
+ max_table_count = max(differ.stats.get(k, 0) for k in ("table1_count", "table2_count"))  # tolerate counts that were never recorded
+ percent = 100 * unique_diff_count / (max_table_count or 1)  # "or 1" avoids dividing by zero
  plus = len([1 for op, _ in diff if op == "+"])
  minus = len([1 for op, _ in diff if op == "-"])
- print(f"Diff-Split: +{plus} -{minus}")
+
+ if json_output:
+ json_stats = {  # own name, so the boolean --json flag is not rebound to a dict
+ "different_rows": len(diff),
+ "different_percent": percent,
+ "different_+": plus,
+ "different_-": minus,
+ "total": max_table_count,
+ }
+ print(json.dumps(json_stats, indent=2))
+ else:
+ print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
+ print(f"Diff-Percent: {percent:.14f}%")
+ print(f"Diff-Split: +{plus} -{minus}")
  else:
  for op, key in diff_iter:
  color = COLOR_SCHEME[op]
- rich.print(f"[{color}]{op} {key!r}[/{color}]")
+ jsonl = json.dumps([op, list(key)])
+ rich.print(f"[{color}]{jsonl}[/{color}]")
  sys.stdout.flush()
 
  end = time.time()

diff --git a/data_diff/diff_tables.py b/data_diff/diff_tables.py
@@ -381,6 +381,13 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
  if max_rows < self.bisection_threshold:
  rows1, rows2 = self._threaded_call("get_values", [table1, table2])
  diff = list(diff_sets(rows1, rows2))
+
+ # This happens when the initial bisection threshold is larger than
+ # the table itself.
+ if level == 0 and not self.stats.get("table1_count", False):
+ self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1)
+ self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2)
+
  logger.info(". " * level + f"Diff found {len(diff)} different rows.")
  self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
  yield from diff
@@ -420,7 +427,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
  return
 
  if level == 1:
- self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
+ self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1  # read the same key we write: running total over level-1 calls
+ self.stats["table2_count"] = self.stats.get("table2_count", 0) + count2
 
  if checksum1 != checksum2:
  yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))