- Notifications
You must be signed in to change notification settings - Fork 296
cli: output diff as jsonl, stats as json #90
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import sys | ||
import time | ||
import json | ||
import logging | ||
from itertools import islice | ||
| ||
| @@ -146,15 +147,24 @@ def main( | |
unique_diff_count = len({i[0] for _, i in diff}) | ||
table1_count = differ.stats.get("table1_count") | ||
percent = 100 * unique_diff_count / (table1_count or 1) | ||
print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}") | ||
print(f"Diff-Percent: {percent:.4f}%") | ||
plus = len([1 for op, _ in diff if op == "+"]) | ||
minus = len([1 for op, _ in diff if op == "-"]) | ||
print(f"Diff-Split: +{plus} -{minus}") | ||
| ||
count = differ.stats["table_count"] | ||
diff = { | ||
"different_rows": len(diff), | ||
"different_percent": percent, | ||
"different_+": plus, | ||
"different_-": minus, | ||
"total": count, | ||
} | ||
| ||
print(json.dumps(diff, indent=2)) | ||
else: | ||
for op, key in diff_iter: | ||
color = COLOR_SCHEME[op] | ||
rich.print(f"[{color}]{op} {key!r}[/{color}]") | ||
jsonl = json.dumps([op, list(key)]) | ||
rich.print(f"[{color}]{jsonl}[/{color}]") | ||
| ||
sys.stdout.flush() | ||
| ||
end = time.time() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
| @@ -381,6 +381,12 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None): | |
if max_rows < self.bisection_threshold: | ||
rows1, rows2 = self._threaded_call("get_values", [table1, table2]) | ||
diff = list(diff_sets(rows1, rows2)) | ||
| ||
# This happens when the initial bisection threshold is larger than | ||
# the table itself. | ||
if level == 0 and not self.stats.get("table_count", False): | ||
self.stats["table_count"] = self.stats.get("table_count", 0) + max(len(rows1), len(rows2)) | ||
| ||
| ||
logger.info(". " * level + f"Diff found {len(diff)} different rows.") | ||
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) | ||
yield from diff | ||
| @@ -420,7 +426,7 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun | |
return | ||
| ||
if level == 1: | ||
self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1 | ||
self.stats["table_count"] = self.stats.get("table_count", 0) + max(count1, count2) | ||
| ||
| ||
if checksum1 != checksum2: | ||
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know this is a bit less pretty, but I think this is most suitable as a tool that we want people to plug into all kinds of ecosystems.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The default output should be easy to read for humans
We can have a
--json
switch for JSON output.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's OK with me... You OK with
jsonl
by default for the differences below though? I think it's just as readable, and then we don't have to maintain two formats.