- Notifications
You must be signed in to change notification settings - Fork 296
cli: output diff as jsonl, stats as json #90
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import sys | ||
import time | ||
import json | ||
import logging | ||
from itertools import islice | ||
| ||
| @@ -50,6 +51,7 @@ | |
@click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.") | ||
@click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff") | ||
@click.option("-d", "--debug", is_flag=True, help="Print debug info") | ||
@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats") | ||
@click.option("-v", "--verbose", is_flag=True, help="Print extra info") | ||
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug") | ||
@click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.") | ||
| @@ -80,6 +82,7 @@ def main( | |
interactive, | ||
threads, | ||
keep_column_case, | ||
json_output, | ||
): | ||
if limit and stats: | ||
print("Error: cannot specify a limit when using the -s/--stats switch") | ||
| @@ -144,17 +147,29 @@ def main( | |
if stats: | ||
diff = list(diff_iter) | ||
unique_diff_count = len({i[0] for _, i in diff}) | ||
table1_count = differ.stats.get("table1_count") | ||
percent = 100 * unique_diff_count / (table1_count or 1) | ||
print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}") | ||
print(f"Diff-Percent: {percent:.4f}%") | ||
max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"]) | ||
percent = 100 * unique_diff_count / (max_table_count or 1) | ||
plus = len([1 for op, _ in diff if op == "+"]) | ||
minus = len([1 for op, _ in diff if op == "-"]) | ||
print(f"Diff-Split: +{plus} -{minus}") | ||
| ||
if json_output: | ||
json_output = { | ||
"different_rows": len(diff), | ||
"different_percent": percent, | ||
"different_+": plus, | ||
"different_-": minus, | ||
"total": max_table_count, | ||
} | ||
print(json.dumps(json_output, indent=2)) | ||
else: | ||
print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}") | ||
print(f"Diff-Percent: {percent:.14f}%") | ||
print(f"Diff-Split: +{plus} -{minus}") | ||
else: | ||
for op, key in diff_iter: | ||
color = COLOR_SCHEME[op] | ||
rich.print(f"[{color}]{op} {key!r}[/{color}]") | ||
jsonl = json.dumps([op, list(key)]) | ||
rich.print(f"[{color}]{jsonl}[/{color}]") | ||
| ||
sys.stdout.flush() | ||
| ||
end = time.time() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
| @@ -381,6 +381,13 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None): | |
if max_rows < self.bisection_threshold: | ||
rows1, rows2 = self._threaded_call("get_values", [table1, table2]) | ||
diff = list(diff_sets(rows1, rows2)) | ||
| ||
# This happens when the initial bisection threshold is larger than | ||
# the table itself. | ||
if level == 0 and not self.stats.get("table1_count", False): | ||
| ||
self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1) | ||
self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2) | ||
| ||
| ||
logger.info(". " * level + f"Diff found {len(diff)} different rows.") | ||
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2)) | ||
yield from diff | ||
| @@ -420,7 +427,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun | |
return | ||
| ||
if level == 1: | ||
self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1 | ||
self.stats["table1_count"] = self.stats.get("table_count1", 0) + count1 | ||
self.stats["table2_count"] = self.stats.get("table_count2", 0) + count2 | ||
| ||
if checksum1 != checksum2: | ||
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2)) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW, I stopped rounding so aggressively here because, for example, when there is 1 different row out of 100M entries, the previous rounding just shows 0.