Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions data_diff/__main__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import sys
import time
import json
import logging
from itertools import islice

Expand Down Expand Up @@ -50,6 +51,7 @@
@click.option("--max-age", default=None, help="Considers only rows younger than specified. See --min-age.")
@click.option("-s", "--stats", is_flag=True, help="Print stats instead of a detailed diff")
@click.option("-d", "--debug", is_flag=True, help="Print debug info")
@click.option("--json", 'json_output', is_flag=True, help="Print JSON output for --stats")
@click.option("-v", "--verbose", is_flag=True, help="Print extra info")
@click.option("-i", "--interactive", is_flag=True, help="Confirm queries, implies --debug")
@click.option("--keep-column-case", is_flag=True, help="Don't use the schema to fix the case of given column names.")
Expand Down Expand Up @@ -80,6 +82,7 @@ def main(
interactive,
threads,
keep_column_case,
json_output,
):
if limit and stats:
print("Error: cannot specify a limit when using the -s/--stats switch")
Expand Down Expand Up @@ -144,17 +147,29 @@ def main(
if stats:
diff = list(diff_iter)
unique_diff_count = len({i[0] for _, i in diff})
table1_count = differ.stats.get("table1_count")
percent = 100 * unique_diff_count / (table1_count or 1)
print(f"Diff-Total: {len(diff)} changed rows out of {table1_count}")
print(f"Diff-Percent: {percent:.4f}%")
max_table_count = max(differ.stats["table1_count"], differ.stats["table2_count"])
percent = 100 * unique_diff_count / (max_table_count or 1)
plus = len([1 for op, _ in diff if op == "+"])
minus = len([1 for op, _ in diff if op == "-"])
print(f"Diff-Split: +{plus} -{minus}")

if json_output:
json_output = {
"different_rows": len(diff),
"different_percent": percent,
"different_+": plus,
"different_-": minus,
"total": max_table_count,
}
print(json.dumps(json_output, indent=2))
else:
print(f"Diff-Total: {len(diff)} changed rows out of {max_table_count}")
print(f"Diff-Percent: {percent:.14f}%")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I stopped rounding so aggressively here because, e.g., when I have 1 different row out of 100M entries, the previous rounding just shows 0.

print(f"Diff-Split: +{plus} -{minus}")
else:
for op, key in diff_iter:
color = COLOR_SCHEME[op]
rich.print(f"[{color}]{op} {key!r}[/{color}]")
jsonl = json.dumps([op, list(key)])
rich.print(f"[{color}]{jsonl}[/{color}]")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CleanShot 2022-06-21 at 13 50 46@2x

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should also depend on json output.

Though I do like ["a", "b"] better than the tuple.

Copy link
Contributor Author

@sirupsen sirupsen Jun 21, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's just default to JSON then. I don't see why we'd need two formats here when it's so readable. Frankly, I also disagree with you on the other one. It's less maintenance to just have JSON.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

VS

image

You can try to argue they are "just as readable" but there is no way you will convince me.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Produced with text = f"{op} {', '.join(columns)}")

sys.stdout.flush()

end = time.time()
Expand Down
10 changes: 9 additions & 1 deletion data_diff/diff_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,13 @@ def _bisect_and_diff_tables(self, table1, table2, level=0, max_rows=None):
if max_rows < self.bisection_threshold:
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
diff = list(diff_sets(rows1, rows2))

# This happens when the initial bisection threshold is larger than
# the table itself.
if level == 0 and not self.stats.get("table1_count", False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. But I think we always want to override these attributes. Why would we care what previous values they held?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Talked about on Slack, just for the record.) It will double count if it's not level 0, but we can remove the `not`.

self.stats["table1_count"] = self.stats.get("table1_count", 0) + len(rows1)
self.stats["table2_count"] = self.stats.get("table2_count", 0) + len(rows2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Still this line... just

self.stats["table1_count"] = len(rows1) 

No?


logger.info(". " * level + f"Diff found {len(diff)} different rows.")
self.stats["rows_downloaded"] = self.stats.get("rows_downloaded", 0) + max(len(rows1), len(rows2))
yield from diff
Expand Down Expand Up @@ -420,7 +427,8 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
return

if level == 1:
self.stats["table1_count"] = self.stats.get("table1_count", 0) + count1
self.stats["table1_count"] = self.stats.get("table_count1", 0) + count1
self.stats["table2_count"] = self.stats.get("table_count2", 0) + count2

if checksum1 != checksum2:
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max(count1, count2))
Expand Down