datafold
diff --git a/‎README.md‎
Lines changed: 19 additions & 5 deletions b/‎README.md‎
Lines changed: 19 additions & 5 deletions
diff --git a/‎data_diff/__init__.py‎
Lines changed: 70 additions & 21 deletions b/‎data_diff/__init__.py‎
Lines changed: 70 additions & 21 deletions
@@ -132,9 +132,9 @@ Let's break this down. Assume there are two tables stored in two databases, and
 | PostgreSQL >=10 | `postgresql://<user>:<password>@<host>:5432/<database>` | 💚 |
 | MySQL | `mysql://<user>:<password>@<hostname>:3306/<database>` | 💚 |
 | Snowflake | `"snowflake://<user>[:<password>]@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<role>[&authenticator=externalbrowser]"` | 💚 |
+| BigQuery | `bigquery://<project>/<dataset>` | 💚 |
+| Redshift | `redshift://<username>:<password>@<hostname>:5439/<database>` | 💚 |
 | Oracle | `oracle://<username>:<password>@<hostname>/<database>` | 💛 |
-| BigQuery | `bigquery://<project>/<dataset>` | 💛 |
-| Redshift | `redshift://<username>:<password>@<hostname>:5439/<database>` | 💛 |
 | Presto | `presto://<username>:<password>@<hostname>:8080/<database>` | 💛 |
 | Databricks | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` | 💛 |
 | Trino | `trino://<username>:<password>@<hostname>:8080/<database>` | 💛 |
@@ -145,6 +145,8 @@ Let's break this down. Assume there are two tables stored in two databases, and
 | Pinot | | 📝 |
 | Druid | | 📝 |
 | Kafka | | 📝 |
+| DuckDB | | 📝 |
+| SQLite | | 📝 |
 
 * 💚: Implemented and thoroughly tested.
 * 💛: Implemented, but not thoroughly tested yet.
@@ -163,7 +165,7 @@ may be case-sensitive. This is the case for the Snowflake schema and table names
 ## Options:
 
  - `--help` - Show help message and exit.
- - `-k` or `--key-column` - Name of the primary key column
+ - `-k` or `--key-columns` - Name(s) of the primary key column(s). If none are provided, the default is 'id'.
  - `-t` or `--update-column` - Name of updated_at/last_updated column
  - `-c` or `--columns` - Names of extra columns to compare. Can be used more than once in the same command.
  Accepts a name or a pattern like in SQL.
@@ -178,12 +180,24 @@ may be case-sensitive. This is the case for the Snowflake schema and table names
  Example: `--min-age=5min` ignores rows from the last 5 minutes.
  Valid units: `d, days, h, hours, min, minutes, mon, months, s, seconds, w, weeks, y, years`
  - `--max-age` - Considers only rows younger than specified. See `--min-age`.
- - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search.
- - `--bisection-threshold` - Minimal bisection threshold. i.e. maximum size of pages to diff locally.
  - `-j` or `--threads` - Number of worker threads to use per database. Default=1.
  - `-w`, `--where` - An additional 'where' expression to restrict the search space.
  - `--conf`, `--run` - Specify the run and configuration from a TOML file. (see below)
  - `--no-tracking` - data-diff sends home anonymous usage data. Use this to disable it.
+ - `-a`, `--algorithm` `[auto|joindiff|hashdiff]` - Force algorithm choice
+
+Same-DB diff only:
+ - `-m`, `--materialize` - Materialize the diff results into a new table in the database.
+ If a table exists by that name, it will be replaced.
+ Use `%t` in the name to place a timestamp.
+ Example: `-m test_mat_%t`
+ - `--assume-unique-key` - Skip validating the uniqueness of the key column during joindiff, which is costly in non-cloud dbs.
+ - `--sample-exclusive-rows` - Sample several rows that only appear in one of the tables, but not the other. Use with `-s`.
+
+Cross-DB diff only:
+ - `--bisection-threshold` - Minimum size (in rows) of a segment to be split. Smaller segments will be downloaded and compared locally.
+ - `--bisection-factor` - Segments per iteration. When set to 2, it performs binary search.
+
 
 
 ### How to use with a configuration file
 
@@ -1,41 +1,49 @@
-from typing import Tuple, Iterator, Optional, Union
+from typing import Sequence, Tuple, Iterator, Optional, Union
 
 from .tracking import disable_tracking
 from .databases.connect import connect
 from .databases.database_types import DbKey, DbTime, DbPath
-from .diff_tables import TableSegment, TableDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
+from .diff_tables import Algorithm
+from .hashdiff_tables import HashDiffer, DEFAULT_BISECTION_THRESHOLD, DEFAULT_BISECTION_FACTOR
+from .joindiff_tables import JoinDiffer
+from .table_segment import TableSegment
 
 
 def connect_to_table(
  db_info: Union[str, dict],
  table_name: Union[DbPath, str],
- key_column: str = "id",
+ key_columns: str = ("id",),
  thread_count: Optional[int] = 1,
  **kwargs,
-):
+) -> TableSegment:
  """Connects to the given database, and creates a TableSegment instance
 
  Parameters:
  db_info: Either a URI string, or a dict of connection options.
  table_name: Name of the table as a string, or a tuple that signifies the path.
- key_column: Name of the key column
- thread_count: Number of threads for this connection (only if using a threadpooled implementation)
+ key_columns: Names of the key columns
+ thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
+
+ See Also:
+ :meth:`connect`
  """
+ if isinstance(key_columns, str):
+ key_columns = (key_columns,)
 
  db = connect(db_info, thread_count=thread_count)
 
  if isinstance(table_name, str):
  table_name = db.parse_table_name(table_name)
 
- return TableSegment(db, table_name, key_column, **kwargs)
+ return TableSegment(db, table_name, key_columns, **kwargs)
 
 
 def diff_tables(
  table1: TableSegment,
  table2: TableSegment,
  *,
  # Name of the key column, which uniquely identifies each row (usually id)
- key_column: str = None,
+ key_columns: Sequence[str] = None,
  # Name of updated column, which signals that rows changed (usually updated_at or last_update)
  update_column: str = None,
  # Extra columns to compare
@@ -46,31 +54,63 @@ def diff_tables(
  # Start/end update_column values, used to restrict the segment
  min_update: DbTime = None,
  max_update: DbTime = None,
- # Into how many segments to bisect per iteration
+ # Algorithm
+ algorithm: Algorithm = Algorithm.HASHDIFF,
+ # Into how many segments to bisect per iteration (hashdiff only)
  bisection_factor: int = DEFAULT_BISECTION_FACTOR,
- # When should we stop bisecting and compare locally (in row count)
+ # When should we stop bisecting and compare locally (in row count; hashdiff only)
  bisection_threshold: int = DEFAULT_BISECTION_THRESHOLD,
  # Enable/disable threaded diffing. Needed to take advantage of database threads.
  threaded: bool = True,
  # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
  # There may be many pools, so number of actual threads can be a lot higher.
  max_threadpool_size: Optional[int] = 1,
- # Enable/disable debug prints
- debug: bool = False,
 ) -> Iterator:
- """Efficiently finds the diff between table1 and table2.
+ """Finds the diff between table1 and table2.
+
+ Parameters:
+ key_columns (Tuple[str, ...]): Name of the key column, which uniquely identifies each row (usually id)
+ update_column (str, optional): Name of updated column, which signals that rows changed.
+ Usually updated_at or last_update. Used by `min_update` and `max_update`.
+ extra_columns (Tuple[str, ...], optional): Extra columns to compare
+ min_key (:data:`DbKey`, optional): Lowest key value, used to restrict the segment
+ max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
+ min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
+ max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
+ algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
+ bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
+ bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
+ and compare locally. (Used when algorithm is `HASHDIFF`).
+ threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
+ max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
+ Only relevant when `threaded` is ``True``.
+ There may be many pools, so number of actual threads can be a lot higher.
+
+ Note:
+ The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
+ `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`.
+ If different values are needed per table, it's possible to omit them here, and instead set
+ them directly when creating each :class:`TableSegment`.
 
  Example:
  >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
  >>> list(diff_tables(table1, table1))
  []
 
+ See Also:
+ :class:`TableSegment`
+ :class:`HashDiffer`
+ :class:`JoinDiffer`
+
  """
+ if isinstance(key_columns, str):
+ key_columns = (key_columns,)
+
  tables = [table1, table2]
  override_attrs = {
  k: v
  for k, v in dict(
- key_column=key_column,
+ key_columns=key_columns,
  update_column=update_column,
  extra_columns=extra_columns,
  min_key=min_key,
@@ -83,11 +123,20 @@ def diff_tables(
 
  segments = [t.new(**override_attrs) for t in tables] if override_attrs else tables
 
- differ = TableDiffer(
- bisection_factor=bisection_factor,
- bisection_threshold=bisection_threshold,
- debug=debug,
- threaded=threaded,
- max_threadpool_size=max_threadpool_size,
- )
+ algorithm = Algorithm(algorithm)
+ if algorithm == Algorithm.HASHDIFF:
+ differ = HashDiffer(
+ bisection_factor=bisection_factor,
+ bisection_threshold=bisection_threshold,
+ threaded=threaded,
+ max_threadpool_size=max_threadpool_size,
+ )
+ elif algorithm == Algorithm.JOINDIFF:
+ differ = JoinDiffer(
+ threaded=threaded,
+ max_threadpool_size=max_threadpool_size,
+ )
+ else:
+ raise ValueError(f"Unknown algorithm: {algorithm}")
+
  return differ.diff_tables(*segments)