datafold · sungchun12 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023 · Sep 26, 2023
diff --git a/README.md b/README.md
@@ -81,18 +81,73 @@ More information about the algorithm and performance considerations can be found
 pip install data-diff 'data-diff[postgresql,snowflake]' -U
 ```
 
-Run `data-diff` with connection URIs. In the following example, we compare tables between PostgreSQL and Snowflake using hashdiff algorithm:
-```
+Run `data-diff` with connection URIs. In the following example, we compare tables between PostgreSQL and Snowflake using the hashdiff algorithm:
+
+```bash
 data-diff \
  postgresql://<username>:'<password>'@localhost:5432/<database> \
  <table> \
- "snowflake://<username>:<password>@<password>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
+ "snowflake://<username>:<password>@<account>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
  <TABLE> \
  -k <primary key column> \
  -c <columns to compare> \
  -w <filter condition>
 ```
 
+Run `data-diff` with a `toml` configuration file. In the following example, we compare tables between MotherDuck (hosted DuckDB) and Snowflake using the hashdiff algorithm:
+
+```toml
+## DATABASE CONNECTION ##
+[database.duckdb_connection] 
+ driver = "duckdb"
+ # filepath = "datafold_demo.duckdb" # local duckdb file example
+ # filepath = "md:" # default motherduck connection example
+ filepath = "md:datafold_demo?motherduck_token=${motherduck_token}" # API token recommended for motherduck connection
+ database = "datafold_demo"
+
+[database.snowflake_connection]
+ driver = "snowflake"
+ database = "DEV"
+ user = "sung"
+ password = "${SNOWFLAKE_PASSWORD}" # or "<PASSWORD_STRING>"
+ # the info below is only required for snowflake
+ account = "${ACCOUNT}" # by33919
+ schema = "DEVELOPMENT"
+ warehouse = "DEMO"
+ role = "DEMO_ROLE"
+
+## RUN PARAMETERS ##
+[run.default]
+ verbose = true
+
+## EXAMPLE DATA DIFF JOB ##
+[run.demo_xdb_diff]
+ # Source 1 ("left")
+ 1.database = "duckdb_connection"
+ 1.table = "development.raw_orders"
+
+ # Source 2 ("right")
+ 2.database = "snowflake_connection"
+ 2.table = "RAW_ORDERS" # note that snowflake table names are case-sensitive
+
+ verbose = false
+```
+
+```bash
+# export relevant environment variables, example below
+export motherduck_token=<MOTHERDUCK_TOKEN>
+
+# run the configured data-diff job
+data-diff --conf datadiff.toml \
+ --run demo_xdb_diff \
+ -k "id" \
+ -c status
+
+# output example
+- 1, completed
++ 1, returned
+```
+
 Check out [documentation](https://docs.datafold.com/reference/open_source/cli) for the full command reference.
 
 
@@ -106,13 +161,14 @@ Check out [documentation](https://docs.datafold.com/reference/open_source/cli) f
 | Snowflake | 🟢 | `"snowflake://<user>[:<password>]@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<role>[&authenticator=externalbrowser]"` |
 | BigQuery | 🟢 | `bigquery://<project>/<dataset>` |
 | Redshift | 🟢 | `redshift://<username>:<password>@<hostname>:5439/<database>` |
+| DuckDB | 🟢 | `duckdb://<dbname>@<filepath>` |
+| MotherDuck | 🟢 | `duckdb://<dbname>@<filepath>` |
 | Oracle | 🟡 | `oracle://<username>:<password>@<hostname>/service_or_sid` |
 | Presto | 🟡 | `presto://<username>:<password>@<hostname>:8080/<database>` |
 | Databricks | 🟡 | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` |
 | Trino | 🟡 | `trino://<username>:<password>@<hostname>:8080/<database>` |
 | Clickhouse | 🟡 | `clickhouse://<username>:<password>@<hostname>:9000/<database>` |
 | Vertica | 🟡 | `vertica://<username>:<password>@<hostname>:5433/<database>` |
-| DuckDB | 🟡 | |
 | ElasticSearch | 📝 | |
 | Planetscale | 📝 | |
 | Pinot | 📝 | |

diff --git a/data_diff/__main__.py b/data_diff/__main__.py
@@ -59,10 +59,12 @@ def _get_log_handlers(is_dbt: Optional[bool] = False) -> Dict[str, logging.Handl
  return handlers
 
 
-def _remove_passwords_in_dict(d: dict):
+def _remove_passwords_in_dict(d: dict) -> None:
  for k, v in d.items():
  if k == "password":
  d[k] = "*" * len(v)
+ elif k == "filepath" and "motherduck_token=" in v:
+ d[k] = v.split("motherduck_token=")[0] + "motherduck_token=*************"
  elif isinstance(v, dict):
  _remove_passwords_in_dict(v)
  elif k.startswith("database"):

diff --git a/data_diff/dbt_parser.py b/data_diff/dbt_parser.py
@@ -376,8 +376,8 @@ def set_connection(self):
 
  conn_info = {
  "driver": conn_type,
- "project": credentials.get("project"),
- "dataset": credentials.get("dataset"),
+ "project": credentials.get("project") or credentials.get("database"),
+ "dataset": credentials.get("dataset") or credentials.get("schema"),
  }
 
  self.threads = credentials.get("threads")
@@ -402,13 +402,13 @@ def set_connection(self):
  "user": credentials.get("user"),
  "password": credentials.get("password") or credentials.get("pass"),
  "port": credentials.get("port"),
- "dbname": credentials.get("dbname"),
+ "dbname": credentials.get("dbname") or credentials.get("database"),
  }
  self.threads = credentials.get("threads")
  elif conn_type == "databricks":
  conn_info = {
  "driver": conn_type,
- "catalog": credentials.get("catalog"),
+ "catalog": credentials.get("catalog") or credentials.get("database"),
  "server_hostname": credentials.get("host"),
  "http_path": credentials.get("http_path"),
  "schema": credentials.get("schema"),
@@ -420,7 +420,7 @@ def set_connection(self):
  "driver": "postgresql",
  "host": credentials.get("host"),
  "user": credentials.get("user"),
- "password": credentials.get("password"),
+ "password": credentials.get("password") or credentials.get("pass"),
  "port": credentials.get("port"),
  "dbname": credentials.get("dbname") or credentials.get("database"),
  }
@@ -483,19 +483,20 @@ def get_unique_columns(self) -> Dict[str, Set[str]]:
  continue
 
  model_node = manifest.nodes[uid]
- if node.test_metadata.name == "unique":
- column_name: str = node.test_metadata.kwargs["column_name"]
- for col in self._parse_concat_pk_definition(column_name):
- if model_node is None or col in model_node.columns:
- # skip anything that is not a column.
- # for example, string literals used in concat
- # like "pk1 || '-' || pk2"
+ if node.test_metadata:
+ if node.test_metadata.name == "unique":
+ column_name: str = node.test_metadata.kwargs["column_name"]
+ for col in self._parse_concat_pk_definition(column_name):
+ if model_node is None or col in model_node.columns:
+ # skip anything that is not a column.
+ # for example, string literals used in concat
+ # like "pk1 || '-' || pk2"
+ cols_by_uid[uid].add(col)
+
+ elif node.test_metadata.name == "unique_combination_of_columns":
+ for col in node.test_metadata.kwargs["combination_of_columns"]:
  cols_by_uid[uid].add(col)
 
- if node.test_metadata.name == "unique_combination_of_columns":
- for col in node.test_metadata.kwargs["combination_of_columns"]:
- cols_by_uid[uid].add(col)
-
  except (KeyError, IndexError, TypeError) as e:
  logger.warning("Failure while finding unique cols: %s", e)
 

diff --git a/data_diff/version.py b/data_diff/version.py
@@ -1 +1 @@
-__version__ = "0.9.2"
+__version__ = "0.9.3"
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "data-diff"
-version = "0.9.2"
+version = "0.9.3"
 description = "Command-line tool and Python library to efficiently diff rows across two different databases."
 authors = ["Datafold <data-diff@datafold.com>"]
 license = "MIT"

diff --git a/tests/test_joindiff.py b/tests/test_joindiff.py
@@ -22,6 +22,7 @@
  db.MySQL,
  db.Snowflake,
  db.BigQuery,
+ db.DuckDB,
  db.Oracle,
  db.Redshift,
  db.Presto,
@@ -32,7 +33,7 @@
 test_each_database = test_each_database_in_list(TEST_DATABASES)
 
 
-@test_each_database_in_list({db.Snowflake, db.BigQuery})
+@test_each_database_in_list({db.Snowflake, db.BigQuery, db.DuckDB})
 class TestCompositeKey(DiffTestCase):
  src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime}
  dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime}

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -15,11 +15,18 @@ def test_remove_passwords_in_dict(self):
  remove_passwords_in_dict(d, "$$$$")
  assert d["database_url"] == "mysql://user:$$$$@localhost/db"
 
+ # TODO: add a database url test for motherduck tokens
+
  # Test replacing password in nested dictionary
  d = {"info": {"password": "mypassword"}}
  remove_passwords_in_dict(d, "%%")
  assert d["info"]["password"] == "%%"
 
+ # Test replacing a motherduck token in nested dictionary
+ d = {'database1': {'driver': 'duckdb', 'filepath':'md:datafold_demo?motherduck_token=awieojfaowiejacijobhiwaef'}}
+ remove_passwords_in_dict(d, "%%")
+ assert d["info"]["password"] == "%%"
+
  def test_match_regexps(self):
  def only_results(x):
  return [v for k, v in x]