Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.
64 changes: 60 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,18 +81,73 @@ More information about the algorithm and performance considerations can be found
pip install data-diff 'data-diff[postgresql,snowflake]' -U
```

Run `data-diff` with connection URIs. In the following example, we compare tables between PostgreSQL and Snowflake using hashdiff algorithm:
```
Run `data-diff` with connection URIs. In the following example, we compare tables between PostgreSQL and Snowflake using the hashdiff algorithm:

```bash
data-diff \
postgresql://<username>:'<password>'@localhost:5432/<database> \
<table> \
"snowflake://<username>:<password>@<password>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
"snowflake://<username>:<password>@<account>/<DATABASE>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<ROLE>" \
<TABLE> \
-k <primary key column> \
-c <columns to compare> \
-w <filter condition>
```

Run `data-diff` with a `toml` configuration file. In the following example, we compare tables between MotherDuck (hosted DuckDB) and Snowflake using the hashdiff algorithm:

```toml
## DATABASE CONNECTION ##
[database.duckdb_connection]
driver = "duckdb"
# filepath = "datafold_demo.duckdb" # local duckdb file example
# filepath = "md:" # default motherduck connection example
filepath = "md:datafold_demo?motherduck_token=${motherduck_token}" # API token recommended for motherduck connection
database = "datafold_demo"

[database.snowflake_connection]
driver = "snowflake"
database = "DEV"
user = "sung"
password = "${SNOWFLAKE_PASSWORD}" # or "<PASSWORD_STRING>"
# the info below is only required for snowflake
account = "${ACCOUNT}" # by33919
schema = "DEVELOPMENT"
warehouse = "DEMO"
role = "DEMO_ROLE"

## RUN PARAMETERS ##
[run.default]
verbose = true

## EXAMPLE DATA DIFF JOB ##
[run.demo_xdb_diff]
# Source 1 ("left")
1.database = "duckdb_connection"
1.table = "development.raw_orders"

# Source 2 ("right")
2.database = "snowflake_connection"
2.table = "RAW_ORDERS" # note that snowflake table names are case-sensitive

verbose = false
```

```bash
# export relevant environment variables, example below
export motherduck_token=<MOTHERDUCK_TOKEN>

# run the configured data-diff job
data-diff --conf datadiff.toml \
--run demo_xdb_diff \
-k "id" \
-c status

# output example
- 1, completed
+ 1, returned
```

Check out the [documentation](https://docs.datafold.com/reference/open_source/cli) for the full command reference.


Expand All @@ -106,13 +161,14 @@ Check out [documentation](https://docs.datafold.com/reference/open_source/cli) f
| Snowflake | 🟢 | `"snowflake://<user>[:<password>]@<account>/<database>/<SCHEMA>?warehouse=<WAREHOUSE>&role=<role>[&authenticator=externalbrowser]"` |
| BigQuery | 🟢 | `bigquery://<project>/<dataset>` |
| Redshift | 🟢 | `redshift://<username>:<password>@<hostname>:5439/<database>` |
| DuckDB | 🟢 | `duckdb://<dbname>@<filepath>` |
| MotherDuck | 🟢 | `duckdb://<dbname>@<filepath>` |
| Oracle | 🟡 | `oracle://<username>:<password>@<hostname>/service_or_sid` |
| Presto | 🟡 | `presto://<username>:<password>@<hostname>:8080/<database>` |
| Databricks | 🟡 | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` |
| Trino | 🟡 | `trino://<username>:<password>@<hostname>:8080/<database>` |
| Clickhouse | 🟡 | `clickhouse://<username>:<password>@<hostname>:9000/<database>` |
| Vertica | 🟡 | `vertica://<username>:<password>@<hostname>:5433/<database>` |
| DuckDB | 🟡 | |
| ElasticSearch | 📝 | |
| Planetscale | 📝 | |
| Pinot | 📝 | |
Expand Down
4 changes: 3 additions & 1 deletion data_diff/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,12 @@ def _get_log_handlers(is_dbt: Optional[bool] = False) -> Dict[str, logging.Handl
return handlers


def _remove_passwords_in_dict(d: dict):
def _remove_passwords_in_dict(d: dict) -> None:
for k, v in d.items():
if k == "password":
d[k] = "*" * len(v)
elif k == "filepath" and "motherduck_token=" in v:
d[k] = v.split("motherduck_token=")[0] + "motherduck_token=*************"
elif isinstance(v, dict):
_remove_passwords_in_dict(v)
elif k.startswith("database"):
Expand Down
33 changes: 17 additions & 16 deletions data_diff/dbt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,8 +376,8 @@ def set_connection(self):

conn_info = {
"driver": conn_type,
"project": credentials.get("project"),
"dataset": credentials.get("dataset"),
"project": credentials.get("project") or credentials.get("database"),
"dataset": credentials.get("dataset") or credentials.get("schema"),
}

self.threads = credentials.get("threads")
Expand All @@ -402,13 +402,13 @@ def set_connection(self):
"user": credentials.get("user"),
"password": credentials.get("password") or credentials.get("pass"),
"port": credentials.get("port"),
"dbname": credentials.get("dbname"),
"dbname": credentials.get("dbname") or credentials.get("database"),
}
self.threads = credentials.get("threads")
elif conn_type == "databricks":
conn_info = {
"driver": conn_type,
"catalog": credentials.get("catalog"),
"catalog": credentials.get("catalog") or credentials.get("database"),
"server_hostname": credentials.get("host"),
"http_path": credentials.get("http_path"),
"schema": credentials.get("schema"),
Expand All @@ -420,7 +420,7 @@ def set_connection(self):
"driver": "postgresql",
"host": credentials.get("host"),
"user": credentials.get("user"),
"password": credentials.get("password"),
"password": credentials.get("password") or credentials.get("pass"),
"port": credentials.get("port"),
"dbname": credentials.get("dbname") or credentials.get("database"),
}
Expand Down Expand Up @@ -483,19 +483,20 @@ def get_unique_columns(self) -> Dict[str, Set[str]]:
continue

model_node = manifest.nodes[uid]
if node.test_metadata.name == "unique":
column_name: str = node.test_metadata.kwargs["column_name"]
for col in self._parse_concat_pk_definition(column_name):
if model_node is None or col in model_node.columns:
# skip anything that is not a column.
# for example, string literals used in concat
# like "pk1 || '-' || pk2"
if node.test_metadata:
if node.test_metadata.name == "unique":
column_name: str = node.test_metadata.kwargs["column_name"]
for col in self._parse_concat_pk_definition(column_name):
if model_node is None or col in model_node.columns:
# skip anything that is not a column.
# for example, string literals used in concat
# like "pk1 || '-' || pk2"
cols_by_uid[uid].add(col)

elif node.test_metadata.name == "unique_combination_of_columns":
for col in node.test_metadata.kwargs["combination_of_columns"]:
cols_by_uid[uid].add(col)

if node.test_metadata.name == "unique_combination_of_columns":
for col in node.test_metadata.kwargs["combination_of_columns"]:
cols_by_uid[uid].add(col)

except (KeyError, IndexError, TypeError) as e:
logger.warning("Failure while finding unique cols: %s", e)

Expand Down
2 changes: 1 addition & 1 deletion data_diff/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.9.2"
__version__ = "0.9.3"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "data-diff"
version = "0.9.2"
version = "0.9.3"
description = "Command-line tool and Python library to efficiently diff rows across two different databases."
authors = ["Datafold <data-diff@datafold.com>"]
license = "MIT"
Expand Down
3 changes: 2 additions & 1 deletion tests/test_joindiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
db.MySQL,
db.Snowflake,
db.BigQuery,
db.DuckDB,
db.Oracle,
db.Redshift,
db.Presto,
Expand All @@ -32,7 +33,7 @@
test_each_database = test_each_database_in_list(TEST_DATABASES)


@test_each_database_in_list({db.Snowflake, db.BigQuery})
@test_each_database_in_list({db.Snowflake, db.BigQuery, db.DuckDB})
class TestCompositeKey(DiffTestCase):
src_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime}
dst_schema = {"id": int, "userid": int, "movieid": int, "rating": float, "timestamp": datetime}
Expand Down
7 changes: 7 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,18 @@ def test_remove_passwords_in_dict(self):
remove_passwords_in_dict(d, "$$$$")
assert d["database_url"] == "mysql://user:$$$$@localhost/db"

# TODO: add a database url test for motherduck tokens

# Test replacing password in nested dictionary
d = {"info": {"password": "mypassword"}}
remove_passwords_in_dict(d, "%%")
assert d["info"]["password"] == "%%"

# Test replacing a motherduck token in nested dictionary
d = {'database1': {'driver': 'duckdb', 'filepath':'md:datafold_demo?motherduck_token=awieojfaowiejacijobhiwaef'}}
remove_passwords_in_dict(d, "%%")
assert d["info"]["password"] == "%%"

def test_match_regexps(self):
def only_results(x):
return [v for k, v in x]
Expand Down