Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 3a53501

Browse files
committed
tests: database_types dual-use for benchmarks
presto: fix test suite; diff_tables: fix not double recursing; tests: fix presto create indexes, etc. (haven't tested all dbs); more
1 parent 75b8c5d commit 3a53501

File tree

7 files changed

+565
-137
lines changed

7 files changed

+565
-137
lines changed

data_diff/databases/presto.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,7 +50,13 @@ def to_string(self, s: str):
5050

5151
def _query(self, sql_code: str) -> list:
5252
"Uses the standard SQL cursor interface"
53-
return _query_conn(self._conn, sql_code)
53+
c = self._conn.cursor()
54+
c.execute(sql_code)
55+
if sql_code.lower().startswith("select"):
56+
return c.fetchall()
57+
# Required for the query to actually run 🤯
58+
if re.match(r"(insert|create|truncate|drop)", sql_code, re.IGNORECASE):
59+
return c.fetchone()
5460

5561
def close(self):
5662
self._conn.close()

data_diff/diff_tables.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -403,6 +403,13 @@ def _diff_tables(self, table1, table2, level=0, segment_index=None, segment_coun
403403
f"size: {table2.max_key-table1.min_key}"
404404
)
405405

406+
# The entire segment wasn't below the threshold, but the next set of
407+
# segments might be. In that case, it's useless to checksum them.
408+
max_rows_from_keys = max(table1.max_key - table1.min_key, table2.max_key - table2.min_key)
409+
if max_rows_from_keys < self.bisection_threshold:
410+
yield from self._bisect_and_diff_tables(table1, table2, level=level, max_rows=max_rows_from_keys)
411+
return
412+
406413
(count1, checksum1), (count2, checksum2) = self._threaded_call("count_and_checksum", [table1, table2])
407414

408415
if count1 == 0 and count2 == 0:

data_diff/sql.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -115,7 +115,10 @@ class Checksum(Sql):
115115

116116
def compile(self, c: Compiler):
117117
compiled_exprs = ", ".join(map(c.compile, self.exprs))
118-
expr = f"concat({compiled_exprs})"
118+
expr = compiled_exprs
119+
if len(self.exprs) > 1:
120+
expr = f"concat({compiled_exprs})"
121+
119122
md5 = c.database.md5_to_int(expr)
120123
return f"sum({md5})"
121124

dev/presto-conf/standalone/catalog/postgresql.properties

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,3 +2,4 @@ connector.name=postgresql
22
connection-url=jdbc:postgresql://postgres:5432/postgres
33
connection-user=postgres
44
connection-password=Password1
5+
allow-drop-table=true

docker-compose.yml

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,23 @@ services:
44
postgres:
55
container_name: postgresql
66
image: postgres:14.1-alpine
7+
shm_size: 1g
78
# work_mem: less tmp files
89
# maintenance_work_mem: improve table-level op perf
910
# max_wal_size: allow more time before merging to heap
1011
command: >
11-
-c work_mem=1GB
12-
-c maintenance_work_mem=1GB
13-
-c max_wal_size=8GB
12+
-c shared_buffers=16GB
13+
-c effective_cache_size=48GB
14+
-c maintenance_work_mem=2GB
15+
-c checkpoint_completion_target=0.9
16+
-c default_statistics_target=100
17+
-c random_page_cost=1.1
18+
-c effective_io_concurrency=200
19+
-c work_mem=20971kB
20+
-c max_worker_processes=14
21+
-c max_parallel_workers_per_gather=4
22+
-c max_parallel_workers=14
23+
-c max_parallel_maintenance_workers=4
1424
restart: always
1525
volumes:
1626
- postgresql-data:/var/lib/postgresql/data:delegated

tests/common.py

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,21 @@
22

33
from data_diff import databases as db
44
import logging
5+
import os
56

6-
logging.basicConfig(level=logging.INFO)
7+
DEFAULT_N_SAMPLES = 50
8+
N_SAMPLES = int(os.environ.get('N_SAMPLES', DEFAULT_N_SAMPLES))
9+
BENCHMARK = os.environ.get('BENCHMARK', False)
10+
11+
level = logging.WARN
12+
if os.environ.get('DEBUG', False):
13+
level = logging.DEBUG
14+
15+
logging.basicConfig(level=level)
16+
logging.getLogger("diff_tables").setLevel(level)
17+
logging.getLogger("database").setLevel(level)
18+
if BENCHMARK:
19+
logging.getLogger("benchmark").setLevel(logging.DEBUG)
720

821
TEST_MYSQL_CONN_STRING: str = "mysql://mysql:Password1@localhost/mysql"
922
TEST_POSTGRESQL_CONN_STRING: str = None

0 commit comments

Comments
 (0)