1- from typing import Tuple , Iterator , Optional , Union
1+ from typing import Sequence , Tuple , Iterator , Optional , Union
22
33from .tracking import disable_tracking
44from .databases .connect import connect
55from .databases .database_types import DbKey , DbTime , DbPath
6- from .diff_tables import TableSegment , TableDiffer , DEFAULT_BISECTION_THRESHOLD , DEFAULT_BISECTION_FACTOR
6+ from .diff_tables import Algorithm
7+ from .hashdiff_tables import HashDiffer , DEFAULT_BISECTION_THRESHOLD , DEFAULT_BISECTION_FACTOR
8+ from .joindiff_tables import JoinDiffer
9+ from .table_segment import TableSegment
710
811
def connect_to_table(
    db_info: Union[str, dict],
    table_name: Union[DbPath, str],
    key_columns: Union[str, Sequence[str]] = ("id",),
    thread_count: Optional[int] = 1,
    **kwargs,
) -> TableSegment:
    """Connects to the given database, and creates a TableSegment instance

    Parameters:
        db_info: Either a URI string, or a dict of connection options.
        table_name: Name of the table as a string, or a tuple that signifies the path.
        key_columns: Names of the key columns. A single column name may also be
            given as a plain string; it is wrapped into a one-element tuple.
        thread_count: Number of threads for this connection (only if using a threadpooled db implementation)
        **kwargs: Passed through unchanged to the TableSegment constructor.

    Returns:
        The TableSegment created from the connection, the table path and the key columns.

    See Also:
        :meth:`connect`
    """
    # A bare string means one column name, not a sequence of characters.
    if isinstance(key_columns, str):
        key_columns = (key_columns,)

    db = connect(db_info, thread_count=thread_count)

    # String table names are parsed by the database object; non-string paths are passed through as given.
    if isinstance(table_name, str):
        table_name = db.parse_table_name(table_name)

    return TableSegment(db, table_name, key_columns, **kwargs)
3139
3240
3341def diff_tables (
3442 table1 : TableSegment ,
3543 table2 : TableSegment ,
3644 * ,
3745 # Names of the key columns, which together uniquely identify each row (usually just id)
38- key_column : str = None ,
46+ key_columns : Sequence [ str ] = None ,
3947 # Name of updated column, which signals that rows changed (usually updated_at or last_update)
4048 update_column : str = None ,
4149 # Extra columns to compare
@@ -46,31 +54,63 @@ def diff_tables(
4654 # Start/end update_column values, used to restrict the segment
4755 min_update : DbTime = None ,
4856 max_update : DbTime = None ,
49- # Into how many segments to bisect per iteration
57+ # Algorithm
58+ algorithm : Algorithm = Algorithm .HASHDIFF ,
59+ # Into how many segments to bisect per iteration (hashdiff only)
5060 bisection_factor : int = DEFAULT_BISECTION_FACTOR ,
51- # When should we stop bisecting and compare locally (in row count)
61+ # When should we stop bisecting and compare locally (in row count; hashdiff only )
5262 bisection_threshold : int = DEFAULT_BISECTION_THRESHOLD ,
5363 # Enable/disable threaded diffing. Needed to take advantage of database threads.
5464 threaded : bool = True ,
5565 # Maximum size of each threadpool. None = auto. Only relevant when threaded is True.
5666 # There may be many pools, so number of actual threads can be a lot higher.
5767 max_threadpool_size : Optional [int ] = 1 ,
58- # Enable/disable debug prints
59- debug : bool = False ,
6068) -> Iterator :
61- """Efficiently finds the diff between table1 and table2.
69+ """Finds the diff between table1 and table2.
70+
71+ Parameters:
72+ key_columns (Tuple[str, ...]): Names of the key columns, which together uniquely identify each row (usually just id)
73+ update_column (str, optional): Name of updated column, which signals that rows changed.
74+ Usually updated_at or last_update. Used by `min_update` and `max_update`.
75+ extra_columns (Tuple[str, ...], optional): Extra columns to compare
76+ min_key (:data:`DbKey`, optional): Lowest key value, used to restrict the segment
77+ max_key (:data:`DbKey`, optional): Highest key value, used to restrict the segment
78+ min_update (:data:`DbTime`, optional): Lowest update_column value, used to restrict the segment
79+ max_update (:data:`DbTime`, optional): Highest update_column value, used to restrict the segment
80+ algorithm (:class:`Algorithm`): Which diffing algorithm to use (`HASHDIFF` or `JOINDIFF`)
81+ bisection_factor (int): Into how many segments to bisect per iteration. (Used when algorithm is `HASHDIFF`)
82+ bisection_threshold (Number): Minimal row count of segment to bisect, otherwise download
83+ and compare locally. (Used when algorithm is `HASHDIFF`).
84+ threaded (bool): Enable/disable threaded diffing. Needed to take advantage of database threads.
85+ max_threadpool_size (int): Maximum size of each threadpool. ``None`` means auto.
86+ Only relevant when `threaded` is ``True``.
87+ There may be many pools, so number of actual threads can be a lot higher.
88+
89+ Note:
90+ The following parameters are used to override the corresponding attributes of the given :class:`TableSegment` instances:
91+ `key_columns`, `update_column`, `extra_columns`, `min_key`, `max_key`.
92+ If different values are needed per table, it's possible to omit them here, and instead set
93+ them directly when creating each :class:`TableSegment`.
6294
6395 Example:
6496 >>> table1 = connect_to_table('postgresql:///', 'Rating', 'id')
6597 >>> list(diff_tables(table1, table1))
6698 []
6799
100+ See Also:
101+ :class:`TableSegment`
102+ :class:`HashDiffer`
103+ :class:`JoinDiffer`
104+
68105 """
106+ if isinstance (key_columns , str ):
107+ key_columns = (key_columns ,)
108+
69109 tables = [table1 , table2 ]
70110 override_attrs = {
71111 k : v
72112 for k , v in dict (
73- key_column = key_column ,
113+ key_columns = key_columns ,
74114 update_column = update_column ,
75115 extra_columns = extra_columns ,
76116 min_key = min_key ,
@@ -83,11 +123,20 @@ def diff_tables(
83123
84124 segments = [t .new (** override_attrs ) for t in tables ] if override_attrs else tables
85125
86- differ = TableDiffer (
87- bisection_factor = bisection_factor ,
88- bisection_threshold = bisection_threshold ,
89- debug = debug ,
90- threaded = threaded ,
91- max_threadpool_size = max_threadpool_size ,
92- )
126+ algorithm = Algorithm (algorithm )
127+ if algorithm == Algorithm .HASHDIFF :
128+ differ = HashDiffer (
129+ bisection_factor = bisection_factor ,
130+ bisection_threshold = bisection_threshold ,
131+ threaded = threaded ,
132+ max_threadpool_size = max_threadpool_size ,
133+ )
134+ elif algorithm == Algorithm .JOINDIFF :
135+ differ = JoinDiffer (
136+ threaded = threaded ,
137+ max_threadpool_size = max_threadpool_size ,
138+ )
139+ else :
140+ raise ValueError (f"Unknown algorithm: { algorithm } " )
141+
93142 return differ .diff_tables (* segments )
0 commit comments