lihp18
diff --git a/data_diff/databases/base.py b/data_diff/databases/base.py
Lines changed: 10 additions & 5 deletions
diff --git a/data_diff/databases/database_types.py b/data_diff/databases/database_types.py
Lines changed: 3 additions & 1 deletion
diff --git a/data_diff/databases/databricks.py b/data_diff/databases/databricks.py
Lines changed: 4 additions & 2 deletions
diff --git a/data_diff/table_segment.py b/data_diff/table_segment.py
Lines changed: 1 addition & 1 deletion
diff --git a/tests/test_diff_tables.py b/tests/test_diff_tables.py
Lines changed: 10 additions & 0 deletions
@@ -187,25 +187,30 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
  assert len(d) == len(rows)
  return d
 
- def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+ def _process_table_schema(
+ self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None
+ ):
  accept = {i.lower() for i in filter_columns}
 
  col_dict = {row[0]: self._parse_type(path, *row) for name, row in raw_schema.items() if name.lower() in accept}
 
- self._refine_coltypes(path, col_dict)
+ self._refine_coltypes(path, col_dict, where)
 
  # Return a dict of form {name: type} after normalization
  return col_dict
 
- def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType]):
- "Refine the types in the column dict, by querying the database for a sample of their values"
+ def _refine_coltypes(self, table_path: DbPath, col_dict: Dict[str, ColType], where: str = None):
+ """Refine the types in the column dict, by querying the database for a sample of their values
+
+ 'where' restricts the rows to be sampled.
+ """
 
  text_columns = [k for k, v in col_dict.items() if isinstance(v, Text)]
  if not text_columns:
  return
 
  fields = [self.normalize_uuid(c, String_UUID()) for c in text_columns]
- samples_by_row = self.query(Select(fields, TableName(table_path), limit=16), list)
+ samples_by_row = self.query(Select(fields, TableName(table_path), limit=16, where=where and [where]), list)
  if not samples_by_row:
  raise ValueError(f"Table {table_path} is empty.")
 
 
@@ -177,7 +177,9 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
  ...
 
  @abstractmethod
- def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+ def _process_table_schema(
+ self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None
+ ):
  """Process the result of query_table_schema().
 
  Done in a separate step, to minimize the amount of processed columns.
 
@@ -83,7 +83,9 @@ def query_table_schema(self, path: DbPath) -> Dict[str, tuple]:
  assert len(d) == len(rows)
  return d
 
- def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str]):
+ def _process_table_schema(
+ self, path: DbPath, raw_schema: Dict[str, tuple], filter_columns: Sequence[str], where: str = None
+ ):
  accept = {i.lower() for i in filter_columns}
  rows = [row for name, row in raw_schema.items() if name.lower() in accept]
 
@@ -115,7 +117,7 @@ def _process_table_schema(self, path: DbPath, raw_schema: Dict[str, tuple], filt
 
  col_dict: Dict[str, ColType] = {row[0]: self._parse_type(path, *row) for row in resulted_rows}
 
- self._refine_coltypes(path, col_dict)
+ self._refine_coltypes(path, col_dict, where)
  return col_dict
 
  def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
 
@@ -111,7 +111,7 @@ def _normalize_column(self, name: str, template: str = None) -> str:
  return self.database.normalize_value_by_type(col, col_type)
 
  def _with_raw_schema(self, raw_schema: dict) -> "TableSegment":
- schema = self.database._process_table_schema(self.table_path, raw_schema, self._relevant_columns)
+ schema = self.database._process_table_schema(self.table_path, raw_schema, self._relevant_columns, self.where)
  return self.new(_schema=create_schema(self.database, self.table_path, schema, self.case_sensitive))
 
  def with_schema(self) -> "TableSegment":
 
@@ -443,6 +443,16 @@ def test_string_keys(self):
 
  self.assertRaises(ValueError, list, differ.diff_tables(self.a, self.b))
 
+ def test_where_sampling(self):
+ a = self.a.replace(where="1=1")
+
+ differ = TableDiffer()
+ diff = list(differ.diff_tables(a, self.b))
+ self.assertEqual(diff, [("-", (str(self.new_uuid), "This one is different"))])
+
+ a_empty = self.a.replace(where="1=0")
+ self.assertRaises(ValueError, list, differ.diff_tables(a_empty, self.b))
+
 
 @test_per_database
 class TestAlphanumericKeys(TestPerDatabase):