@@ -146,7 +146,7 @@ def __normalize_non_column_identifier_name(
     ) -> str:
         # We force standard naming for non column names (see issue #1785)
         result = transform_standard_naming(input_name)
-        result = self.__normalize_naming_conventions(result)
+        result = self.__normalize_naming_conventions(result, is_column=False)
         if truncate:
             result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
         result = self.__normalize_identifier_case(result, is_quoted=False)
@@ -160,7 +160,7 @@ def __normalize_non_column_identifier_name(
     def __normalize_identifier_name(
         self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0
     ) -> str:
-        result = self.__normalize_naming_conventions(column_name)
+        result = self.__normalize_naming_conventions(column_name, is_column=True)
         if truncate:
             result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
         if self.needs_quotes(result):
@@ -189,14 +189,16 @@ def apply_quote(self, input: str) -> str:
             return f"quote('{input}')"
         return f"adapter.quote('{input}')"

-    def __normalize_naming_conventions(self, input_name: str) -> str:
+    def __normalize_naming_conventions(self, input_name: str, is_column: bool = False) -> str:
         result = input_name
         if self.destination_type.value == DestinationType.ORACLE.value:
             return transform_standard_naming(result)
         elif self.destination_type.value == DestinationType.BIGQUERY.value:
             # Can start with a number: datasetId, table
             # Cannot start with a number: column
             result = transform_standard_naming(result)
             doesnt_start_with_alphaunderscore = match("[^A-Za-z_]", result[0]) is not None
-            if doesnt_start_with_alphaunderscore:
+            if is_column and doesnt_start_with_alphaunderscore:
                 result = f"_{result}"
         return result

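The BigQuery branch above is the point of the change: dataset and table names may start with a digit, but column names may not, so only columns receive the leading-underscore fix. Below is a minimal, self-contained sketch of that rule; `normalize_bigquery_name` and the regex stand-in for `transform_standard_naming` are illustrative assumptions, not the actual Airbyte helpers.

from re import match, sub


def normalize_bigquery_name(input_name: str, is_column: bool = False) -> str:
    # Rough stand-in for transform_standard_naming: replace anything outside
    # [A-Za-z0-9_] with an underscore (an assumption about that helper).
    result = sub(r"[^A-Za-z0-9_]", "_", input_name)
    # Only columns need the prefix: BigQuery allows datasetId and table names
    # to start with a digit, but not column names.
    if is_column and match("[^A-Za-z_]", result[0]) is not None:
        result = f"_{result}"
    return result


assert normalize_bigquery_name("100x2002") == "100x2002"                   # table/dataset: kept as-is
assert normalize_bigquery_name("100x2002", is_column=True) == "_100x2002"  # column: prefixed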
@@ -98,6 +98,62 @@ def test_transform_standard_naming(input_str: str, expected: str):
     assert transform_standard_naming(input_str) == expected


+@pytest.mark.parametrize(
+    "input_str, destination_type, expected, expected_in_jinja",
+    [
+        # Case sensitive names
+        ("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        ("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"),
+        ("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        ("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"),
+        ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        # Reserved Word for BigQuery and MySQL only
+        ("Groups", "Postgres", "groups", "'groups'"),
+        ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
+        ("Groups", "Snowflake", "GROUPS", "'GROUPS'"),
+        ("Groups", "Redshift", "groups", "'groups'"),
+        ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
+        ("Groups", "MSSQL", "groups", "'groups'"),
+    ],
+)
+def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
+    t = DestinationType.from_string(destination_type)
+    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
+    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
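The cases above pin down the `in_jinja` contract: a name that needs quoting is wrapped in `adapter.quote(...)`, and that call must itself be wrapped in `{{ ... }}` when emitted outside a jinja block, while a plain identifier referenced inside jinja becomes a string literal. Here is a rough sketch of just the wrapping step; `wrap_for_jinja` and its `needs_quoting` flag are hypothetical, the real decision lives in `normalize_column_name` and `needs_quotes`, and case normalization (e.g. Redshift lowercasing) happens separately.

def wrap_for_jinja(name: str, needs_quoting: bool, in_jinja: bool) -> str:
    if needs_quoting:
        quoted = f"adapter.quote('{name}')"
        # Outside a jinja block, the adapter call itself must be templated.
        return quoted if in_jinja else "{{ " + quoted + " }}"
    # A bare identifier referenced inside jinja becomes a string literal.
    return f"'{name}'" if in_jinja else name


assert wrap_for_jinja("groups", needs_quoting=False, in_jinja=True) == "'groups'"
assert wrap_for_jinja("Identifier Name", needs_quoting=True, in_jinja=False) == "{{ adapter.quote('Identifier Name') }}"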


+@pytest.mark.parametrize(
+    "input_str, expected",
+    [
+        # below the limit
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"),
+        # at the limit
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"),
+        # over the limit
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"),
+        ("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"),
+    ],
+)
+def test_truncate_identifier(input_str: str, expected: str):
+    """
+    Rules about truncation; for example, both of these strings are too long for the postgres 64 limit:
+    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
+    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`
+
+    How to truncate (in the middle) is verified by these tests.
+    In this instance, both strings end up as `Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
+    and can potentially collide as table names.
+
+    Note that such collisions are not handled by `destination_name_transformer` but by the `stream_processor`.
+    """
+    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
+    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
+    assert name_transformer.truncate_identifier_name(input_str) == expected
+
+
 @pytest.mark.parametrize(
     "input_str, destination_type, expected, expected_column",
     [
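For reference, the truncation rule these tests verify keeps the head and tail of an identifier and collapses the middle. The sketch below reproduces the expected strings above; the 43-character effective limit and the exact head/tail split are inferred from the test data (the normalization apparently reserves headroom below Postgres's own limit), and the real `truncate_identifier_name` also accepts conflict parameters.

def truncate_in_middle(name: str, limit: int = 43) -> str:
    if len(name) <= limit:
        return name
    head = round(limit / 2 - 1)  # characters kept from the start
    tail = limit - head - 2      # characters kept from the end, after "__"
    return name[:head] + "__" + name[-tail:]


assert (
    truncate_in_middle("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii")
    == "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"
)
assert (
    truncate_in_middle("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii")
    == "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"
)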
@@ -117,7 +173,8 @@ def test_transform_standard_naming(input_str: str, expected: str):
("a-Unicode_name_文6", "MSSQL", "a_unicode_name__6", "{{ adapter.quote('a-Unicode_name_文6') }}"),
# Doesnt start with alpha or underscore
("100x2001", "Postgres", "100x2001", "{{ adapter.quote('100x2001') }}"),
("100x2002", "BigQuery", "_100x2002", "_100x2002"),
("100x2002", "BigQuery", "100x2002", "_100x2002"),
("文2_a-Unicode_name", "BigQuery", "_2_a_Unicode_name", "_2_a_Unicode_name"),
("100x2003", "Snowflake", "100x2003", "{{ adapter.quote('100x2003') }}"),
("100x2004", "Redshift", "100x2004", "{{ adapter.quote('100x2004') }}"),
("100x2005", "MySQL", "100x2005", "{{ adapter.quote('100x2005') }}"),
@@ -164,59 +221,3 @@ def test_normalize_name(input_str: str, destination_type: str, expected: str, expected_column: str):
     assert DestinationNameTransformer(t).normalize_schema_name(input_str) == expected
     assert DestinationNameTransformer(t).normalize_table_name(input_str) == expected
     assert DestinationNameTransformer(t).normalize_column_name(input_str) == expected_column


-@pytest.mark.parametrize(
-    "input_str, destination_type, expected, expected_in_jinja",
-    [
-        # Case sensitive names
-        ("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        ("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"),
-        ("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        ("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"),
-        ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        # Reserved Word for BigQuery and MySQL only
-        ("Groups", "Postgres", "groups", "'groups'"),
-        ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
-        ("Groups", "Snowflake", "GROUPS", "'GROUPS'"),
-        ("Groups", "Redshift", "groups", "'groups'"),
-        ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
-        ("Groups", "MSSQL", "groups", "'groups'"),
-    ],
-)
-def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
-    t = DestinationType.from_string(destination_type)
-    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
-    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
-
-
-@pytest.mark.parametrize(
-    "input_str, expected",
-    [
-        # below the limit
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"),
-        # at the limit
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"),
-        # over the limit
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"),
-        ("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"),
-    ],
-)
-def test_truncate_identifier(input_str: str, expected: str):
-    """
-    Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit:
-    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
-    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`
-
-    Deciding on how to truncate (in the middle) are being verified in these tests.
-    In this instance, both strings ends up as:`Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
-    and can potentially cause a collision in table names.
-
-    Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`.
-    """
-    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
-    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
-    assert name_transformer.truncate_identifier_name(input_str) == expected