@@ -146,7 +146,7 @@ def __normalize_non_column_identifier_name(
     ) -> str:
         # We force standard naming for non column names (see issue #1785)
         result = transform_standard_naming(input_name)
-        result = self.__normalize_naming_conventions(result)
+        result = self.__normalize_naming_conventions(result, is_column=False)
         if truncate:
             result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
         result = self.__normalize_identifier_case(result, is_quoted=False)
@@ -160,7 +160,7 @@ def __normalize_non_column_identifier_name(
     def __normalize_identifier_name(
         self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0
     ) -> str:
-        result = self.__normalize_naming_conventions(column_name)
+        result = self.__normalize_naming_conventions(column_name, is_column=True)
         if truncate:
             result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
         if self.needs_quotes(result):
@@ -189,14 +189,16 @@ def apply_quote(self, input: str) -> str:
             return f"quote('{input}')"
         return f"adapter.quote('{input}')"

-    def __normalize_naming_conventions(self, input_name: str) -> str:
+    def __normalize_naming_conventions(self, input_name: str, is_column: bool = False) -> str:
         result = input_name
         if self.destination_type.value == DestinationType.ORACLE.value:
             return transform_standard_naming(result)
         elif self.destination_type.value == DestinationType.BIGQUERY.value:
             # Can start with a number: datasetId, table
             # Cannot start with a number: column
             result = transform_standard_naming(result)
             doesnt_start_with_alphaunderscore = match("[^A-Za-z_]", result[0]) is not None
-            if doesnt_start_with_alphaunderscore:
+            if is_column and doesnt_start_with_alphaunderscore:
                 result = f"_{result}"
         return result

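The BigQuery branch above is the point of the change: dataset and table names may start with a digit, but column names may not, so only columns receive the leading-underscore fix. Below is a minimal, self-contained sketch of that rule; `normalize_bigquery_name` and the regex stand-in for `transform_standard_naming` are illustrative assumptions, not the actual Airbyte helpers.

from re import match, sub


def normalize_bigquery_name(input_name: str, is_column: bool = False) -> str:
    # Rough stand-in for transform_standard_naming: replace anything outside
    # [A-Za-z0-9_] with an underscore (an assumption about that helper).
    result = sub(r"[^A-Za-z0-9_]", "_", input_name)
    # Only columns need the prefix: BigQuery allows datasetId and table names
    # to start with a digit, but not column names.
    if is_column and match("[^A-Za-z_]", result[0]) is not None:
        result = f"_{result}"
    return result


assert normalize_bigquery_name("100x2002") == "100x2002"                   # table/dataset: kept as-is
assert normalize_bigquery_name("100x2002", is_column=True) == "_100x2002"  # column: prefixed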
@@ -98,6 +98,62 @@ def test_transform_standard_naming(input_str: str, expected: str):
     assert transform_standard_naming(input_str) == expected


+@pytest.mark.parametrize(
+    "input_str, destination_type, expected, expected_in_jinja",
+    [
+        # Case sensitive names
+        ("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        ("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"),
+        ("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        ("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"),
+        ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
+        # Reserved Word for BigQuery and MySQL only
+        ("Groups", "Postgres", "groups", "'groups'"),
+        ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
+        ("Groups", "Snowflake", "GROUPS", "'GROUPS'"),
+        ("Groups", "Redshift", "groups", "'groups'"),
+        ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
+        ("Groups", "MSSQL", "groups", "'groups'"),
+    ],
+)
+def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
+    t = DestinationType.from_string(destination_type)
+    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
+    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
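The cases above pin down the `in_jinja` contract: a name that needs quoting is wrapped in `adapter.quote(...)`, and that call must itself be wrapped in `{{ ... }}` when emitted outside a jinja block, while a plain identifier referenced inside jinja becomes a string literal. Here is a rough sketch of just the wrapping step; `wrap_for_jinja` and its `needs_quoting` flag are hypothetical, the real decision lives in `normalize_column_name` and `needs_quotes`, and case normalization (e.g. Redshift lowercasing) happens separately.

def wrap_for_jinja(name: str, needs_quoting: bool, in_jinja: bool) -> str:
    if needs_quoting:
        quoted = f"adapter.quote('{name}')"
        # Outside a jinja block, the adapter call itself must be templated.
        return quoted if in_jinja else "{{ " + quoted + " }}"
    # A bare identifier referenced inside jinja becomes a string literal.
    return f"'{name}'" if in_jinja else name


assert wrap_for_jinja("groups", needs_quoting=False, in_jinja=True) == "'groups'"
assert wrap_for_jinja("Identifier Name", needs_quoting=True, in_jinja=False) == "{{ adapter.quote('Identifier Name') }}"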


+@pytest.mark.parametrize(
+    "input_str, expected",
+    [
+        # below the limit
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"),
+        # at the limit
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"),
+        # over the limit
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
+        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"),
+        ("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"),
+    ],
+)
+def test_truncate_identifier(input_str: str, expected: str):
+    """
+    Rules about truncation; for example, both of these strings are too long for the postgres 64 limit:
+    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
+    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`
+
+    How to truncate (in the middle) is verified by these tests.
+    In this instance, both strings end up as `Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
+    and can potentially collide as table names.
+
+    Note that such collisions are not handled by `destination_name_transformer` but by the `stream_processor`.
+    """
+    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
+    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
+    assert name_transformer.truncate_identifier_name(input_str) == expected
+
+
 @pytest.mark.parametrize(
     "input_str, destination_type, expected, expected_column",
     [
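For reference, the truncation rule these tests verify keeps the head and tail of an identifier and collapses the middle. The sketch below reproduces the expected strings above; the 43-character effective limit and the exact head/tail split are inferred from the test data (the normalization apparently reserves headroom below Postgres's own limit), and the real `truncate_identifier_name` also accepts conflict parameters.

def truncate_in_middle(name: str, limit: int = 43) -> str:
    if len(name) <= limit:
        return name
    head = round(limit / 2 - 1)  # characters kept from the start
    tail = limit - head - 2      # characters kept from the end, after "__"
    return name[:head] + "__" + name[-tail:]


assert (
    truncate_in_middle("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii")
    == "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"
)
assert (
    truncate_in_middle("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii")
    == "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"
)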
@@ -117,7 +173,8 @@ def test_transform_standard_naming(input_str: str, expected: str):
("a-Unicode_name_文6", "MSSQL", "a_unicode_name__6", "{{ adapter.quote('a-Unicode_name_文6') }}"),
# Doesnt start with alpha or underscore
("100x2001", "Postgres", "100x2001", "{{ adapter.quote('100x2001') }}"),
("100x2002", "BigQuery", "_100x2002", "_100x2002"),
("100x2002", "BigQuery", "100x2002", "_100x2002"),
("文2_a-Unicode_name", "BigQuery", "_2_a_Unicode_name", "_2_a_Unicode_name"),
("100x2003", "Snowflake", "100x2003", "{{ adapter.quote('100x2003') }}"),
("100x2004", "Redshift", "100x2004", "{{ adapter.quote('100x2004') }}"),
("100x2005", "MySQL", "100x2005", "{{ adapter.quote('100x2005') }}"),
@@ -164,59 +221,3 @@ def test_normalize_name(input_str: str, destination_type: str, expected: str, expected_column: str):
     assert DestinationNameTransformer(t).normalize_schema_name(input_str) == expected
     assert DestinationNameTransformer(t).normalize_table_name(input_str) == expected
     assert DestinationNameTransformer(t).normalize_column_name(input_str) == expected_column


-@pytest.mark.parametrize(
-    "input_str, destination_type, expected, expected_in_jinja",
-    [
-        # Case sensitive names
-        ("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        ("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"),
-        ("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        ("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"),
-        ("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        ("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
-        # Reserved Word for BigQuery and MySQL only
-        ("Groups", "Postgres", "groups", "'groups'"),
-        ("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
-        ("Groups", "Snowflake", "GROUPS", "'GROUPS'"),
-        ("Groups", "Redshift", "groups", "'groups'"),
-        ("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
-        ("Groups", "MSSQL", "groups", "'groups'"),
-    ],
-)
-def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
-    t = DestinationType.from_string(destination_type)
-    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
-    assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
-
-
-@pytest.mark.parametrize(
-    "input_str, expected",
-    [
-        # below the limit
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"),
-        # at the limit
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"),
-        # over the limit
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
-        ("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"),
-        ("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"),
-    ],
-)
-def test_truncate_identifier(input_str: str, expected: str):
-    """
-    Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit:
-    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
-    - `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`
-
-    Deciding on how to truncate (in the middle) are being verified in these tests.
-    In this instance, both strings ends up as:`Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
-    and can potentially cause a collision in table names.
-
-    Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`.
-    """
-    name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
-    print(f"Truncating from #{len(input_str)} to #{len(expected)}")
-    assert name_transformer.truncate_identifier_name(input_str) == expected