Skip to content

Commit b447bb5

Browse files
authored
🐛 normalization for bigquery: allow datasetId and table to start with number (#9341)
* use unchanged dataset_id if it starts with a number * Can start with number: datasetId, table Can not start with number: column (added _ before name) * updated normalization container version * updated normalization container version
1 parent 66137f6 commit b447bb5

File tree

5 files changed

+91
-87
lines changed

5 files changed

+91
-87
lines changed

airbyte-integrations/bases/base-normalization/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,5 @@ WORKDIR /airbyte
2828
ENV AIRBYTE_ENTRYPOINT "/airbyte/entrypoint.sh"
2929
ENTRYPOINT ["/airbyte/entrypoint.sh"]
3030

31-
LABEL io.airbyte.version=0.1.65
31+
LABEL io.airbyte.version=0.1.66
3232
LABEL io.airbyte.name=airbyte/normalization

airbyte-integrations/bases/base-normalization/normalization/transform_catalog/destination_name_transformer.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def __normalize_non_column_identifier_name(
146146
) -> str:
147147
# We force standard naming for non column names (see issue #1785)
148148
result = transform_standard_naming(input_name)
149-
result = self.__normalize_naming_conventions(result)
149+
result = self.__normalize_naming_conventions(result, is_column=False)
150150
if truncate:
151151
result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
152152
result = self.__normalize_identifier_case(result, is_quoted=False)
@@ -160,7 +160,7 @@ def __normalize_non_column_identifier_name(
160160
def __normalize_identifier_name(
161161
self, column_name: str, in_jinja: bool = False, truncate: bool = True, conflict: bool = False, conflict_level: int = 0
162162
) -> str:
163-
result = self.__normalize_naming_conventions(column_name)
163+
result = self.__normalize_naming_conventions(column_name, is_column=True)
164164
if truncate:
165165
result = self.truncate_identifier_name(input_name=result, conflict=conflict, conflict_level=conflict_level)
166166
if self.needs_quotes(result):
@@ -189,14 +189,16 @@ def apply_quote(self, input: str) -> str:
189189
return f"quote('{input}')"
190190
return f"adapter.quote('{input}')"
191191

192-
def __normalize_naming_conventions(self, input_name: str) -> str:
192+
def __normalize_naming_conventions(self, input_name: str, is_column: bool = False) -> str:
193193
result = input_name
194194
if self.destination_type.value == DestinationType.ORACLE.value:
195195
return transform_standard_naming(result)
196196
elif self.destination_type.value == DestinationType.BIGQUERY.value:
197+
# Can start with number: datasetId, table
198+
# Can not start with number: column
197199
result = transform_standard_naming(result)
198200
doesnt_start_with_alphaunderscore = match("[^A-Za-z_]", result[0]) is not None
199-
if doesnt_start_with_alphaunderscore:
201+
if is_column and doesnt_start_with_alphaunderscore:
200202
result = f"_{result}"
201203
return result
202204

airbyte-integrations/bases/base-normalization/unit_tests/test_destination_name_transformer.py

Lines changed: 58 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,62 @@ def test_transform_standard_naming(input_str: str, expected: str):
9898
assert transform_standard_naming(input_str) == expected
9999

100100

101+
@pytest.mark.parametrize(
102+
"input_str, destination_type, expected, expected_in_jinja",
103+
[
104+
# Case sensitive names
105+
("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
106+
("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"),
107+
("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
108+
("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"),
109+
("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
110+
("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
111+
# Reserved Word for BigQuery and MySQL only
112+
("Groups", "Postgres", "groups", "'groups'"),
113+
("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
114+
("Groups", "Snowflake", "GROUPS", "'GROUPS'"),
115+
("Groups", "Redshift", "groups", "'groups'"),
116+
("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
117+
("Groups", "MSSQL", "groups", "'groups'"),
118+
],
119+
)
120+
def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
121+
t = DestinationType.from_string(destination_type)
122+
assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
123+
assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
124+
125+
126+
@pytest.mark.parametrize(
127+
"input_str, expected",
128+
[
129+
# below the limit
130+
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"),
131+
# at the limit
132+
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"),
133+
# over the limit
134+
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
135+
("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
136+
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"),
137+
("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"),
138+
],
139+
)
140+
def test_truncate_identifier(input_str: str, expected: str):
141+
"""
142+
Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit:
143+
- `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
144+
- `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`
145+
146+
Deciding on how to truncate (in the middle) are being verified in these tests.
147+
In this instance, both strings ends up as:`Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
148+
and can potentially cause a collision in table names.
149+
150+
Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`.
151+
"""
152+
name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
153+
print(f"Truncating from #{len(input_str)} to #{len(expected)}")
154+
assert name_transformer.truncate_identifier_name(input_str) == expected
155+
156+
101157
@pytest.mark.parametrize(
102158
"input_str, destination_type, expected, expected_column",
103159
[
@@ -117,7 +173,8 @@ def test_transform_standard_naming(input_str: str, expected: str):
117173
("a-Unicode_name_文6", "MSSQL", "a_unicode_name__6", "{{ adapter.quote('a-Unicode_name_文6') }}"),
118174
# Doesnt start with alpha or underscore
119175
("100x2001", "Postgres", "100x2001", "{{ adapter.quote('100x2001') }}"),
120-
("100x2002", "BigQuery", "_100x2002", "_100x2002"),
176+
("100x2002", "BigQuery", "100x2002", "_100x2002"),
177+
("文2_a-Unicode_name", "BigQuery", "_2_a_Unicode_name", "_2_a_Unicode_name"),
121178
("100x2003", "Snowflake", "100x2003", "{{ adapter.quote('100x2003') }}"),
122179
("100x2004", "Redshift", "100x2004", "{{ adapter.quote('100x2004') }}"),
123180
("100x2005", "MySQL", "100x2005", "{{ adapter.quote('100x2005') }}"),
@@ -164,59 +221,3 @@ def test_normalize_name(input_str: str, destination_type: str, expected: str, ex
164221
assert DestinationNameTransformer(t).normalize_schema_name(input_str) == expected
165222
assert DestinationNameTransformer(t).normalize_table_name(input_str) == expected
166223
assert DestinationNameTransformer(t).normalize_column_name(input_str) == expected_column
167-
168-
169-
@pytest.mark.parametrize(
170-
"input_str, destination_type, expected, expected_in_jinja",
171-
[
172-
# Case sensitive names
173-
("Identifier Name", "Postgres", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
174-
("Identifier Name", "BigQuery", "Identifier_Name", "'Identifier_Name'"),
175-
("Identifier Name", "Snowflake", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
176-
("Identifier Name", "Redshift", "{{ adapter.quote('identifier name') }}", "adapter.quote('identifier name')"),
177-
("Identifier Name", "MySQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
178-
("Identifier Name", "MSSQL", "{{ adapter.quote('Identifier Name') }}", "adapter.quote('Identifier Name')"),
179-
# Reserved Word for BigQuery and MySQL only
180-
("Groups", "Postgres", "groups", "'groups'"),
181-
("Groups", "BigQuery", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
182-
("Groups", "Snowflake", "GROUPS", "'GROUPS'"),
183-
("Groups", "Redshift", "groups", "'groups'"),
184-
("Groups", "MySQL", "{{ adapter.quote('Groups') }}", "adapter.quote('Groups')"),
185-
("Groups", "MSSQL", "groups", "'groups'"),
186-
],
187-
)
188-
def test_normalize_column_name(input_str: str, destination_type: str, expected: str, expected_in_jinja: str):
189-
t = DestinationType.from_string(destination_type)
190-
assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=False) == expected
191-
assert DestinationNameTransformer(t).normalize_column_name(input_str, in_jinja=True) == expected_in_jinja
192-
193-
194-
@pytest.mark.parametrize(
195-
"input_str, expected",
196-
[
197-
# below the limit
198-
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh"),
199-
# at the limit
200-
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii", "Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iii"),
201-
# over the limit
202-
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
203-
("Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii", "Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii"),
204-
("Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii_Jjjj_Kkkk", "Aaaa_Bbbb_Cccc_Dddd___g_Hhhh_Iiii_Jjjj_Kkkk"),
205-
("ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz_0123456789", "ABCDEFGHIJKLMNOPQRST__qrstuvwxyz_0123456789"),
206-
],
207-
)
208-
def test_truncate_identifier(input_str: str, expected: str):
209-
"""
210-
Rules about truncations, for example for both of these strings which are too long for the postgres 64 limit:
211-
- `Aaaa_Bbbb_Cccc_Dddd_Eeee_Ffff_Gggg_Hhhh_Iiii`
212-
- `Aaaa_Bbbb_Cccc_Dddd_Eeee_a_very_long_name_Ffff_Gggg_Hhhh_Iiii`
213-
214-
Deciding on how to truncate (in the middle) are being verified in these tests.
215-
In this instance, both strings ends up as:`Aaaa_Bbbb_Cccc_Dddd___e_Ffff_Gggg_Hhhh_Iiii`
216-
and can potentially cause a collision in table names.
217-
218-
Note that dealing with such collisions is not part of `destination_name_transformer` but of the `stream_processor`.
219-
"""
220-
name_transformer = DestinationNameTransformer(DestinationType.POSTGRES)
221-
print(f"Truncating from #{len(input_str)} to #{len(expected)}")
222-
assert name_transformer.truncate_identifier_name(input_str) == expected

airbyte-workers/src/main/java/io/airbyte/workers/normalization/NormalizationRunnerFactory.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
public class NormalizationRunnerFactory {
1515

1616
public static final String BASE_NORMALIZATION_IMAGE_NAME = "airbyte/normalization";
17-
public static final String NORMALIZATION_VERSION = "0.1.65";
17+
public static final String NORMALIZATION_VERSION = "0.1.66";
1818

1919
static final Map<String, ImmutablePair<String, DefaultNormalizationRunner.DestinationType>> NORMALIZATION_MAPPING =
2020
ImmutableMap.<String, ImmutablePair<String, DefaultNormalizationRunner.DestinationType>>builder()

docs/understanding-airbyte/basic-normalization.md

Lines changed: 25 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -349,28 +349,29 @@ Note that Basic Normalization is packaged in a docker image `airbyte/normalizati
349349
Therefore, in order to "upgrade" to the desired normalization version, you need to use the corresponding Airbyte version that it's being released in:
350350

351351
| Airbyte Version | Normalization Version | Date | Pull Request | Subject |
352-
| :--- | :--- | :--- | :--- | :--- |
353-
| 0.35.13-alpha | 0.1.65 | 2021-01-28 | [\#9846](https://github.com/airbytehq/airbyte/pull/9846) | Tweak dbt multi-thread parameter down |
354-
| 0.35.12-alpha | 0.1.64 | 2021-01-28 | [\#9793](https://github.com/airbytehq/airbyte/pull/9793) | Support PEM format for ssh-tunnel keys |
355-
| 0.35.4-alpha | 0.1.63 | 2021-01-07 | [\#9301](https://github.com/airbytehq/airbyte/pull/9301) | Fix Snowflake prefix tables starting with numbers |
356-
| | 0.1.62 | 2021-01-07 | [\#9340](https://github.com/airbytehq/airbyte/pull/9340) | Use TCP-port support for clickhouse |
357-
| | 0.1.62 | 2021-01-07 | [\#9063](https://github.com/airbytehq/airbyte/pull/9063) | Change Snowflake-specific materialization settings |
358-
| | 0.1.62 | 2021-01-07 | [\#9317](https://github.com/airbytehq/airbyte/pull/9317) | Fix issue with quoted & case sensitive columns |
359-
| | 0.1.62 | 2021-01-07 | [\#9281](https://github.com/airbytehq/airbyte/pull/9281) | Fix SCD partition by float columns in BigQuery |
360-
| 0.32.11-alpha | 0.1.61 | 2021-12-02 | [\#8394](https://github.com/airbytehq/airbyte/pull/8394) | Fix incremental queries not updating empty tables |
361-
| | 0.1.61 | 2021-12-01 | [\#8378](https://github.com/airbytehq/airbyte/pull/8378) | Fix un-nesting queries and add proper ref hints |
362-
| 0.32.5-alpha | 0.1.60 | 2021-11-22 | [\#8088](https://github.com/airbytehq/airbyte/pull/8088) | Speed-up incremental queries for SCD table on Snowflake |
363-
| 0.30.32-alpha | 0.1.59 | 2021-11-08 | [\#7669](https://github.com/airbytehq/airbyte/pull/7169) | Fix nested incremental dbt |
364-
| 0.30.24-alpha | 0.1.57 | 2021-10-26 | [\#7162](https://github.com/airbytehq/airbyte/pull/7162) | Implement incremental dbt updates |
365-
| 0.30.16-alpha | 0.1.52 | 2021-10-07 | [\#6379](https://github.com/airbytehq/airbyte/pull/6379) | Handle empty string for date and date-time format |
366-
| | 0.1.51 | 2021-10-08 | [\#6799](https://github.com/airbytehq/airbyte/pull/6799) | Added support for ad\_cdc\_log\_pos while normalization |
367-
| | 0.1.50 | 2021-10-07 | [\#6079](https://github.com/airbytehq/airbyte/pull/6079) | Added support for MS SQL Server normalization |
368-
| | 0.1.49 | 2021-10-06 | [\#6709](https://github.com/airbytehq/airbyte/pull/6709) | Forward destination dataset location to dbt profiles |
369-
| 0.29.17-alpha | 0.1.47 | 2021-09-20 | [\#6317](https://github.com/airbytehq/airbyte/pull/6317) | MySQL: updated MySQL normalization with using SSH tunnel |
370-
| | 0.1.45 | 2021-09-18 | [\#6052](https://github.com/airbytehq/airbyte/pull/6052) | Snowflake: accept any date-time format |
371-
| 0.29.8-alpha | 0.1.40 | 2021-08-18 | [\#5433](https://github.com/airbytehq/airbyte/pull/5433) | Allow optional credentials\_json for BigQuery |
372-
| 0.29.5-alpha | 0.1.39 | 2021-08-11 | [\#4557](https://github.com/airbytehq/airbyte/pull/4557) | Handle date times and solve conflict name btw stream/field |
373-
| 0.28.2-alpha | 0.1.38 | 2021-07-28 | [\#5027](https://github.com/airbytehq/airbyte/pull/5027) | Handle quotes in column names when parsing JSON blob |
374-
| 0.27.5-alpha | 0.1.37 | 2021-07-22 | [\#3947](https://github.com/airbytehq/airbyte/pull/4881/) | Handle `NULL` cursor field values when deduping |
375-
| 0.27.2-alpha | 0.1.36 | 2021-07-09 | [\#3947](https://github.com/airbytehq/airbyte/pull/4163/) | Enable normalization for MySQL destination |
352+
|:----------------| :--- | :--- | :--- | :--- |
353+
| | 0.1.66 | 2022-02-04 | [\#9341](https://github.com/airbytehq/airbyte/pull/9341) | Fix normalization for bigquery datasetId and tables |
354+
| 0.35.13-alpha | 0.1.65 | 2021-01-28 | [\#9846](https://github.com/airbytehq/airbyte/pull/9846) | Tweak dbt multi-thread parameter down |
355+
| 0.35.12-alpha | 0.1.64 | 2021-01-28 | [\#9793](https://github.com/airbytehq/airbyte/pull/9793) | Support PEM format for ssh-tunnel keys |
356+
| 0.35.4-alpha | 0.1.63 | 2021-01-07 | [\#9301](https://github.com/airbytehq/airbyte/pull/9301) | Fix Snowflake prefix tables starting with numbers |
357+
| | 0.1.62 | 2021-01-07 | [\#9340](https://github.com/airbytehq/airbyte/pull/9340) | Use TCP-port support for clickhouse |
358+
| | 0.1.62 | 2021-01-07 | [\#9063](https://github.com/airbytehq/airbyte/pull/9063) | Change Snowflake-specific materialization settings |
359+
| | 0.1.62 | 2021-01-07 | [\#9317](https://github.com/airbytehq/airbyte/pull/9317) | Fix issue with quoted & case sensitive columns |
360+
| | 0.1.62 | 2021-01-07 | [\#9281](https://github.com/airbytehq/airbyte/pull/9281) | Fix SCD partition by float columns in BigQuery |
361+
| 0.32.11-alpha | 0.1.61 | 2021-12-02 | [\#8394](https://github.com/airbytehq/airbyte/pull/8394) | Fix incremental queries not updating empty tables |
362+
| | 0.1.61 | 2021-12-01 | [\#8378](https://github.com/airbytehq/airbyte/pull/8378) | Fix un-nesting queries and add proper ref hints |
363+
| 0.32.5-alpha | 0.1.60 | 2021-11-22 | [\#8088](https://github.com/airbytehq/airbyte/pull/8088) | Speed-up incremental queries for SCD table on Snowflake |
364+
| 0.30.32-alpha | 0.1.59 | 2021-11-08 | [\#7669](https://github.com/airbytehq/airbyte/pull/7169) | Fix nested incremental dbt |
365+
| 0.30.24-alpha | 0.1.57 | 2021-10-26 | [\#7162](https://github.com/airbytehq/airbyte/pull/7162) | Implement incremental dbt updates |
366+
| 0.30.16-alpha | 0.1.52 | 2021-10-07 | [\#6379](https://github.com/airbytehq/airbyte/pull/6379) | Handle empty string for date and date-time format |
367+
| | 0.1.51 | 2021-10-08 | [\#6799](https://github.com/airbytehq/airbyte/pull/6799) | Added support for ad\_cdc\_log\_pos while normalization |
368+
| | 0.1.50 | 2021-10-07 | [\#6079](https://github.com/airbytehq/airbyte/pull/6079) | Added support for MS SQL Server normalization |
369+
| | 0.1.49 | 2021-10-06 | [\#6709](https://github.com/airbytehq/airbyte/pull/6709) | Forward destination dataset location to dbt profiles |
370+
| 0.29.17-alpha | 0.1.47 | 2021-09-20 | [\#6317](https://github.com/airbytehq/airbyte/pull/6317) | MySQL: updated MySQL normalization with using SSH tunnel |
371+
| | 0.1.45 | 2021-09-18 | [\#6052](https://github.com/airbytehq/airbyte/pull/6052) | Snowflake: accept any date-time format |
372+
| 0.29.8-alpha | 0.1.40 | 2021-08-18 | [\#5433](https://github.com/airbytehq/airbyte/pull/5433) | Allow optional credentials\_json for BigQuery |
373+
| 0.29.5-alpha | 0.1.39 | 2021-08-11 | [\#4557](https://github.com/airbytehq/airbyte/pull/4557) | Handle date times and solve conflict name btw stream/field |
374+
| 0.28.2-alpha | 0.1.38 | 2021-07-28 | [\#5027](https://github.com/airbytehq/airbyte/pull/5027) | Handle quotes in column names when parsing JSON blob |
375+
| 0.27.5-alpha | 0.1.37 | 2021-07-22 | [\#3947](https://github.com/airbytehq/airbyte/pull/4881/) | Handle `NULL` cursor field values when deduping |
376+
| 0.27.2-alpha | 0.1.36 | 2021-07-09 | [\#3947](https://github.com/airbytehq/airbyte/pull/4163/) | Enable normalization for MySQL destination |
376377

0 commit comments

Comments
 (0)