Skip to content

Commit 81bfb5c

Browse files
Source file: fix csv schema discovery (#15870)
* #174 source file: fix csv schema discovery
* #174 source file: upd changelog
* auto-bump connector version [ci skip]

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
1 parent 6b34451 commit 81bfb5c

File tree

7 files changed

+51
-8
lines changed

7 files changed

+51
-8
lines changed

airbyte-config/init/src/main/resources/seed/source_definitions.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@
279279
- name: File
280280
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
281281
dockerRepository: airbyte/source-file
282-
dockerImageTag: 0.2.19
282+
dockerImageTag: 0.2.20
283283
documentationUrl: https://docs.airbyte.io/integrations/sources/file
284284
icon: file.svg
285285
sourceType: file

airbyte-config/init/src/main/resources/seed/source_specs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2291,7 +2291,7 @@
22912291
supportsNormalization: false
22922292
supportsDBT: false
22932293
supported_destination_sync_modes: []
2294-
- dockerImage: "airbyte/source-file:0.2.19"
2294+
- dockerImage: "airbyte/source-file:0.2.20"
22952295
spec:
22962296
documentationUrl: "https://docs.airbyte.io/integrations/sources/file"
22972297
connectionSpecification:

airbyte-integrations/connectors/source-file/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ COPY source_file ./source_file
1717
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
1818
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
1919

20-
LABEL io.airbyte.version=0.2.19
20+
LABEL io.airbyte.version=0.2.20
2121
LABEL io.airbyte.name=airbyte/source-file

airbyte-integrations/connectors/source-file/integration_tests/file_formats_test.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55

66
from pathlib import Path
7+
from unittest.mock import patch
78

89
import pytest
910
from airbyte_cdk import AirbyteLogger
@@ -56,3 +57,29 @@ def run_load_nested_json_schema(config, expected_columns=10, expected_rows=42):
5657
df = data_list[0]
5758
assert len(df) == expected_rows # DataFrame should have 42 items
5859
return df
60+
61+
62+
# https://github.com/airbytehq/alpha-beta-issues/issues/174
63+
# this is to ensure we make all conditions under which the bug is reproduced, i.e.
64+
# - chunk size < file size
65+
# - column type in the last chunk is not `string`
66+
@patch("source_file.client.Client.CSV_CHUNK_SIZE", 1)
67+
def test_csv_schema():
68+
source = SourceFile()
69+
file_path = str(SAMPLE_DIRECTORY.parent.joinpath("discover.csv"))
70+
config = {"dataset_name": "test", "format": "csv", "url": file_path, "provider": {"storage": "local"}}
71+
catalog = source.discover(logger=AirbyteLogger(), config=config).dict()
72+
assert len(catalog["streams"]) == 1
73+
schema = catalog["streams"][0]["json_schema"]
74+
assert schema == {
75+
"$schema": "http://json-schema.org/draft-07/schema#",
76+
"properties": {
77+
"Address": {"type": ["string", "null"]},
78+
"City": {"type": ["string", "null"]},
79+
"First Name": {"type": ["string", "null"]},
80+
"Last Name": {"type": ["string", "null"]},
81+
"State": {"type": ["string", "null"]},
82+
"zip_code": {"type": ["string", "null"]},
83+
},
84+
"type": "object",
85+
}
airbyte-integrations/connectors/source-file/integration_tests/discover.csv (new file; name inferred from the `discover.csv` path used in `test_csv_schema`)

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
First Name,Last Name,Address,City,State,zip_code
2+
John,Doe,120 jefferson st.,Riverside, NJ,8075
3+
Jack,McGinnis,220 hobo Av.,Phila, PA,9119
4+
"John ""Da Man""",Repici,120 Jefferson St.,Riverside, NJ,8075
5+
Stephen,Tyler,"7452 Terrace ""At the Plaza"" road",SomeTown,SD,91234
6+
,Blankman,,SomeTown, SD,unknown
7+
"Joan ""the bone"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,3333

airbyte-integrations/connectors/source-file/source_file/client.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ def _open_azblob_url(self):
226226
class Client:
227227
"""Class that manages reading and parsing data from streams"""
228228

229+
CSV_CHUNK_SIZE = 10_000
229230
reader_class = URLFile
230231
binary_formats = {"excel", "feather", "parquet", "orc", "pickle"}
231232

@@ -313,7 +314,7 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable:
313314

314315
reader_options = {**self._reader_options}
315316
if self._reader_format == "csv":
316-
reader_options["chunksize"] = 10000
317+
reader_options["chunksize"] = self.CSV_CHUNK_SIZE
317318
if skip_data:
318319
reader_options["nrows"] = 0
319320
reader_options["index_col"] = 0
@@ -323,17 +324,22 @@ def load_dataframes(self, fp, skip_data=False) -> Iterable:
323324
yield reader(fp, **reader_options)
324325

325326
@staticmethod
326-
def dtype_to_json_type(dtype) -> str:
327+
def dtype_to_json_type(current_type: str, dtype) -> str:
327328
"""Convert Pandas Dataframe types to Airbyte Types.
328329
330+
:param current_type: str - one of the following types based on previous dataframes
329331
:param dtype: Pandas Dataframe type
330332
:return: Corresponding Airbyte Type
331333
"""
334+
number_types = ("int64", "float64")
335+
if current_type == "string":
336+
# previous column values was of the string type, no sense to look further
337+
return current_type
332338
if dtype == object:
333339
return "string"
334-
elif dtype in ("int64", "float64"):
340+
if dtype in number_types and (not current_type or current_type in number_types):
335341
return "number"
336-
elif dtype == "bool":
342+
if dtype == "bool" and (not current_type or current_type == "boolean"):
337343
return "boolean"
338344
return "string"
339345

@@ -379,7 +385,9 @@ def _stream_properties(self, fp):
379385
fields = {}
380386
for df in df_list:
381387
for col in df.columns:
382-
fields[col] = self.dtype_to_json_type(df[col].dtype)
388+
# if data type of the same column differs in dataframes, we choose the broadest one
389+
prev_frame_column_type = fields.get(col)
390+
fields[col] = self.dtype_to_json_type(prev_frame_column_type, df[col].dtype)
383391
return {field: {"type": [fields[field], "null"]} for field in fields}
384392

385393
@property

docs/integrations/sources/file.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ In order to read large files from a remote location, this connector uses the [sm
127127

128128
| Version | Date | Pull Request | Subject |
129129
|---------|------------|----------------------------------------------------------|---------------------------------------------------|
130+
| 0.2.20 | 2022-08-23 | [15870](https://github.com/airbytehq/airbyte/pull/15870) | Fix CSV schema discovery |
130131
| 0.2.19 | 2022-08-19 | [15768](https://github.com/airbytehq/airbyte/pull/15768) | Convert 'nan' to 'null' |
131132
| 0.2.18 | 2022-08-16 | [15698](https://github.com/airbytehq/airbyte/pull/15698) | Cache binary stream to file for discover |
132133
| 0.2.17 | 2022-08-11 | [15501](https://github.com/airbytehq/airbyte/pull/15501) | Cache binary stream to file |

0 commit comments

Comments (0)