Skip to content

Commit f143c8f

Browse files
🎉 Source File - add support for custom encoding (#15293)
* added support for custom encoding
* fixed unit test for utf16
* updated docs
* bumped connector version
* auto-bump connector version [ci skip]

Co-authored-by: Octavia Squidington III <octavia-squidington-iii@users.noreply.github.com>
1 parent bbf3584 commit f143c8f

File tree

8 files changed

+74
-42
lines changed

8 files changed

+74
-42
lines changed

airbyte-config/init/src/main/resources/seed/source_definitions.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@
271271
- name: File
272272
sourceDefinitionId: 778daa7c-feaf-4db6-96f3-70fd645acc77
273273
dockerRepository: airbyte/source-file
274-
dockerImageTag: 0.2.15
274+
dockerImageTag: 0.2.16
275275
documentationUrl: https://docs.airbyte.io/integrations/sources/file
276276
icon: file.svg
277277
sourceType: file

airbyte-config/init/src/main/resources/seed/source_specs.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2255,7 +2255,7 @@
22552255
supportsNormalization: false
22562256
supportsDBT: false
22572257
supported_destination_sync_modes: []
2258-
- dockerImage: "airbyte/source-file:0.2.15"
2258+
- dockerImage: "airbyte/source-file:0.2.16"
22592259
spec:
22602260
documentationUrl: "https://docs.airbyte.io/integrations/sources/file"
22612261
connectionSpecification:

airbyte-integrations/connectors/source-file/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ COPY source_file ./source_file
1717
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
1818
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
1919

20-
LABEL io.airbyte.version=0.2.15
20+
LABEL io.airbyte.version=0.2.16
2121
LABEL io.airbyte.name=airbyte/source-file
Binary file not shown.

airbyte-integrations/connectors/source-file/source_file/client.py

Lines changed: 33 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,14 @@ class URLFile:
5454
```
5555
"""
5656

57-
def __init__(self, url: str, provider: dict):
57+
def __init__(self, url: str, provider: dict, binary=None, encoding=None):
5858
self._url = url
5959
self._provider = provider
6060
self._file = None
61+
self.args = {
62+
"mode": "rb" if binary else "r",
63+
"encoding": encoding,
64+
}
6165

6266
def __enter__(self):
6367
return self._file
@@ -74,29 +78,28 @@ def close(self):
7478
self._file.close()
7579
self._file = None
7680

77-
def open(self, binary=False):
81+
def open(self):
7882
self.close()
7983
try:
80-
self._file = self._open(binary=binary)
84+
self._file = self._open()
8185
except google.api_core.exceptions.NotFound as err:
8286
raise FileNotFoundError(self.url) from err
8387
return self
8488

85-
def _open(self, binary):
86-
mode = "rb" if binary else "r"
89+
def _open(self):
8790
storage = self.storage_scheme
8891
url = self.url
8992

9093
if storage == "gs://":
91-
return self._open_gcs_url(binary=binary)
94+
return self._open_gcs_url()
9295
elif storage == "s3://":
93-
return self._open_aws_url(binary=binary)
96+
return self._open_aws_url()
9497
elif storage == "azure://":
95-
return self._open_azblob_url(binary=binary)
98+
return self._open_azblob_url()
9699
elif storage == "webhdfs://":
97100
host = self._provider["host"]
98101
port = self._provider["port"]
99-
return smart_open.open(f"webhdfs://{host}:{port}/{url}", mode=mode)
102+
return smart_open.open(f"webhdfs://{host}:{port}/{url}", **self.args)
100103
elif storage in ("ssh://", "scp://", "sftp://"):
101104
user = self._provider["user"]
102105
host = self._provider["host"]
@@ -114,19 +117,15 @@ def _open(self, binary):
114117
uri = f"{storage}{user}:{password}@{host}:{port}/{url}"
115118
else:
116119
uri = f"{storage}{user}@{host}:{port}/{url}"
117-
return smart_open.open(uri, transport_params=transport_params, mode=mode)
120+
return smart_open.open(uri, transport_params=transport_params, **self.args)
118121
elif storage in ("https://", "http://"):
119122
transport_params = None
120123
if "user_agent" in self._provider and self._provider["user_agent"]:
121124
airbyte_version = environ.get("AIRBYTE_VERSION", "0.0")
122125
transport_params = {"headers": {"Accept-Encoding": "identity", "User-Agent": f"Airbyte/{airbyte_version}"}}
123126
logger.info(f"TransportParams: {transport_params}")
124-
return smart_open.open(
125-
self.full_url,
126-
mode=mode,
127-
transport_params=transport_params,
128-
)
129-
return smart_open.open(self.full_url, mode=mode)
127+
return smart_open.open(self.full_url, transport_params=transport_params, **self.args)
128+
return smart_open.open(self.full_url, **self.args)
130129

131130
@property
132131
def url(self) -> str:
@@ -168,8 +167,7 @@ def storage_scheme(self) -> str:
168167
logger.error(f"Unknown Storage provider in: {self.full_url}")
169168
return ""
170169

171-
def _open_gcs_url(self, binary) -> object:
172-
mode = "rb" if binary else "r"
170+
def _open_gcs_url(self) -> object:
173171
service_account_json = self._provider.get("service_account_json")
174172
credentials = None
175173
if service_account_json:
@@ -185,28 +183,27 @@ def _open_gcs_url(self, binary) -> object:
185183
client = GCSClient(credentials=credentials, project=credentials._project_id)
186184
else:
187185
client = GCSClient.create_anonymous_client()
188-
file_to_close = smart_open.open(self.full_url, transport_params=dict(client=client), mode=mode)
186+
file_to_close = smart_open.open(self.full_url, transport_params={"client": client}, **self.args)
189187

190188
return file_to_close
191189

192-
def _open_aws_url(self, binary):
193-
mode = "rb" if binary else "r"
190+
def _open_aws_url(self):
194191
aws_access_key_id = self._provider.get("aws_access_key_id")
195192
aws_secret_access_key = self._provider.get("aws_secret_access_key")
196193
use_aws_account = aws_access_key_id and aws_secret_access_key
197194

198195
if use_aws_account:
199196
aws_access_key_id = self._provider.get("aws_access_key_id", "")
200197
aws_secret_access_key = self._provider.get("aws_secret_access_key", "")
201-
result = smart_open.open(f"{self.storage_scheme}{aws_access_key_id}:{aws_secret_access_key}@{self.url}", mode=mode)
198+
url = f"{self.storage_scheme}{aws_access_key_id}:{aws_secret_access_key}@{self.url}"
199+
result = smart_open.open(url, **self.args)
202200
else:
203201
config = botocore.client.Config(signature_version=botocore.UNSIGNED)
204202
params = {"client": boto3.client("s3", config=config)}
205-
result = smart_open.open(self.full_url, transport_params=params, mode=mode)
203+
result = smart_open.open(self.full_url, transport_params=params, **self.args)
206204
return result
207205

208-
def _open_azblob_url(self, binary):
209-
mode = "rb" if binary else "r"
206+
def _open_azblob_url(self):
210207
storage_account = self._provider.get("storage_account")
211208
storage_acc_url = f"https://{storage_account}.blob.core.windows.net"
212209
sas_token = self._provider.get("sas_token", None)
@@ -220,14 +217,15 @@ def _open_azblob_url(self, binary):
220217
# assuming anonymous public read access given no credential
221218
client = BlobServiceClient(account_url=storage_acc_url)
222219

223-
result = smart_open.open(f"{self.storage_scheme}{self.url}", transport_params=dict(client=client), mode=mode)
224-
return result
220+
url = f"{self.storage_scheme}{self.url}"
221+
return smart_open.open(url, transport_params=dict(client=client), **self.args)
225222

226223

227224
class Client:
228225
"""Class that manages reading and parsing data from streams"""
229226

230227
reader_class = URLFile
228+
binary_formats = {"excel", "feather", "parquet", "orc", "pickle"}
231229

232230
def __init__(self, dataset_name: str, url: str, provider: dict, format: str = None, reader_options: str = None):
233231
self._dataset_name = dataset_name
@@ -243,6 +241,9 @@ def __init__(self, dataset_name: str, url: str, provider: dict, format: str = No
243241
logger.error(error_msg)
244242
raise ConfigurationError(error_msg) from err
245243

244+
self.binary_source = self._reader_format in self.binary_formats
245+
self.encoding = self._reader_options.get("encoding")
246+
246247
@property
247248
def stream_name(self) -> str:
248249
if self._dataset_name:
@@ -336,17 +337,12 @@ def dtype_to_json_type(dtype) -> str:
336337

337338
@property
338339
def reader(self) -> reader_class:
339-
return self.reader_class(url=self._url, provider=self._provider)
340-
341-
@property
342-
def binary_source(self):
343-
binary_formats = {"excel", "feather", "parquet", "orc", "pickle"}
344-
return self._reader_format in binary_formats
340+
return self.reader_class(url=self._url, provider=self._provider, binary=self.binary_source, encoding=self.encoding)
345341

346342
def read(self, fields: Iterable = None) -> Iterable[dict]:
347343
"""Read data from the stream"""
348-
with self.reader.open(binary=self.binary_source) as fp:
349-
if self._reader_format == "json" or self._reader_format == "jsonl":
344+
with self.reader.open() as fp:
345+
if self._reader_format in ["json", "jsonl"]:
350346
yield from self.load_nested_json(fp)
351347
elif self._reader_format == "yaml":
352348
fields = set(fields) if fields else None
@@ -376,8 +372,8 @@ def _stream_properties(self, fp):
376372
def streams(self) -> Iterable:
377373
"""Discovers available streams"""
378374
# TODO handle discovery of directories of multiple files instead
379-
with self.reader.open(binary=self.binary_source) as fp:
380-
if self._reader_format == "json" or self._reader_format == "jsonl":
375+
with self.reader.open() as fp:
376+
if self._reader_format in ["json", "jsonl"]:
381377
json_schema = self.load_nested_json_schema(fp)
382378
else:
383379
json_schema = {

airbyte-integrations/connectors/source-file/source_file/source.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ def check(self, logger, config: Mapping) -> AirbyteConnectionStatus:
8383
client = self._get_client(config)
8484
logger.info(f"Checking access to {client.reader.full_url}...")
8585
try:
86-
with client.reader.open(binary=client.binary_source):
86+
with client.reader.open():
8787
return AirbyteConnectionStatus(status=Status.SUCCEEDED)
8888
except Exception as err:
8989
reason = f"Failed to load {client.reader.full_url}: {repr(err)}\n{traceback.format_exc()}"
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#
2+
# Copyright (c) 2022 Airbyte, Inc., all rights reserved.
3+
#
4+
5+
import logging
6+
from pathlib import Path
7+
8+
from source_file.source import SourceFile
9+
10+
HERE = Path(__file__).parent.absolute()
11+
12+
13+
def test_csv_with_utf16_encoding():
14+
15+
config_local_csv_utf16 = {
16+
"dataset_name": "AAA",
17+
"format": "csv",
18+
"reader_options": '{"encoding":"utf_16"}',
19+
"url": f"{HERE}/../integration_tests/sample_files/test_utf16.csv",
20+
"provider": {"storage": "local"},
21+
}
22+
expected_schema = {
23+
"$schema": "http://json-schema.org/draft-07/schema#",
24+
"properties": {
25+
"header1": {"type": ["string", "null"]},
26+
"header2": {"type": ["number", "null"]},
27+
"header3": {"type": ["number", "null"]},
28+
"header4": {"type": ["boolean", "null"]},
29+
},
30+
"type": "object",
31+
}
32+
33+
catalog = SourceFile().discover(logger=logging.getLogger("airbyte"), config=config_local_csv_utf16)
34+
stream = next(iter(catalog.streams))
35+
assert stream.json_schema == expected_schema

docs/integrations/sources/file.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ In order to read large files from a remote location, this connector uses the [sm
126126
## Changelog
127127

128128
| Version | Date | Pull Request | Subject |
129-
|---------|------------|----------------------------------------------------------| ------------------------------------------------- |
129+
|---------|------------|----------------------------------------------------------|---------------------------------------------------|
130+
| 0.2.16 | 2022-08-10 | [15293](https://github.com/airbytehq/airbyte/pull/15293) | Added support for `encoding` reader option |
130131
| 0.2.15 | 2022-08-05 | [15269](https://github.com/airbytehq/airbyte/pull/15269) | Bump `smart-open` version to 6.0.0 |
131132
| 0.2.12 | 2022-07-12 | [14535](https://github.com/airbytehq/airbyte/pull/14535) | Fix invalid schema generation for JSON files |
132133
| 0.2.11 | 2022-07-12 | [9974](https://github.com/airbytehq/airbyte/pull/14588) | Add support to YAML format |

0 commit comments

Comments (0)