
Commit 18eb9e8

refactor(bigquery): update code samples of load table autodetect and truncate (#28)
Co-authored-by: Peter Lamut <plamut@users.noreply.github.com>
1 parent da40b62 commit 18eb9e8

16 files changed (+571, -168 lines)

docs/snippets.py

Lines changed: 0 additions & 168 deletions
@@ -25,7 +25,6 @@
 import time
 
 import pytest
-import six
 
 try:
     import fastparquet
@@ -585,173 +584,6 @@ def test_manage_views(client, to_delete):
     # [END bigquery_grant_view_access]
 
 
-def test_load_table_from_uri_autodetect(client, to_delete, capsys):
-    """Load table from a GCS URI using various formats and auto-detected schema
-    Each file format has its own tested load from URI sample. Because most of
-    the code is common for autodetect, append, and truncate, this sample
-    includes snippets for all supported formats but only calls a single load
-    job.
-    This code snippet is made up of shared code, then format-specific code,
-    followed by more shared code. Note that only the last format in the
-    format-specific code section will be tested in this test.
-    """
-    dataset_id = "load_table_from_uri_auto_{}".format(_millis())
-    project = client.project
-    dataset_ref = bigquery.DatasetReference(project, dataset_id)
-    dataset = bigquery.Dataset(dataset_ref)
-    client.create_dataset(dataset)
-    to_delete.append(dataset)
-
-    # Shared code
-    # [START bigquery_load_table_gcs_csv_autodetect]
-    # [START bigquery_load_table_gcs_json_autodetect]
-    # from google.cloud import bigquery
-    # client = bigquery.Client()
-    # dataset_id = 'my_dataset'
-
-    dataset_ref = bigquery.DatasetReference(project, dataset_id)
-    job_config = bigquery.LoadJobConfig()
-    job_config.autodetect = True
-    # [END bigquery_load_table_gcs_csv_autodetect]
-    # [END bigquery_load_table_gcs_json_autodetect]
-
-    # Format-specific code
-    # [START bigquery_load_table_gcs_csv_autodetect]
-    job_config.skip_leading_rows = 1
-    # The source format defaults to CSV, so the line below is optional.
-    job_config.source_format = bigquery.SourceFormat.CSV
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
-    # [END bigquery_load_table_gcs_csv_autodetect]
-    # unset csv-specific attribute
-    del job_config._properties["load"]["skipLeadingRows"]
-
-    # [START bigquery_load_table_gcs_json_autodetect]
-    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
-    # [END bigquery_load_table_gcs_json_autodetect]
-
-    # Shared code
-    # [START bigquery_load_table_gcs_csv_autodetect]
-    # [START bigquery_load_table_gcs_json_autodetect]
-    load_job = client.load_table_from_uri(
-        uri, dataset_ref.table("us_states"), job_config=job_config
-    )  # API request
-    print("Starting job {}".format(load_job.job_id))
-
-    load_job.result()  # Waits for table load to complete.
-    print("Job finished.")
-
-    destination_table = client.get_table(dataset_ref.table("us_states"))
-    print("Loaded {} rows.".format(destination_table.num_rows))
-    # [END bigquery_load_table_gcs_csv_autodetect]
-    # [END bigquery_load_table_gcs_json_autodetect]
-
-    out, _ = capsys.readouterr()
-    assert "Loaded 50 rows." in out
-
-
-def test_load_table_from_uri_truncate(client, to_delete, capsys):
-    """Replaces table data with data from a GCS URI using various formats
-    Each file format has its own tested load from URI sample. Because most of
-    the code is common for autodetect, append, and truncate, this sample
-    includes snippets for all supported formats but only calls a single load
-    job.
-    This code snippet is made up of shared code, then format-specific code,
-    followed by more shared code. Note that only the last format in the
-    format-specific code section will be tested in this test.
-    """
-    dataset_id = "load_table_from_uri_trunc_{}".format(_millis())
-    project = client.project
-    dataset_ref = bigquery.DatasetReference(project, dataset_id)
-    dataset = bigquery.Dataset(dataset_ref)
-    client.create_dataset(dataset)
-    to_delete.append(dataset)
-
-    job_config = bigquery.LoadJobConfig()
-    job_config.schema = [
-        bigquery.SchemaField("name", "STRING"),
-        bigquery.SchemaField("post_abbr", "STRING"),
-    ]
-    table_ref = dataset.table("us_states")
-    body = six.BytesIO(b"Washington,WA")
-    client.load_table_from_file(body, table_ref, job_config=job_config).result()
-    previous_rows = client.get_table(table_ref).num_rows
-    assert previous_rows > 0
-
-    # Shared code
-    # [START bigquery_load_table_gcs_avro_truncate]
-    # [START bigquery_load_table_gcs_csv_truncate]
-    # [START bigquery_load_table_gcs_json_truncate]
-    # [START bigquery_load_table_gcs_parquet_truncate]
-    # [START bigquery_load_table_gcs_orc_truncate]
-    # from google.cloud import bigquery
-    # client = bigquery.Client()
-    # table_ref = client.dataset('my_dataset').table('existing_table')
-
-    job_config = bigquery.LoadJobConfig()
-    job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
-    # [END bigquery_load_table_gcs_avro_truncate]
-    # [END bigquery_load_table_gcs_csv_truncate]
-    # [END bigquery_load_table_gcs_json_truncate]
-    # [END bigquery_load_table_gcs_parquet_truncate]
-    # [END bigquery_load_table_gcs_orc_truncate]
-
-    # Format-specific code
-    # [START bigquery_load_table_gcs_avro_truncate]
-    job_config.source_format = bigquery.SourceFormat.AVRO
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"
-    # [END bigquery_load_table_gcs_avro_truncate]
-
-    # [START bigquery_load_table_gcs_csv_truncate]
-    job_config.skip_leading_rows = 1
-    # The source format defaults to CSV, so the line below is optional.
-    job_config.source_format = bigquery.SourceFormat.CSV
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
-    # [END bigquery_load_table_gcs_csv_truncate]
-    # unset csv-specific attribute
-    del job_config._properties["load"]["skipLeadingRows"]
-
-    # [START bigquery_load_table_gcs_json_truncate]
-    job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
-    # [END bigquery_load_table_gcs_json_truncate]
-
-    # [START bigquery_load_table_gcs_parquet_truncate]
-    job_config.source_format = bigquery.SourceFormat.PARQUET
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"
-    # [END bigquery_load_table_gcs_parquet_truncate]
-
-    # [START bigquery_load_table_gcs_orc_truncate]
-    job_config.source_format = bigquery.SourceFormat.ORC
-    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.orc"
-    # [END bigquery_load_table_gcs_orc_truncate]
-
-    # Shared code
-    # [START bigquery_load_table_gcs_avro_truncate]
-    # [START bigquery_load_table_gcs_csv_truncate]
-    # [START bigquery_load_table_gcs_json_truncate]
-    # [START bigquery_load_table_gcs_parquet_truncate]
-    # [START bigquery_load_table_gcs_orc_truncate]
-    load_job = client.load_table_from_uri(
-        uri, table_ref, job_config=job_config
-    )  # API request
-    print("Starting job {}".format(load_job.job_id))
-
-    load_job.result()  # Waits for table load to complete.
-    print("Job finished.")
-
-    destination_table = client.get_table(table_ref)
-    print("Loaded {} rows.".format(destination_table.num_rows))
-    # [END bigquery_load_table_gcs_avro_truncate]
-    # [END bigquery_load_table_gcs_csv_truncate]
-    # [END bigquery_load_table_gcs_json_truncate]
-    # [END bigquery_load_table_gcs_parquet_truncate]
-    # [END bigquery_load_table_gcs_orc_truncate]
-
-    out, _ = capsys.readouterr()
-    assert "Loaded 50 rows." in out
-
-
 def test_load_table_add_column(client, to_delete):
     dataset_id = "load_table_add_column_{}".format(_millis())
     project = client.project
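
Note on the refactor: the deleted tests above shared one mutable LoadJobConfig across formats, which forced the `del job_config._properties["load"]["skipLeadingRows"]` hack to unset the CSV-only option before reusing the config for JSON. The standalone samples introduced below build a fresh config per format instead. A minimal sketch of that pattern, assuming a placeholder table ID (not taken from the commit):

from google.cloud import bigquery

client = bigquery.Client()
table_id = "your-project.your_dataset.us_states"  # placeholder, not from the commit

# One LoadJobConfig per source format; no cleanup of CSV-only
# attributes such as skip_leading_rows is needed between formats.
csv_config = bigquery.LoadJobConfig(
    autodetect=True,
    skip_leading_rows=1,
    source_format=bigquery.SourceFormat.CSV,
)
json_config = bigquery.LoadJobConfig(
    autodetect=True,
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)
client.load_table_from_uri(
    "gs://cloud-samples-data/bigquery/us-states/us-states.csv",
    table_id,
    job_config=csv_config,
).result()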

docs/usage/tables.rst

Lines changed: 59 additions & 0 deletions
@@ -132,6 +132,22 @@ Load an ORC file from Cloud Storage:
 See also: `Loading ORC data from Cloud Storage
 <https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc>`_.
 
+Load a CSV file from Cloud Storage and auto-detect schema:
+
+.. literalinclude:: ../samples/load_table_uri_autodetect_csv.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_csv_autodetect]
+   :end-before: [END bigquery_load_table_gcs_csv_autodetect]
+
+Load a JSON file from Cloud Storage and auto-detect schema:
+
+.. literalinclude:: ../samples/load_table_uri_autodetect_json.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_json_autodetect]
+   :end-before: [END bigquery_load_table_gcs_json_autodetect]
+
 Updating a Table
 ^^^^^^^^^^^^^^^^
 
@@ -220,3 +236,46 @@ Restore a deleted table from a snapshot by using the
    :dedent: 4
    :start-after: [START bigquery_undelete_table]
    :end-before: [END bigquery_undelete_table]
+
+Overwrite a Table
+^^^^^^^^^^^^^^^^^
+
+Replace the table data with an Avro file from Cloud Storage:
+
+.. literalinclude:: ../samples/load_table_uri_truncate_avro.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_avro_truncate]
+   :end-before: [END bigquery_load_table_gcs_avro_truncate]
+
+Replace the table data with a CSV file from Cloud Storage:
+
+.. literalinclude:: ../samples/load_table_uri_truncate_csv.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_csv_truncate]
+   :end-before: [END bigquery_load_table_gcs_csv_truncate]
+
+Replace the table data with a JSON file from Cloud Storage:
+
+.. literalinclude:: ../samples/load_table_uri_truncate_json.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_json_truncate]
+   :end-before: [END bigquery_load_table_gcs_json_truncate]
+
+Replace the table data with an ORC file from Cloud Storage:
+
+.. literalinclude:: ../samples/load_table_uri_truncate_orc.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_orc_truncate]
+   :end-before: [END bigquery_load_table_gcs_orc_truncate]
+
+Replace the table data with a Parquet file from Cloud Storage:
+
+.. literalinclude:: ../samples/load_table_uri_truncate_parquet.py
+   :language: python
+   :dedent: 4
+   :start-after: [START bigquery_load_table_gcs_parquet_truncate]
+   :end-before: [END bigquery_load_table_gcs_parquet_truncate]
samples/load_table_uri_autodetect_csv.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def load_table_uri_autodetect_csv(table_id):
+
+    # [START bigquery_load_table_gcs_csv_autodetect]
+    from google.cloud import bigquery
+
+    # Construct a BigQuery client object.
+    client = bigquery.Client()
+
+    # TODO(developer): Set table_id to the ID of the table to create.
+    # table_id = "your-project.your_dataset.your_table_name"
+
+    # Set the encryption key to use for the destination.
+    # TODO: Replace this key with a key you have created in KMS.
+    # kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
+    #     "cloud-samples-tests", "us", "test", "test"
+    # )
+    job_config = bigquery.LoadJobConfig(
+        autodetect=True,
+        skip_leading_rows=1,
+        # The source format defaults to CSV, so the line below is optional.
+        source_format=bigquery.SourceFormat.CSV,
+    )
+    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
+    load_job = client.load_table_from_uri(
+        uri, table_id, job_config=job_config
+    )  # Make an API request.
+    load_job.result()  # Waits for the job to complete.
+    destination_table = client.get_table(table_id)
+    print("Loaded {} rows.".format(destination_table.num_rows))
+    # [END bigquery_load_table_gcs_csv_autodetect]
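
A hypothetical invocation of this sample; the fully qualified table ID is a placeholder and the destination dataset is assumed to already exist:

# Placeholder table ID; the dataset must already exist.
load_table_uri_autodetect_csv("your-project.your_dataset.us_states")
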
samples/load_table_uri_autodetect_json.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def load_table_uri_autodetect_json(table_id):
+
+    # [START bigquery_load_table_gcs_json_autodetect]
+    from google.cloud import bigquery
+
+    # Construct a BigQuery client object.
+    client = bigquery.Client()
+
+    # TODO(developer): Set table_id to the ID of the table to create.
+    # table_id = "your-project.your_dataset.your_table_name"
+
+    # Set the encryption key to use for the destination.
+    # TODO: Replace this key with a key you have created in KMS.
+    # kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
+    #     "cloud-samples-tests", "us", "test", "test"
+    # )
+    job_config = bigquery.LoadJobConfig(
+        autodetect=True, source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
+    )
+    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
+    load_job = client.load_table_from_uri(
+        uri, table_id, job_config=job_config
+    )  # Make an API request.
+    load_job.result()  # Waits for the job to complete.
+    destination_table = client.get_table(table_id)
+    print("Loaded {} rows.".format(destination_table.num_rows))
+    # [END bigquery_load_table_gcs_json_autodetect]
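
Because the schema here is inferred rather than declared, a caller might want to inspect what autodetect produced. A small follow-up sketch using the public get_table API, where `client` and `table_id` are the names from the sample above:

# Hypothetical check after the load job completes.
table = client.get_table(table_id)
for field in table.schema:
    print("{}: {}".format(field.name, field.field_type))
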
samples/load_table_uri_truncate_avro.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def load_table_uri_truncate_avro(table_id):
+
+    # [START bigquery_load_table_gcs_avro_truncate]
+    import six
+
+    from google.cloud import bigquery
+
+    # Construct a BigQuery client object.
+    client = bigquery.Client()
+
+    # TODO(developer): Set table_id to the ID of the table to create.
+    # table_id = "your-project.your_dataset.your_table_name"
+
+    job_config = bigquery.LoadJobConfig(
+        schema=[
+            bigquery.SchemaField("name", "STRING"),
+            bigquery.SchemaField("post_abbr", "STRING"),
+        ],
+    )
+
+    body = six.BytesIO(b"Washington,WA")
+    client.load_table_from_file(body, table_id, job_config=job_config).result()
+    previous_rows = client.get_table(table_id).num_rows
+    assert previous_rows > 0
+
+    job_config = bigquery.LoadJobConfig(
+        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
+        source_format=bigquery.SourceFormat.AVRO,
+    )
+
+    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"
+    load_job = client.load_table_from_uri(
+        uri, table_id, job_config=job_config
+    )  # Make an API request.
+
+    load_job.result()  # Waits for the job to complete.
+
+    destination_table = client.get_table(table_id)
+    print("Loaded {} rows.".format(destination_table.num_rows))
+    # [END bigquery_load_table_gcs_avro_truncate]
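
The commit's other truncate samples (CSV, JSON, ORC, and Parquet, all referenced from docs/usage/tables.rst above) are not shown in this excerpt. Based on the Avro sample and the format-specific lines of the deleted snippet, the CSV variant presumably follows the same shape; a sketch under that assumption, not the verbatim committed file:

def load_table_uri_truncate_csv(table_id):
    # Inferred from the Avro sample above; not the committed source.
    from google.cloud import bigquery

    client = bigquery.Client()
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
        skip_leading_rows=1,
        source_format=bigquery.SourceFormat.CSV,
    )
    uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
    load_job = client.load_table_from_uri(
        uri, table_id, job_config=job_config
    )  # Make an API request.
    load_job.result()  # Waits for the job to complete.
    destination_table = client.get_table(table_id)
    print("Loaded {} rows.".format(destination_table.num_rows))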
