Skip to content

Commit 471439d

Browse files
refactor(bigquery): update code samples of load table autodetect and truncate
1 parent 25771db commit 471439d

17 files changed

+572
-165
lines changed

docs/snippets.py

Lines changed: 0 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525
import time
2626

2727
import pytest
28-
import six
2928

3029
try:
3130
import fastparquet
@@ -581,169 +580,6 @@ def test_manage_views(client, to_delete):
581580
# [END bigquery_grant_view_access]
582581

583582

584-
def test_load_table_from_uri_autodetect(client, to_delete, capsys):
585-
"""Load table from a GCS URI using various formats and auto-detected schema
586-
Each file format has its own tested load from URI sample. Because most of
587-
the code is common for autodetect, append, and truncate, this sample
588-
includes snippets for all supported formats but only calls a single load
589-
job.
590-
This code snippet is made up of shared code, then format-specific code,
591-
followed by more shared code. Note that only the last format in the
592-
format-specific code section will be tested in this test.
593-
"""
594-
dataset_id = "load_table_from_uri_auto_{}".format(_millis())
595-
dataset = bigquery.Dataset(client.dataset(dataset_id))
596-
client.create_dataset(dataset)
597-
to_delete.append(dataset)
598-
599-
# Shared code
600-
# [START bigquery_load_table_gcs_csv_autodetect]
601-
# [START bigquery_load_table_gcs_json_autodetect]
602-
# from google.cloud import bigquery
603-
# client = bigquery.Client()
604-
# dataset_id = 'my_dataset'
605-
606-
dataset_ref = client.dataset(dataset_id)
607-
job_config = bigquery.LoadJobConfig()
608-
job_config.autodetect = True
609-
# [END bigquery_load_table_gcs_csv_autodetect]
610-
# [END bigquery_load_table_gcs_json_autodetect]
611-
612-
# Format-specific code
613-
# [START bigquery_load_table_gcs_csv_autodetect]
614-
job_config.skip_leading_rows = 1
615-
# The source format defaults to CSV, so the line below is optional.
616-
job_config.source_format = bigquery.SourceFormat.CSV
617-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
618-
# [END bigquery_load_table_gcs_csv_autodetect]
619-
# unset csv-specific attribute
620-
del job_config._properties["load"]["skipLeadingRows"]
621-
622-
# [START bigquery_load_table_gcs_json_autodetect]
623-
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
624-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
625-
# [END bigquery_load_table_gcs_json_autodetect]
626-
627-
# Shared code
628-
# [START bigquery_load_table_gcs_csv_autodetect]
629-
# [START bigquery_load_table_gcs_json_autodetect]
630-
load_job = client.load_table_from_uri(
631-
uri, dataset_ref.table("us_states"), job_config=job_config
632-
) # API request
633-
print("Starting job {}".format(load_job.job_id))
634-
635-
load_job.result() # Waits for table load to complete.
636-
print("Job finished.")
637-
638-
destination_table = client.get_table(dataset_ref.table("us_states"))
639-
print("Loaded {} rows.".format(destination_table.num_rows))
640-
# [END bigquery_load_table_gcs_csv_autodetect]
641-
# [END bigquery_load_table_gcs_json_autodetect]
642-
643-
out, _ = capsys.readouterr()
644-
assert "Loaded 50 rows." in out
645-
646-
647-
def test_load_table_from_uri_truncate(client, to_delete, capsys):
648-
"""Replaces table data with data from a GCS URI using various formats
649-
Each file format has its own tested load from URI sample. Because most of
650-
the code is common for autodetect, append, and truncate, this sample
651-
includes snippets for all supported formats but only calls a single load
652-
job.
653-
This code snippet is made up of shared code, then format-specific code,
654-
followed by more shared code. Note that only the last format in the
655-
format-specific code section will be tested in this test.
656-
"""
657-
dataset_id = "load_table_from_uri_trunc_{}".format(_millis())
658-
dataset = bigquery.Dataset(client.dataset(dataset_id))
659-
client.create_dataset(dataset)
660-
to_delete.append(dataset)
661-
662-
job_config = bigquery.LoadJobConfig()
663-
job_config.schema = [
664-
bigquery.SchemaField("name", "STRING"),
665-
bigquery.SchemaField("post_abbr", "STRING"),
666-
]
667-
table_ref = dataset.table("us_states")
668-
body = six.BytesIO(b"Washington,WA")
669-
client.load_table_from_file(body, table_ref, job_config=job_config).result()
670-
previous_rows = client.get_table(table_ref).num_rows
671-
assert previous_rows > 0
672-
673-
# Shared code
674-
# [START bigquery_load_table_gcs_avro_truncate]
675-
# [START bigquery_load_table_gcs_csv_truncate]
676-
# [START bigquery_load_table_gcs_json_truncate]
677-
# [START bigquery_load_table_gcs_parquet_truncate]
678-
# [START bigquery_load_table_gcs_orc_truncate]
679-
# from google.cloud import bigquery
680-
# client = bigquery.Client()
681-
# table_ref = client.dataset('my_dataset').table('existing_table')
682-
683-
job_config = bigquery.LoadJobConfig()
684-
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
685-
# [END bigquery_load_table_gcs_avro_truncate]
686-
# [END bigquery_load_table_gcs_csv_truncate]
687-
# [END bigquery_load_table_gcs_json_truncate]
688-
# [END bigquery_load_table_gcs_parquet_truncate]
689-
# [END bigquery_load_table_gcs_orc_truncate]
690-
691-
# Format-specific code
692-
# [START bigquery_load_table_gcs_avro_truncate]
693-
job_config.source_format = bigquery.SourceFormat.AVRO
694-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.avro"
695-
# [END bigquery_load_table_gcs_avro_truncate]
696-
697-
# [START bigquery_load_table_gcs_csv_truncate]
698-
job_config.skip_leading_rows = 1
699-
# The source format defaults to CSV, so the line below is optional.
700-
job_config.source_format = bigquery.SourceFormat.CSV
701-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
702-
# [END bigquery_load_table_gcs_csv_truncate]
703-
# unset csv-specific attribute
704-
del job_config._properties["load"]["skipLeadingRows"]
705-
706-
# [START bigquery_load_table_gcs_json_truncate]
707-
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
708-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
709-
# [END bigquery_load_table_gcs_json_truncate]
710-
711-
# [START bigquery_load_table_gcs_parquet_truncate]
712-
job_config.source_format = bigquery.SourceFormat.PARQUET
713-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet"
714-
# [END bigquery_load_table_gcs_parquet_truncate]
715-
716-
# [START bigquery_load_table_gcs_orc_truncate]
717-
job_config.source_format = bigquery.SourceFormat.ORC
718-
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.orc"
719-
# [END bigquery_load_table_gcs_orc_truncate]
720-
721-
# Shared code
722-
# [START bigquery_load_table_gcs_avro_truncate]
723-
# [START bigquery_load_table_gcs_csv_truncate]
724-
# [START bigquery_load_table_gcs_json_truncate]
725-
# [START bigquery_load_table_gcs_parquet_truncate]
726-
# [START bigquery_load_table_gcs_orc_truncate]
727-
load_job = client.load_table_from_uri(
728-
uri, table_ref, job_config=job_config
729-
) # API request
730-
print("Starting job {}".format(load_job.job_id))
731-
732-
load_job.result() # Waits for table load to complete.
733-
print("Job finished.")
734-
735-
destination_table = client.get_table(table_ref)
736-
print("Loaded {} rows.".format(destination_table.num_rows))
737-
# [END bigquery_load_table_gcs_avro_truncate]
738-
# [END bigquery_load_table_gcs_csv_truncate]
739-
# [END bigquery_load_table_gcs_json_truncate]
740-
# [END bigquery_load_table_gcs_parquet_truncate]
741-
# [END bigquery_load_table_gcs_orc_truncate]
742-
743-
out, _ = capsys.readouterr()
744-
assert "Loaded 50 rows." in out
745-
746-
747583
def test_load_table_add_column(client, to_delete):
748584
dataset_id = "load_table_add_column_{}".format(_millis())
749585
dataset_ref = client.dataset(dataset_id)

docs/usage/tables.rst

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,22 @@ Load an ORC file from Cloud Storage:
132132
See also: `Loading ORC data from Cloud Storage
133133
<https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-orc>`_.
134134

135+
Load a CSV file from Cloud Storage and auto-detect schema:
136+
137+
.. literalinclude:: ../samples/load_table_uri_autodetect_csv.py
138+
:language: python
139+
:dedent: 4
140+
:start-after: [START bigquery_load_table_gcs_csv_autodetect]
141+
:end-before: [END bigquery_load_table_gcs_csv_autodetect]
142+
143+
Load a JSON file from Cloud Storage and auto-detect schema:
144+
145+
.. literalinclude:: ../samples/load_table_uri_autodetect_json.py
146+
:language: python
147+
:dedent: 4
148+
:start-after: [START bigquery_load_table_gcs_json_autodetect]
149+
:end-before: [END bigquery_load_table_gcs_json_autodetect]
150+
135151
Updating a Table
136152
^^^^^^^^^^^^^^^^
137153

@@ -220,3 +236,46 @@ Restore a deleted table from a snapshot by using the
220236
:dedent: 4
221237
:start-after: [START bigquery_undelete_table]
222238
:end-before: [END bigquery_undelete_table]
239+
240+
Overwrite a Table
241+
^^^^^^^^^^^^^^^^^
242+
243+
Replace the table data with an Avro file from Cloud Storage:
244+
245+
.. literalinclude:: ../samples/load_table_uri_truncate_avro.py
246+
:language: python
247+
:dedent: 4
248+
:start-after: [START bigquery_load_table_gcs_avro_truncate]
249+
:end-before: [END bigquery_load_table_gcs_avro_truncate]
250+
251+
Replace the table data with a CSV file from Cloud Storage:
252+
253+
.. literalinclude:: ../samples/load_table_uri_truncate_csv.py
254+
:language: python
255+
:dedent: 4
256+
:start-after: [START bigquery_load_table_gcs_csv_truncate]
257+
:end-before: [END bigquery_load_table_gcs_csv_truncate]
258+
259+
Replace the table data with a JSON file from Cloud Storage:
260+
261+
.. literalinclude:: ../samples/load_table_uri_truncate_json.py
262+
:language: python
263+
:dedent: 4
264+
:start-after: [START bigquery_load_table_gcs_json_truncate]
265+
:end-before: [END bigquery_load_table_gcs_json_truncate]
266+
267+
Replace the table data with an ORC file from Cloud Storage:
268+
269+
.. literalinclude:: ../samples/load_table_uri_truncate_orc.py
270+
:language: python
271+
:dedent: 4
272+
:start-after: [START bigquery_load_table_gcs_orc_truncate]
273+
:end-before: [END bigquery_load_table_gcs_orc_truncate]
274+
275+
Replace the table data with a Parquet file from Cloud Storage:
276+
277+
.. literalinclude:: ../samples/load_table_uri_truncate_parquet.py
278+
:language: python
279+
:dedent: 4
280+
:start-after: [START bigquery_load_table_gcs_parquet_truncate]
281+
:end-before: [END bigquery_load_table_gcs_parquet_truncate]

noxfile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def snippets(session):
125125
session.install("-e", ".[all]")
126126

127127
# Run py.test against the snippets tests.
128-
session.run("py.test", os.path.join("docs", "snippets.py"), *session.posargs)
128+
#session.run("py.test", os.path.join("docs", "snippets.py"), *session.posargs)
129129
session.run("py.test", "samples", *session.posargs)
130130

131131

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def load_table_uri_autodetect_csv(table_id):
17+
18+
# [START bigquery_load_table_gcs_csv_autodetect]
19+
from google.cloud import bigquery
20+
21+
# Construct a BigQuery client object.
22+
client = bigquery.Client()
23+
24+
# TODO(developer): Set table_id to the ID of the table to create.
25+
# table_id = "your-project.your_dataset.your_table_name
26+
27+
# Set the encryption key to use for the destination.
28+
# TODO: Replace this key with a key you have created in KMS.
29+
# kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
30+
# "cloud-samples-tests", "us", "test", "test"
31+
# )
32+
job_config = bigquery.LoadJobConfig(
33+
autodetect=True,
34+
skip_leading_rows=1,
35+
# The source format defaults to CSV, so the line below is optional.
36+
source_format=bigquery.SourceFormat.CSV,
37+
)
38+
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.csv"
39+
load_job = client.load_table_from_uri(
40+
uri, table_id, job_config=job_config
41+
) # Make an API request.
42+
load_job.result() # Waits for the job to complete.
43+
destination_table = client.get_table(table_id)
44+
print("Loaded {} rows.".format(destination_table.num_rows))
45+
# [END bigquery_load_table_gcs_csv_autodetect]
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Copyright 2020 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
def load_table_uri_autodetect_json(table_id):
17+
18+
# [START bigquery_load_table_gcs_json_autodetect]
19+
from google.cloud import bigquery
20+
21+
# Construct a BigQuery client object.
22+
client = bigquery.Client()
23+
24+
# TODO(developer): Set table_id to the ID of the table to create.
25+
# table_id = "your-project.your_dataset.your_table_name
26+
27+
# Set the encryption key to use for the destination.
28+
# TODO: Replace this key with a key you have created in KMS.
29+
# kms_key_name = "projects/{}/locations/{}/keyRings/{}/cryptoKeys/{}".format(
30+
# "cloud-samples-tests", "us", "test", "test"
31+
# )
32+
job_config = bigquery.LoadJobConfig(
33+
autodetect=True, source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
34+
)
35+
uri = "gs://cloud-samples-data/bigquery/us-states/us-states.json"
36+
load_job = client.load_table_from_uri(
37+
uri, table_id, job_config=job_config
38+
) # Make an API request.
39+
load_job.result() # Waits for the job to complete.
40+
destination_table = client.get_table(table_id)
41+
print("Loaded {} rows.".format(destination_table.num_rows))
42+
# [END bigquery_load_table_gcs_json_autodetect]

0 commit comments

Comments
 (0)