googleapis · leahecole · Oct 21, 2020 · Oct 13, 2020 · Oct 15, 2020 · Oct 15, 2020
diff --git a/samples/__init__.py b/samples/__init__.py
diff --git a/samples/snippets/__init__.py b/samples/snippets/__init__.py
diff --git a/samples/snippets/batch_process_documents_sample_v1beta3.py b/samples/snippets/batch_process_documents_sample_v1beta3.py
@@ -0,0 +1,121 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_process_document]
+import re
+
+from google.cloud import documentai_v1beta3 as documentai
+from google.cloud import storage
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
+# gcs_input_uri = "YOUR_INPUT_URI"
+# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
+# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"
+
+
+def batch_process_documents(
+ project_id,
+ location,
+ processor_id,
+ gcs_input_uri,
+ gcs_output_uri,
+ gcs_output_uri_prefix,
+):
+
+ client = documentai.DocumentProcessorServiceClient()
+
+ destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"
+
+ # 'mime_type' can be 'application/pdf', 'image/tiff',
+ # and 'image/gif', or 'application/json'
+ input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
+ gcs_source=gcs_input_uri, mime_type="application/pdf"
+ )
+
+ # Where to write results
+ output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
+ gcs_destination=destination_uri
+ )
+
+ # Location can be 'us' or 'eu'
+ name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
+ request = documentai.types.document_processor_service.BatchProcessRequest(
+ name=name,
+ input_configs=[input_config],
+ output_config=output_config,
+ )
+
+ operation = client.batch_process_documents(request)
+
+ # Wait for the operation to finish
+ operation.result()
+
+ # Results are written to GCS. Use a regex to find
+ # output files
+ match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
+ output_bucket = match.group(1)
+ prefix = match.group(2)
+
+ storage_client = storage.Client()
+ bucket = storage_client.get_bucket(output_bucket)
+ blob_list = list(bucket.list_blobs(prefix=prefix))
+ print("Output files:")
+
+ for i, blob in enumerate(blob_list):
+ # Download the contents of this blob as a bytes object.
+ blob_as_bytes = blob.download_as_bytes()
+ document = documentai.types.Document.from_json(blob_as_bytes)
+
+ print(f"Fetched file {i + 1}")
+
+ # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+ # Read the text recognition output from the processor
+ for page in document.pages:
+ for form_field in page.form_fields:
+ field_name = get_text(form_field.field_name, document)
+ field_value = get_text(form_field.field_value, document)
+ print("Extracted key value pair:")
+ print(f"\t{field_name}, {field_value}")
+ for paragraph in document.pages:
+ paragraph_text = get_text(paragraph.layout, document)
+ print(f"Paragraph text:\n{paragraph_text}")
+
+
+# Extract shards from the text field
+def get_text(doc_element: dict, document: dict):
+ """
+ Document AI identifies form fields by their offsets
+ in document text. This function converts offsets
+ to text snippets.
+ """
+ response = ""
+ # If a text segment spans several lines, it will
+ # be stored in different text segments.
+ for segment in doc_element.text_anchor.text_segments:
+ start_index = (
+ int(segment.start_index)
+ if "start_index" in doc_element.text_anchor.__dict__
+ else 0
+ )
+ end_index = int(segment.end_index)
+ response += document.text[start_index:end_index]
+ return response
+
+
+# [END documentai_batch_process_document]
diff --git a/samples/snippets/batch_process_documents_sample_v1beta3_test.py b/samples/snippets/batch_process_documents_sample_v1beta3_test.py
@@ -0,0 +1,62 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+from uuid import uuid4
+
+from google.cloud import storage
+from google.cloud.exceptions import NotFound
+
+import pytest
+
+from samples.snippets import batch_process_documents_sample_v1beta3
+
+# Test configuration shared by the fixture and the test below.
+location = "us"
+# Raises KeyError at import time when GOOGLE_CLOUD_PROJECT is not set.
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+# NOTE(review): hard-coded processor ID; presumably it exists in the CI
+# project. Confirm before running against another project.
+processor_id = "90484cfdedb024f6"
+gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
+# uuid4() yields a UUID object, not a str; it is passed to the sample as-is.
+gcs_output_uri_prefix = uuid4()
+# Bucket name carries a fresh UUID suffix on every import of this module.
+BUCKET_NAME = f"document-ai-python-{uuid4()}"
+
+
+@pytest.fixture(scope="module")
+def test_bucket():
+ storage_client = storage.Client()
+ bucket = storage_client.create_bucket(BUCKET_NAME)
+ yield bucket.name
+
+ try:
+ blobs = list(bucket.list_blobs())
+ for blob in blobs:
+ blob.delete()
+ bucket.delete()
+ except NotFound:
+ print("Bucket already deleted.")
+
+
+def test_batch_process_documents(capsys, test_bucket):
+ batch_process_documents_sample_v1beta3.batch_process_documents(
+ project_id=project_id,
+ location=location,
+ processor_id=processor_id,
+ gcs_input_uri=gcs_input_uri,
+ gcs_output_uri=f"gs://{test_bucket}",
+ gcs_output_uri_prefix=gcs_output_uri_prefix,
+ )
+ out, _ = capsys.readouterr()
+
+ assert "Extracted" in out
+ assert "Paragraph" in out
+ assert "Invoice" in out
diff --git a/samples/snippets/noxfile.py b/samples/snippets/noxfile.py
@@ -37,24 +37,22 @@
 
 TEST_CONFIG = {
  # You can opt out from the test for specific Python versions.
- 'ignored_versions': ["2.7"],
-
+ "ignored_versions": ["2.7"],
  # An envvar key for determining the project id to use. Change it
  # to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
  # build specific Cloud project. You can also use your own string
  # to use your own Cloud project.
- 'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT',
+ "gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
  # 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',
-
  # A dictionary you want to inject into your test. Don't put any
  # secrets here. These values will override predefined values.
- 'envs': {},
+ "envs": {},
 }
 
 
 try:
  # Ensure we can import noxfile_config in the project's directory.
- sys.path.append('.')
+ sys.path.append(".")
  from noxfile_config import TEST_CONFIG_OVERRIDE
 except ImportError as e:
  print("No user noxfile_config found: detail: {}".format(e))
@@ -69,13 +67,13 @@ def get_pytest_env_vars():
  ret = {}
 
  # Override the GCLOUD_PROJECT and the alias.
- env_key = TEST_CONFIG['gcloud_project_env']
+ env_key = TEST_CONFIG["gcloud_project_env"]
  # This should error out if not set.
- ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key]
- ret['GCLOUD_PROJECT'] = os.environ[env_key] # deprecated
+ ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
+ ret["GCLOUD_PROJECT"] = os.environ[env_key] # deprecated
 
  # Apply user supplied envs.
- ret.update(TEST_CONFIG['envs'])
+ ret.update(TEST_CONFIG["envs"])
  return ret
 
 
@@ -84,7 +82,7 @@ def get_pytest_env_vars():
 ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"]
 
 # Any default versions that should be ignored.
-IGNORED_VERSIONS = TEST_CONFIG['ignored_versions']
+IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]
 
 TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])
 
@@ -138,7 +136,7 @@ def lint(session):
  args = FLAKE8_COMMON_ARGS + [
  "--application-import-names",
  ",".join(local_names),
- "."
+ ".",
  ]
  session.run("flake8", *args)
 
@@ -147,6 +145,7 @@ def lint(session):
 # Black
 #
 
+
 @nox.session
 def blacken(session):
  session.install("black")
@@ -194,9 +193,9 @@ def py(session):
  if session.python in TESTED_VERSIONS:
  _session_tests(session)
  else:
- session.skip("SKIPPED: {} tests are disabled for this sample.".format(
- session.python
- ))
+ session.skip(
+ "SKIPPED: {} tests are disabled for this sample.".format(session.python)
+ )
 
 
 #

diff --git a/samples/snippets/process_document_sample_v1beta3.py b/samples/snippets/process_document_sample_v1beta3.py
@@ -0,0 +1,88 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from google.cloud import documentai_v1beta3 as documentai
+
+# [START documentai_process_document]
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
+# file_path = '/path/to/local/pdf'
+
+
+def process_document_sample(
+ project_id: str, location: str, processor_id: str, file_path: str
+):
+ # Instantiates a client
+ client = documentai.DocumentProcessorServiceClient()
+
+ # The full resource name of the processor, e.g.:
+ # projects/project-id/locations/location/processor/processor-id
+ # You must create new processors in the Cloud Console first
+ name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
+
+ with open(file_path, "rb") as image:
+ image_content = image.read()
+
+ # Read the file into memory
+ document = {"content": image_content, "mime_type": "application/pdf"}
+
+ # Configure the process request
+ request = {"name": name, "document": document}
+
+ # Recognizes text entities in the PDF document
+ result = client.process_document(request=request)
+
+ document = result.document
+
+ print("Document processing complete.")
+
+ # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
+
+ document_pages = document.pages
+
+ # Read the text recognition output from the processor
+ print("The document contains the following paragraphs:")
+ for page in document_pages:
+ paragraphs = page.paragraphs
+ for paragraph in paragraphs:
+ paragraph_text = get_text(paragraph.layout, document)
+ print(f"Paragraph text: {paragraph_text}")
+
+
+# Extract shards from the text field
+def get_text(doc_element: dict, document: dict):
+ """
+ Document AI identifies form fields by their offsets
+ in document text. This function converts offsets
+ to text snippets.
+ """
+ response = ""
+ # If a text segment spans several lines, it will
+ # be stored in different text segments.
+ for segment in doc_element.text_anchor.text_segments:
+ start_index = (
+ int(segment.start_index)
+ if segment.start_index in doc_element.text_anchor.text_segments
+ else 0
+ )
+ end_index = int(segment.end_index)
+ response += document.text[start_index:end_index]
+ return response
+
+
+# [END documentai_process_document]
diff --git a/samples/snippets/process_document_sample_v1beta3_test.py b/samples/snippets/process_document_sample_v1beta3_test.py
@@ -0,0 +1,37 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+from samples.snippets import process_document_sample_v1beta3
+
+
+# Test configuration for the online-processing sample.
+location = "us"
+# Raises KeyError at import time when GOOGLE_CLOUD_PROJECT is not set.
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+# NOTE(review): hard-coded processor ID; presumably it exists in the CI
+# project. Confirm before running against another project.
+processor_id = "90484cfdedb024f6"
+# Relative path, so it is resolved against the current working directory.
+file_path = "resources/invoice.pdf"
+
+
+def test_process_documents(capsys):
+ process_document_sample_v1beta3.process_document_sample(
+ project_id=project_id,
+ location=location,
+ processor_id=processor_id,
+ file_path=file_path,
+ )
+ out, _ = capsys.readouterr()
+
+ assert "Paragraph" in out
+ assert "Invoice" in out