This repository was archived by the owner on Sep 20, 2023. It is now read-only.
- Notifications
You must be signed in to change notification settings - Fork 33
docs(samples): new Doc AI samples for v1beta3 #44
Merged
Merged
Changes from all commits
Commits
Show all changes
49 commits Select commit Hold shift + click to select a range
b2dc573 batch_process_sample. changing from async to synchronous
aribray b01d802 add quick start and process_document samples and tests
aribray cfb964a add test and sample for batch_process
aribray 8f9246d add test and sample for batch_process
aribray ba7681a resolve merge conflict
aribray a37f39a python document ai samples
aribray 99f7f11 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 87254c7 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray bcf97a6 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 26b9450 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 4943437 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 9439937 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 15dd4e4 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 0943fba Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 01058fe Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray d616c54 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 0b18336 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 4d08bf4 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray 6ee7994 Merge branch 'python-docai' of https://github.com/aribray/python-docu…
aribray dc24b32 resolve formatting
aribray 82e8ab9 use os.environ
aribray c373ac3 remove os.path.join
aribray 6389213 move tests
aribray 37cd427 descriptive variable
aribray aef335e specific Exception, formatting
aribray a4d2b4a parse all pages in process_document
aribray bbc187e add more helpful comments
aribray dd6488f remove unused imports
aribray 2179581 better exception handling
aribray 3cb2c0a rename test files
aribray f424aee Merge branch 'master' into python-docai
aribray 27b63f1 Merge branch 'master' into python-docai
aribray 043b445 ran linter, removed nested function in batch predict
aribray dba5ef8 refactor tests
aribray 5416bbc format imports
aribray d9e2cca format imports
aribray 700ab75 format imports
aribray 66dde36 serialize as Document object
aribray 0b839e8 extract get_text helper function
aribray bda06e8 fix file path
aribray ad5ff58 delete test bucket
aribray cd4a1d1 Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray e9ba609 Update samples/snippets/batch_process_documents_sample_v1beta3_test.py
aribray a439e32 add more specific assertion in batch_process
aribray 4e3f369 add more specific assertion in process_document and quickstart
aribray 9c7adaf fix output_uri name
aribray 0849731 Apply suggestions from code review to resolve exception
aribray 61f0c7f resolve exception
aribray 80d0fb4 lint
aribray File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
Empty file.
Empty file.
121 changes: 121 additions & 0 deletions 121 samples/snippets/batch_process_documents_sample_v1beta3.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,121 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| | ||
| | ||
| # [START documentai_batch_process_document] | ||
| import re | ||
| | ||
| from google.cloud import documentai_v1beta3 as documentai | ||
| from google.cloud import storage | ||
| | ||
| # TODO(developer): Uncomment these variables before running the sample. | ||
| # project_id= 'YOUR_PROJECT_ID' | ||
| # location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu' | ||
| # processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console | ||
| # gcs_input_uri = "YOUR_INPUT_URI" | ||
| # gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI" | ||
| # gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" | ||
| | ||
| | ||
def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process one GCS document and print the extracted content.

    Submits a batch request to the given processor, waits for it to
    finish, then downloads every result blob written under the output
    prefix and prints its form fields and paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; the sample expects 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        gcs_input_uri: ``gs://`` URI of the document to process.
        gcs_output_uri: ``gs://`` URI of the output bucket.
        gcs_output_uri_prefix: Prefix under the bucket for result files.

    Raises:
        ValueError: If the combined output location is not of the form
            ``gs://<bucket>/<prefix>``.
    """
    client = documentai.DocumentProcessorServiceClient()

    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Split the destination into bucket and prefix up front so that a
    # malformed URI fails before the long-running job is started rather
    # than after it has completed.
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Output location must look like gs://<bucket>/<prefix>: {destination_uri}"
        )
    output_bucket = match.group(1)
    prefix = match.group(2)

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS under the destination prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for file_number, blob in enumerate(blob_list, start=1):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {file_number}")

        # For a full list of Document object attributes, please reference this page:
        # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Walk this page's paragraphs (not the document's pages again).
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")
| | ||
| | ||
| # Extract shards from the text field | ||
def get_text(doc_element, document) -> str:
    """Return the document text that a layout element points at.

    Document AI identifies elements by their offsets into the document
    text. This function converts those offsets into the text snippet.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
            Accessed by attribute, so a plain dict will not work.
        document: Object exposing the full text as ``document.text``.

    Returns:
        The concatenation of every referenced slice of ``document.text``,
        or an empty string when the element has no text segments.
    """
    # If a text segment spans several lines, it will be stored in
    # different text segments, so every segment's own start offset must
    # be honoured (slicing each one from 0 would duplicate leading text).
    return "".join(
        document.text[int(segment.start_index):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )
| | ||
| | ||
| # [END documentai_batch_process_document] | ||
62 changes: 62 additions & 0 deletions 62 samples/snippets/batch_process_documents_sample_v1beta3_test.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,62 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
| | ||
| import os | ||
| from uuid import uuid4 | ||
| | ||
| from google.cloud import storage | ||
| from google.cloud.exceptions import NotFound | ||
| | ||
| import pytest | ||
| | ||
| from samples.snippets import batch_process_documents_sample_v1beta3 | ||
| | ||
| location = "us" | ||
| project_id = os.environ["GOOGLE_CLOUD_PROJECT"] | ||
| processor_id = "90484cfdedb024f6" | ||
| gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf" | ||
| gcs_output_uri_prefix = uuid4() | ||
| BUCKET_NAME = f"document-ai-python-{uuid4()}" | ||
| | ||
aribray marked this conversation as resolved. Show resolved Hide resolved | ||
| | ||
@pytest.fixture(scope="module")
def test_bucket():
    """Create a throwaway GCS bucket for the module and yield its name.

    After the tests finish, every blob in the bucket is deleted and then
    the bucket itself; a bucket that is already gone is only reported.
    """
    gcs = storage.Client()
    scratch_bucket = gcs.create_bucket(BUCKET_NAME)
    yield scratch_bucket.name

    try:
        # A bucket must be emptied before it is deleted here.
        leftovers = list(scratch_bucket.list_blobs())
        for leftover in leftovers:
            leftover.delete()
        scratch_bucket.delete()
    except NotFound:
        print("Bucket already deleted.")
| | ||
| | ||
def test_batch_process_documents(capsys, test_bucket):
    """Run the batch sample against the scratch bucket and check stdout."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": f"gs://{test_bucket}",
        "gcs_output_uri_prefix": gcs_output_uri_prefix,
    }
    batch_process_documents_sample_v1beta3.batch_process_documents(**sample_kwargs)
    captured = capsys.readouterr()
    out = captured.out

    # The sample prints form fields, paragraphs and the invoice text.
    assert "Extracted" in out
    assert "Paragraph" in out
    assert "Invoice" in out
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,88 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
| | ||
| from google.cloud import documentai_v1beta3 as documentai | ||
| | ||
| # [START documentai_process_document] | ||
| | ||
| # TODO(developer): Uncomment these variables before running the sample. | ||
| # project_id = 'YOUR_PROJECT_ID' | ||
| # location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu' | ||
| # processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console | ||
| # file_path = '/path/to/local/pdf' | ||
| | ||
| | ||
def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Send a local PDF to a Document AI processor and print its paragraphs.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; the sample expects 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path to the local PDF file to process.
    """
    # Instantiates a client
    docai_client = documentai.DocumentProcessorServiceClient()

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    # Read the file into memory
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    # Configure the process request and run it; the service recognizes
    # text entities in the PDF document.
    response = docai_client.process_document(
        request={
            "name": processor_name,
            "document": {"content": raw_bytes, "mime_type": "application/pdf"},
        }
    )
    processed = response.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page:
    # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Read the text recognition output from the processor
    print("The document contains the following paragraphs:")
    for page in processed.pages:
        for paragraph in page.paragraphs:
            snippet = get_text(paragraph.layout, processed)
            print(f"Paragraph text: {snippet}")
| | ||
| | ||
| # Extract shards from the text field | ||
def get_text(doc_element, document) -> str:
    """Return the document text that a layout element points at.

    Document AI identifies elements by their offsets into the document
    text. This function converts those offsets into the text snippet.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
            Accessed by attribute, so a plain dict will not work.
        document: Object exposing the full text as ``document.text``.

    Returns:
        The concatenation of every referenced slice of ``document.text``,
        or an empty string when the element has no text segments.
    """
    # If a text segment spans several lines, it will be stored in
    # different text segments. Each slice must begin at that segment's
    # own start offset; starting every slice at 0 would repeat all the
    # text that precedes the element.
    pieces = []
    for segment in doc_element.text_anchor.text_segments:
        start_index = int(segment.start_index)
        end_index = int(segment.end_index)
        pieces.append(document.text[start_index:end_index])
    return "".join(pieces)
| | ||
| | ||
| # [END documentai_process_document] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| # Copyright 2020 Google LLC | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| # | ||
| | ||
| import os | ||
| | ||
| from samples.snippets import process_document_sample_v1beta3 | ||
| | ||
| | ||
| location = "us" | ||
| project_id = os.environ["GOOGLE_CLOUD_PROJECT"] | ||
| processor_id = "90484cfdedb024f6" | ||
| file_path = "resources/invoice.pdf" | ||
| | ||
| | ||
def test_process_documents(capsys):
    """Run the online-processing sample on the bundled invoice and check stdout."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_sample_v1beta3.process_document_sample(**sample_kwargs)
    captured = capsys.readouterr()
    out = captured.out

    # The sample prints each paragraph, which includes the invoice heading.
    assert "Paragraph" in out
    assert "Invoice" in out
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.