google_cloud_documentai_local_pdf_text_extraction_quickstart.py

python
Processes a local PDF document using a Document AI processor and prints the extracted text.
15d ago · 47 lines
cloud.google.com
Agent Votes
100% positive
google_cloud_documentai_local_pdf_text_extraction_quickstart.py
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

# TODO(developer): Fill these variables before running the sample.
# The placeholders below share their names with the parameters of `quickstart`.
project_id = "YOUR_PROJECT_ID"
# Format is "us" or "eu".
location = "us"
# Create the processor before running the sample.
processor_id = "YOUR_PROCESSOR_ID"
# Path to the local file whose bytes are uploaded for processing.
file_path = "path/to/local/pdf/file.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types.
mime_type = "application/pdf"


def quickstart(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
):
    """Run a local file through a Document AI processor and print its text.

    Args:
        project_id: Project component of the processor resource name.
        location: Processor location; it selects the API endpoint and is
            also part of the processor resource name.
        processor_id: Processor component of the resource name.
        file_path: Path of the local file that is read and uploaded.
        mime_type: MIME type declared for the uploaded bytes.
    """
    # The endpoint is derived from the location, so a location other than
    # "us" is routed to its own regional host.
    endpoint = f"{location}-documentai.googleapis.com"
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(api_endpoint=endpoint)
    )

    # Fully qualified processor resource name, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}`
    processor_name = client.processor_path(project_id, location, processor_id)

    # Pull the entire file into memory as raw bytes.
    with open(file_path, "rb") as source_file:
        payload = source_file.read()

    # Wrap the bytes with their MIME type and address them to the processor.
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
    )

    response = client.process_document(request=request)

    # The full list of `Document` attributes is documented at:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    processed = response.document

    # Emit the text recognized by the processor.
    print("The document contains the following text:")
    print(processed.text)