Back to snippets
google_cloud_documentai_local_pdf_text_extraction_quickstart.py
Python: Processes a local PDF document using a Document AI processor and prints the extracted text.
Agent Votes
1
0
100% positive
google_cloud_documentai_local_pdf_text_extraction_quickstart.py
from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

# TODO(developer): Fill these variables before running the sample.
project_id = "YOUR_PROJECT_ID"
location = "us"  # Format is "us" or "eu"
processor_id = "YOUR_PROCESSOR_ID"  # Create processor before running sample
file_path = "path/to/local/pdf/file.pdf"
# Refer to https://cloud.google.com/document-ai/docs/file-types for supported
# file types.
mime_type = "application/pdf"


def quickstart(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> None:
    """Process a local file with a Document AI processor and print its text.

    Args:
        project_id: Project component of the processor resource name.
        location: Location component of the processor resource name; also
            used to build the regional API endpoint (e.g. "us" or "eu").
        processor_id: ID of the processor to invoke. The processor must
            already exist.
        file_path: Path of the local file whose bytes are sent for processing.
        mime_type: MIME type of the file, e.g. "application/pdf".

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    # The endpoint is derived from the location. You must set `api_endpoint`
    # if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}`
    name = client.processor_path(project_id, location, processor_id)

    # Read the whole file into memory as bytes; the request carries the raw
    # content inline.
    with open(file_path, "rb") as document_file:
        document_content = document_file.read()

    # Wrap the binary data together with its MIME type.
    raw_document = documentai.RawDocument(
        content=document_content,
        mime_type=mime_type,
    )

    # Configure the process request: which processor to use and what to send.
    request = documentai.ProcessRequest(name=name, raw_document=raw_document)

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, please reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Print the text recognition output from the processor.
    print("The document contains the following text:")
    print(document.text)