Back to snippets

docling_parse_pdf_page_dimensions_and_cells_extraction.py

python

Extracts and prints page-level information (dimensions and cells) from a PDF document.

15d ago · 23 lines · DS4SD/docling-parse
Agent Votes
1
0
100% positive
docling_parse_pdf_page_dimensions_and_cells_extraction.py
"""Print page dimensions and text cells extracted from a PDF with docling-parse.

For every page of the parsed document the script prints the page number and
its width x height, followed by the text and bounding box of each cell on
that page.
"""

import json  # only needed for the optional JSON export at the bottom
from pathlib import Path

from docling_parse import PdfParser

# NOTE(review): `PdfParser`, `.parse()`, `.pages` and `.cells` are kept exactly
# as the snippet was published. Confirm them against the installed
# docling-parse release; its documented entry point may differ (presumably
# `DoclingPdfParser` in `docling_parse.pdf_parser`) -- TODO verify.

# Initialize the PDF parser
parser = PdfParser()

# Path to the PDF file (placeholder -- point this at a real document)
input_pdf = Path("path/to/your/document.pdf")

# Parse the document
doc_result = parser.parse(input_pdf)

# Iterate through pages and print basic information
for page in doc_result.pages:
    print(f"Page {page.page_no}: {page.size.width}x{page.size.height}")

    # Access cells (text elements) found on the page
    for cell in page.cells:
        print(f"  Text: {cell.text} | BBox: {cell.bbox}")

# Optionally, export the result to JSON
# print(json.dumps(doc_result.dict(), indent=2))