Skip to content

Types

Core data types used throughout the library. These Pydantic models define the structure of data flowing through extraction and evaluation pipelines.

Import

from document_extraction_tools.types import (
    PathIdentifier,
    DocumentBytes,
    Document,
    Page,
    TextData,
    ImageData,
    ExtractionResult,
    EvaluationExample,
    EvaluationResult,
    ExtractionSchema,
    PipelineContext,
)

PathIdentifier

A unified reference to a document source. Used to track where each document originates.

Bases: BaseModel

A unified reference to a document source.

path class-attribute instance-attribute

path: str | Path = Field(..., description='The primary path identifier.')

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict, description='Optional metadata associated with the path identifier.')

Example:

from document_extraction_tools.types import PathIdentifier

# Simple path reference
path_id = PathIdentifier(path="/data/leases/lease_001.pdf")

# With additional metadata (e.g., for cloud storage)
path_id = PathIdentifier(
    path="gs://my-bucket/documents/lease.pdf",
    metadata={"bucket": "my-bucket", "region": "us-central1"}
)

DocumentBytes

A standardized container for raw document data in memory. This decouples extraction logic from storage sources.

Bases: BaseModel

A standardized container for raw document data in memory.

This model decouples the extraction logic from the storage source. It guarantees that the processor receives raw bytes regardless of origin.

file_bytes class-attribute instance-attribute

file_bytes: bytes = Field(..., description='The raw binary content of the file.')

path_identifier class-attribute instance-attribute

path_identifier: PathIdentifier = Field(..., description='Path identifier for the original source.')

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict, description='Optional metadata associated with the raw document.')

Example:

from document_extraction_tools.types import DocumentBytes, PathIdentifier

with open("lease.pdf", "rb") as f:
    doc_bytes = DocumentBytes(
        file_bytes=f.read(),
        path_identifier=PathIdentifier(path="lease.pdf"),
        metadata={"mime_type": "application/pdf"},
    )

Document

The master object representing a fully parsed document with pages.

Bases: BaseModel

The master object representing a fully parsed document.

id class-attribute instance-attribute

id: str = Field(..., description='A unique identifier for this document.')

content_type class-attribute instance-attribute

content_type: Literal['image', 'text'] = Field(..., description='The type of content extracted.')

pages class-attribute instance-attribute

pages: list[Page] = Field(default_factory=list, description='Ordered list of pages belonging to this document.')

path_identifier class-attribute instance-attribute

path_identifier: PathIdentifier = Field(..., description='Traceability link to the original source.')

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict, description='Arbitrary metadata.')

check_content_consistency

check_content_consistency() -> Document

Ensures that the data type of every page matches the declared content_type, then returns the document.

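Example (image_bytes and image_bytes_2 are placeholders for image payloads you have already loaded, one per page):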

from document_extraction_tools.types import Document, Page, ImageData, PathIdentifier

document = Document(
    id="lease_001",
    content_type="image",
    pages=[
        Page(page_number=1, data=ImageData(content=image_bytes)),
        Page(page_number=2, data=ImageData(content=image_bytes_2)),
    ],
    path_identifier=PathIdentifier(path="/data/lease_001.pdf"),
    metadata={"page_count": 2, "source": "local"},
)

Page

Represents a single page within a document.

Bases: BaseModel

Represents a single page within a document.

page_number class-attribute instance-attribute

page_number: int = Field(..., ge=1, description='The 1-based index of the page in the original document.')

data class-attribute instance-attribute

data: ImageData | TextData = Field(..., description='The payload for the page.')

Example (image_bytes is a placeholder for an image payload you have already loaded):

from document_extraction_tools.types import Page, TextData, ImageData

# Text page
text_page = Page(page_number=1, data=TextData(content="Lease agreement..."))

# Image page
image_page = Page(page_number=1, data=ImageData(content=image_bytes))

TextData

Encapsulates textual content extracted from a page.

Bases: BaseModel

Encapsulates textual content.

content class-attribute instance-attribute

content: str = Field(..., description='The extracted text string.')

ImageData

Encapsulates image content in various formats (bytes, PIL Image, or NumPy array).

Bases: BaseModel

Encapsulates image content in various formats.

model_config class-attribute instance-attribute

model_config = ConfigDict(arbitrary_types_allowed=True)

content class-attribute instance-attribute

content: bytes | PILImageType | NumpyArrayType = Field(..., description='The image payload. Can be raw bytes, a PIL Image, or a NumPy array.')

Example (raw_bytes is a placeholder for image bytes you have already loaded):

from PIL import Image
from document_extraction_tools.types import ImageData

# From bytes
image_data = ImageData(content=raw_bytes)

# From PIL Image
pil_image = Image.open("page.png")
image_data = ImageData(content=pil_image)

# From NumPy array
import numpy as np
np_array = np.array(pil_image)
image_data = ImageData(content=np_array)

ExtractionSchema

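A type variable representing any Pydantic model used as an extraction schema. It is not a base class to inherit from: define your schema as an ordinary Pydantic BaseModel subclass and pass it wherever ExtractionSchema is expected.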

Usage:

from pydantic import BaseModel, Field
from document_extraction_tools.types import ExtractionSchema

class LeaseSchema(BaseModel):
    """Your custom extraction schema."""
    landlord_name: str = Field(..., description="Landlord name")
    tenant_name: str = Field(..., description="Tenant name")
    monthly_rent: float = Field(..., description="Monthly rent")

# LeaseSchema can be used wherever ExtractionSchema is expected

ExtractionResult

Wraps the extracted schema data along with optional metadata. This is the return type of the BaseExtractor.extract() method.

Bases: BaseModel, Generic[ExtractionSchema]

Wraps an extraction schema with optional metadata.

data instance-attribute

data: ExtractionSchema

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict, description='Optional metadata associated with the extraction result.')

Example (uses the LeaseSchema model defined in the ExtractionSchema section above):

from document_extraction_tools.types import ExtractionResult

# Create an extraction result with metadata
result = ExtractionResult(
    data=LeaseSchema(
        landlord_name="John Smith",
        tenant_name="Jane Doe",
        monthly_rent=2500.00,
    ),
    metadata={
        "model": "gpt-4",
        "tokens_used": 1234,
        "confidence": 0.95,
    },
)

# Access the extracted data
print(result.data.landlord_name)  # "John Smith"

# Access metadata
print(result.metadata.get("confidence"))  # 0.95

PipelineContext

A shared context object that can be passed through pipeline components to maintain state or share information across the pipeline.

Bases: BaseModel

Shared context passed through pipeline components.

context class-attribute instance-attribute

context: dict[str, Any] = Field(default_factory=dict, description='Shared context values available across pipeline components.')

Example:

from document_extraction_tools.types import PipelineContext

# Create context with runtime values
context = PipelineContext(
    context={
        "run_id": "extraction-2024-01-15",
        "started_at": "2024-01-15T10:30:00",
    }
)

# Access context values in components
run_id = context.context.get("run_id")

# Pass to orchestrator.run() (awaited, so call this from within async code)
await orchestrator.run(file_paths, context=context)

EvaluationExample

Pairs a ground-truth schema with a source document for evaluation.

Bases: BaseModel, Generic[ExtractionSchema]

Pairs a ground-truth schema with a source document.

id class-attribute instance-attribute

id: str = Field(..., description='Identifier for the test example.')

path_identifier class-attribute instance-attribute

path_identifier: PathIdentifier = Field(..., description='Source location for the test example.')

true class-attribute instance-attribute

true: ExtractionResult[ExtractionSchema] = Field(..., description='Ground-truth data with metadata.')

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict, description='Optional metadata associated with the evaluation example.')

Example (uses the LeaseSchema model defined in the ExtractionSchema section above):

from document_extraction_tools.types import EvaluationExample, ExtractionResult, PathIdentifier

example = EvaluationExample(
    id="lease_001",
    path_identifier=PathIdentifier(
        path="data/leases/lease_001.pdf",
        metadata={"source": "local", "mime_type": "application/pdf"},
    ),
    true=ExtractionResult(
        data=LeaseSchema(
            landlord_name="John Smith",
            tenant_name="Jane Doe",
            monthly_rent=2500.00,
        ),
    ),
)

EvaluationResult

Represents a single evaluation result produced by an evaluator.

Bases: BaseModel

Represents a single evaluation result for one document.

name class-attribute instance-attribute

name: str = Field(..., description='Name of the evaluator or metric.')

result class-attribute instance-attribute

result: Any = Field(..., description='Computed metric value.')

description class-attribute instance-attribute

description: str = Field(..., description='Human-readable description.')

metadata class-attribute instance-attribute

metadata: dict[str, Any] = Field(default_factory=dict, description='Optional metadata associated with the evaluation result.')

Example:

from document_extraction_tools.types import EvaluationResult

result = EvaluationResult(
    name="field_accuracy",
    result=0.85,
    description="17/20 fields matched exactly",
)