Document Types

Types for document upload, processing, and search operations

DocumentUploadResult

Result of a single document upload (returned by upload_one())

@dataclass(frozen=True)
class DocumentUploadResult:
    document_id: str                                # Unique document identifier
    filename: str                                   # Original filename
    object_key: str                                 # Storage object key (S3 path)
    content_type: str                               # MIME type
    size_bytes: int                                 # File size in bytes
    text_extraction_status: DocumentProcessingStatus = DocumentProcessingStatus.PENDING  # Processing status
    embedding_status: Optional[str] = None          # Embedding pipeline status
    page_count: Optional[int] = None                # Pages (for PDFs)
    chunk_count: Optional[int] = None               # Text chunks created
    extracted_text_preview: Optional[str] = None    # Preview of extracted text
    created_at: Optional[datetime] = None           # Upload timestamp
    processing_error: Optional[str] = None          # Error message if failed
    processing_error_type: Optional[str] = None     # Error classification
    processing_is_retryable: Optional[bool] = None  # Whether retry may succeed

    # Properties
    is_failed: bool     # text_extraction_status == FAILED
    is_completed: bool  # text_extraction_status == COMPLETED
    is_pending: bool    # text_extraction_status in (PENDING, PROCESSING)

    def to_dict(self, exclude_none: bool = True) -> dict[str, Any]: ...
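
Example usage (a minimal sketch; the client handle and its documents accessor are assumptions, not part of this module, though upload_one() is the documented entry point):

result = client.documents.upload_one("report.pdf")
if result.is_failed:
    print(f"{result.filename} failed: {result.processing_error} "
          f"(type={result.processing_error_type}, retryable={result.processing_is_retryable})")
elif result.is_completed:
    print(f"{result.filename}: {result.page_count} pages, {result.chunk_count} chunks")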

BatchDocumentUploadResults

List of DocumentUploadResult with batch helpers (returned by upload())

class BatchDocumentUploadResults(list):
    # Properties
    has_failures: bool   # True if any processing failed
    failed_count: int    # Number of failed results
    succeeded_count: int # Number of successful results
    pending_count: int   # Number still pending

    # Methods
    def failed(self) -> list[DocumentUploadResult]: ...
    def succeeded(self) -> list[DocumentUploadResult]: ...
    def pending(self) -> list[DocumentUploadResult]: ...
    def retryable(self) -> list[DocumentUploadResult]: ...
    def raise_on_failures(self) -> BatchDocumentUploadResults: ...
    def summary(self) -> str: ...  # e.g. "3 succeeded, 1 failed (1 retryable)"
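
Example usage (a sketch; upload() is the documented entry point, while the client handle is an assumption):

results = client.documents.upload(["a.pdf", "b.docx", "c.txt"])
print(results.summary())  # e.g. "2 succeeded, 1 failed (1 retryable)"
if results.has_failures:
    for r in results.retryable():
        client.documents.upload_one(r.filename)  # re-attempt retryable failures
# Alternatively, fail fast:
results.raise_on_failures()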

DocumentItem

Document summary in list responses

@dataclass(frozen=True)
class DocumentItem:
    id: str                                 # Unique document identifier
    filename: str                           # Original uploaded filename
    content_type: str                       # MIME type
    size_bytes: int                         # File size in bytes
    text_extraction_status: str             # pending | processing | completed | failed
    title: Optional[str] = None             # Document title, if set
    page_count: Optional[int] = None        # Pages (for PDFs)
    embedding_status: Optional[str] = None  # Embedding pipeline status
    created_at: Optional[datetime] = None   # Creation timestamp
    updated_at: Optional[datetime] = None   # Last update timestamp
    folder_id: Optional[str] = None         # Containing folder, if any
    tags: Optional[list[str]] = None        # User-assigned tags

DocumentDetails

Full document details (from get response)

@dataclass(frozen=True)
class DocumentDetails:
    id: str                                       # Unique document identifier
    filename: str                                 # Original uploaded filename
    content_type: str                             # MIME type
    size_bytes: int                               # File size in bytes
    text_extraction_status: str                   # pending | processing | completed | failed
    title: Optional[str] = None                   # Document title, if set
    page_count: Optional[int] = None              # Pages (for PDFs)
    embedding_status: Optional[str] = None        # Embedding pipeline status
    chunk_count: Optional[int] = None             # Number of text chunks
    extracted_text_preview: Optional[str] = None  # Preview of extracted text
    created_at: Optional[datetime] = None         # Creation timestamp
    updated_at: Optional[datetime] = None         # Last update timestamp
    folder_id: Optional[str] = None               # Containing folder, if any
    tags: Optional[list[str]] = None              # User-assigned tags
    object_key: Optional[str] = None              # Storage object key (S3 path)
    download_url: Optional[str] = None            # Presigned download URL
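
A sketch of fetching details and downloading the original file (the client handle is an assumption; requests stands in for any HTTP client):

import requests

document_id = "doc_123"  # placeholder ID
details = client.documents.get(document_id)
if details.download_url:
    # Presigned URLs expire; download promptly after fetching details
    payload = requests.get(details.download_url).content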

DocumentList

Paginated list of documents

@dataclass(frozen=True)
class DocumentList:
    documents: list[DocumentItem]  # Document summaries
    total_count: int               # Total matching query
    page: int                      # Current page number (1-based)
    page_size: int                 # Items per page
    has_more: bool                 # More pages available
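
A pagination sketch (the list() method name and its parameters are assumptions):

page = 1
while True:
    listing = client.documents.list(page=page, page_size=50)
    for doc in listing.documents:
        print(doc.filename, doc.text_extraction_status)
    if not listing.has_more:
        break
    page += 1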

DocumentChunk

Individual text chunk from a document

@dataclass(frozen=True)
class DocumentChunk:
    id: str                                   # Chunk identifier
    document_id: str                          # Parent document ID
    content: str                              # Text content
    chunk_index: int                          # Position in document (0-based)
    page_numbers: Optional[list[int]] = None  # Pages this chunk spans
    token_count: Optional[int] = None         # Token count of the chunk text
    embedding: Optional[list[float]] = None   # Vector embedding (if requested)

DocumentChunksResponse

Response containing all chunks for a document

@dataclass(frozen=True)
class DocumentChunksResponse:
    document_id: str            # Parent document ID
    chunks: list[DocumentChunk] # All chunks
    total_chunks: int           # Total number of chunks
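
A sketch of walking a document's chunks in order (the get_chunks() method name is an assumption):

document_id = "doc_123"  # placeholder ID
resp = client.documents.get_chunks(document_id)
for chunk in sorted(resp.chunks, key=lambda c: c.chunk_index):
    pages = chunk.page_numbers or []
    print(f"[{chunk.chunk_index}] pages={pages}: {chunk.content[:80]}")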

DocumentSearchResult

Single search result (chunk with relevance score)

@dataclass(frozen=True)
class DocumentSearchResult:
    chunk_id: str                             # Chunk identifier
    document_id: str                          # Parent document ID
    document_filename: str                    # Filename of the document
    content: str                              # Matching text content
    score: float                              # Similarity score (0-1)
    page_numbers: Optional[list[int]] = None  # Pages this chunk spans
    chunk_index: Optional[int] = None         # Position in document

DocumentSearchResponse

Complete search response

@dataclass(frozen=True)
class DocumentSearchResponse:
    results: list[DocumentSearchResult]  # Matching chunks
    total_count: int                     # Total matches
    search_time_ms: int                  # Search time in ms
    query: str                           # The search query used
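
A search sketch (the search() method name is an assumption):

resp = client.documents.search("quarterly revenue")
print(f"{resp.total_count} matches for {resp.query!r} in {resp.search_time_ms} ms")
for hit in resp.results:
    print(f"{hit.score:.2f}  {hit.document_filename}  {hit.content[:60]}")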

DocumentStatusResult

Document processing status (from get_status() or wait_for_processing())

@dataclass(frozen=True)
class DocumentStatusResult:
    document_id: str                        # Document identifier
    text_extraction_status: str             # pending | processing | completed | failed
    embedding_status: Optional[str] = None  # Embedding pipeline status
    page_count: Optional[int] = None        # Pages (for PDFs)
    chunk_count: Optional[int] = None       # Text chunks created
    error_message: Optional[str] = None     # Error message if failed

    # Properties
    is_completed: bool   # text_extraction_status == "completed"
    is_failed: bool      # text_extraction_status == "failed"
    is_processing: bool  # text_extraction_status in ("pending", "processing")
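
A simple polling sketch; wait_for_processing() presumably wraps a loop like this, and the client handle is an assumption:

import time

document_id = "doc_123"  # placeholder ID
status = client.documents.get_status(document_id)
while status.is_processing:
    time.sleep(2)  # fixed interval; real code may want backoff and a timeout
    status = client.documents.get_status(document_id)
if status.is_failed:
    print(f"Processing failed: {status.error_message}")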

DocumentQuotaCheck

Quota check response for document uploads

@dataclass(frozen=True)
class DocumentQuotaCheck:
    can_proceed: bool         # Whether upload can proceed
    document_count: int       # Current document count
    document_limit: int       # Maximum allowed
    storage_used_bytes: int   # Storage currently used
    storage_limit_bytes: int  # Storage limit
    message: str = ""         # Human-readable explanation

    # Properties
    documents_remaining: int      # document_limit - document_count
    storage_remaining_bytes: int  # storage_limit_bytes - storage_used_bytes
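
A pre-flight check sketch (the check_quota() method name is an assumption):

import os

file_size_bytes = os.path.getsize("report.pdf")
quota = client.documents.check_quota()
if not quota.can_proceed or file_size_bytes > quota.storage_remaining_bytes:
    raise RuntimeError(quota.message or "Upload would exceed quota")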

DocumentPresignedUploadResult

Presigned URL response for low-level upload workflow

@dataclass(frozen=True)
class DocumentPresignedUploadResult:
    upload_url: str                                  # Presigned URL for S3 upload
    object_key: str                                  # S3 object key
    expires_at: Optional[datetime] = None            # URL expiration
    upload_headers: Optional[dict[str, str]] = None  # Headers for upload request

DocumentConfirmResult

Upload confirmation response

@dataclass(frozen=True)
class DocumentConfirmResult:
    document_id: str  # Created document ID
    filename: str     # Original filename
    status: str       # Processing status
    message: str = "" # Confirmation message
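
A sketch of the low-level workflow tying DocumentPresignedUploadResult and DocumentConfirmResult together (the create_presigned_upload() and confirm_upload() method names are assumptions; requests stands in for any HTTP client):

import requests

presigned = client.documents.create_presigned_upload("big.pdf")
with open("big.pdf", "rb") as f:
    requests.put(presigned.upload_url, data=f, headers=presigned.upload_headers or {})
confirm = client.documents.confirm_upload(presigned.object_key)
print(confirm.document_id, confirm.status, confirm.message)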

DocumentDeleteResult

Single document deletion result

@dataclass(frozen=True)
class DocumentDeleteResult:
    id: str                                 # Document identifier
    status: str                             # Deletion status: deleted | failed
    message: str = ""                       # Additional information
    deleted_at: Optional[datetime] = None   # Deletion timestamp

DocumentBatchDeleteResponse

Response for batch document deletion

@dataclass(frozen=True)
class DocumentBatchDeleteResponse:
    deleted: list[DocumentDeleteResult]  # Successfully deleted
    failed: list[DocumentDeleteResult]   # Failed to delete
    summary: dict[str, int]              # {total, deleted, failed}
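
A sketch of handling a batch delete (the delete_many() method name is an assumption):

resp = client.documents.delete_many(["doc_1", "doc_2"])
print(resp.summary)  # e.g. {"total": 2, "deleted": 1, "failed": 1}
for item in resp.failed:
    print(f"Could not delete {item.id}: {item.message}")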

DocumentProcessingStatus

Enum for document processing status

class DocumentProcessingStatus(str, Enum):
    PENDING = "pending"        # Uploaded, not yet started
    PROCESSING = "processing"  # Text extraction in progress
    COMPLETED = "completed"    # Processing completed
    FAILED = "failed"          # Processing failed

DocumentProcessingErrorType

Classification of document processing errors for retry strategies

class DocumentProcessingErrorType(str, Enum):
    TIMEOUT = "timeout"                        # Processing timed out (retryable)
    EXTRACTION_ERROR = "extraction_error"      # Text extraction failed (may be retryable)
    UNSUPPORTED_FORMAT = "unsupported_format"  # Format not supported (permanent)
    CORRUPT_FILE = "corrupt_file"              # File is corrupted (permanent)
    PASSWORD_PROTECTED = "password_protected"  # Requires password (permanent)
    TOO_LARGE = "too_large"                    # Exceeds size limits (permanent)
    EMBEDDING_ERROR = "embedding_error"        # Embedding generation failed (retryable)
    RESOURCE_LIMIT = "resource_limit"          # Quota exceeded (permanent)
    UNKNOWN = "unknown"                        # Unclassified (not retryable)

DocumentProcessingFailure

Details about a failed document processing operation with error classification

@dataclass(frozen=True)
class DocumentProcessingFailure:
    document_id: str                         # ID of the document that failed
    error_message: str                       # Human-readable error message
    error_type: DocumentProcessingErrorType  # Classification of the error
    is_retryable: bool                       # Whether the operation can be retried
    filename: Optional[str] = None           # Original filename

    # Static and class methods
    @staticmethod
    def classify_error(error_msg: str) -> tuple[DocumentProcessingErrorType, bool]: ...

    @classmethod
    def from_error_message(cls, document_id: str, error_msg: str, filename: Optional[str] = None) -> DocumentProcessingFailure: ...
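
Example (how a given message maps to an error type depends on classify_error's internal rules, so the commented outcome is an assumption):

failure = DocumentProcessingFailure.from_error_message(
    document_id="doc_123",
    error_msg="PDF is password protected",
    filename="secret.pdf",
)
print(failure.error_type, failure.is_retryable)
# Plausibly: DocumentProcessingErrorType.PASSWORD_PROTECTED, False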