Quick Start Examples

Parse Your First Document

from cerevox import Lexa

# Create the client; the API key is taken from the CEREVOX_API_KEY
# environment variable.
client = Lexa()

# Parse one financial document and take the first parsed result.
documents = client.parse("invoice.pdf")
doc = documents[0]

# Summarize what was extracted: text length, table count, and a short preview.
text = doc.content
print(f"✅ Extracted {len(text)} characters")
print(f"📊 Found {len(doc.tables)} tables")
print(f"💰 Content preview: {text[:200]}...")

# Example of the extracted content for an invoice:
# "Invoice #INV-2024-001
#  Bill To: Acme Corporation
#  Amount Due: $1,299.99
#  Due Date: 2024-02-15"

Real-World Use Cases

Financial Document Processing

from cerevox import Lexa

client = Lexa()

# Financial documents to parse together in a single call.
financial_files = [
    "invoices/q1-2024-invoices.pdf",
    "statements/bank-statement.pdf",
    "receipts/expense-receipts.xlsx",
]
documents = client.parse(financial_files)

# Report the title and table count of every parsed document.
for doc in documents:
    print(f"Document: {doc.title}")
    tables = doc.tables
    print(f"Tables found: {len(tables)}")

    # Nothing more to show for documents without tables.
    if not tables:
        continue

    # Describe the first table only.
    # NOTE(review): rows/columns are printed as-is; if they are sequences
    # rather than counts, wrap them in len() - confirm against the SDK's
    # table type.
    first_table = tables[0]
    print(f"Financial data: {first_table.rows} rows x {first_table.columns} columns")

# The parsed tables can now be handed to accounting software.

Research & Analysis

from cerevox import Lexa

client = Lexa()

# Research inputs in several formats, parsed in a single call.
research_files = [
    "papers/ai-research-2024.pdf",
    "reports/market-analysis.docx",
    "data/survey-results.xlsx",
]
documents = client.parse(research_files)

# Print a short summary of each parsed document.
for doc in documents:
    print(f"📚 Paper: {doc.title}")
    content_length = len(doc.content)
    print(f"📖 Content: {content_length} chars")
    table_count = len(doc.tables)
    print(f"📊 Data tables: {table_count} tables")
    image_count = len(doc.images)
    print(f"🖼️  Figures: {image_count} images")

# Formatting and structure of the source documents are preserved.

Vector Database Integration

RAG Application Ready

from cerevox import Lexa

client = Lexa()

# Parse the documents that will feed the RAG pipeline.
rag_sources = [
    "knowledge-base/product-docs.pdf",
    "support/troubleshooting.docx",
]
documents = client.parse(rag_sources)

# Chunking options, sized for typical embedding models.
# NOTE(review): confirm these keyword names and the chunk attributes used
# below against the installed cerevox version.
chunking_options = {
    "target_size": 500,        # target chunk size
    "overlap_size": 50,        # overlap to avoid losing context between chunks
    "include_metadata": True,  # attach metadata to every chunk
}
chunks = documents.get_all_text_chunks(**chunking_options)

print(f"🔗 Ready for embedding: {len(chunks)} chunks")

# Preview the first two chunks together with their retrieval metadata.
preview_chunks = chunks[:2]
for chunk in preview_chunks:
    print(f"\nChunk preview: {chunk.content[:100]}...")
    print(f"Metadata: page={chunk.page_number}, source={chunk.source_file}")

Different Input Methods

File Processing

from cerevox import Lexa
from pathlib import Path

client = Lexa()

# One file: parse() returns a collection, so take its first document.
parsed = client.parse("reports/annual-report.pdf")
doc = parsed[0]
print(f"✅ Parsed: {len(doc.content)} characters")

# Many files: pass a list of Path objects collected from a folder.
docs_folder = Path("documents")
pdf_files = [*docs_folder.glob("*.pdf")]
documents = client.parse(pdf_files)
print(f"✅ Processed {len(documents)} PDF files")

# Different file formats can be mixed in a single call.
mixed_files = [
    "data.xlsx",     # Excel spreadsheet
    "report.docx",   # Word document
    "slides.pptx",   # PowerPoint
    "data.csv",      # CSV file
    "webpage.html",  # HTML file
]
documents = client.parse(mixed_files)
print(f"✅ Processed {len(documents)} mixed format files")

URL Processing

from cerevox import Lexa

client = Lexa()

# Fetch and parse a document directly from a web URL (an SEC filing here).
url = "https://www.sec.gov/Archives/edgar/data/320193/000032019323000077/aapl-20230930.htm"
documents = client.parse_urls(url)

print("✅ Downloaded and parsed SEC filing")

# The single URL's result is the first entry of the returned collection.
filing = documents[0]
print(f"📄 Content: {len(filing.content)} characters")
print(f"📊 Tables: {len(filing.tables)} financial tables")

Processing Modes & Options

Performance Optimization

from cerevox import Lexa, ProcessingMode

client = Lexa()

# DEFAULT mode: fast and efficient; recommended for most use cases.
documents = client.parse("standard-document.pdf", mode=ProcessingMode.DEFAULT)
print("✅ Fast processing complete")

# ADVANCED mode: for complex documents that require maximum accuracy.
documents = client.parse("complex-report.pdf", mode=ProcessingMode.ADVANCED)
print("✅ Advanced processing complete")

Error Handling

from cerevox import Lexa, LexaError

client = Lexa()

def safe_parse(files):
    """Parse ``files`` with the module-level ``client``, reporting errors instead of raising.

    Args:
        files: Passed through unchanged to ``client.parse``.

    Returns:
        The result of ``client.parse(files)`` on success, or ``None`` when a
        ``LexaError`` or any other ``Exception`` is raised.
    """
    try:
        parsed = client.parse(files)
        print(f"✅ Successfully parsed {len(parsed)} documents")
    except LexaError as err:
        print(f"❌ Lexa API error: {err.message}")
        # Match on the lowercased message to suggest a likely remedy.
        reason = err.message.lower()
        if "authentication" in reason:
            print("💡 Check your API key")
        elif "timeout" in reason:
            print("💡 Try smaller batches or increase timeout")
        return None
    except Exception as err:
        # Anything that is not an SDK error is reported generically.
        print(f"❌ Unexpected error: {err}")
        return None

    return parsed

# Use with any files
documents = safe_parse(["document1.pdf", "document2.docx"])

Ready for more? Check out async processing for handling multiple documents concurrently, or cloud integrations for S3, SharePoint, and more.