Examples
Code Examples
Copy-paste-ready examples with real data, not toy examples.
Quick Start Examples
Parse Your First Document
Copy
from cerevox import Lexa

# With no arguments the client uses CEREVOX_API_KEY from the environment.
client = Lexa()

# Send a financial document through the parser.
documents = client.parse("invoice.pdf")

# Look at the first parsed document - real output, not just {...}
doc = documents[0]
text = doc.content
print(f"✅ Extracted {len(text)} characters")
print(f"📊 Found {len(doc.tables)} tables")
print(f"💰 Content preview: {text[:200]}...")

# Example of the structured text that comes back:
# "Invoice #INV-2024-001
# Bill To: Acme Corporation
# Amount Due: $1,299.99
# Due Date: 2024-02-15"
Real-World Use Cases
Financial Document Processing
Copy
from cerevox import Lexa

client = Lexa()

# Parse a batch of financial documents in a single call
documents = client.parse([
    "invoices/q1-2024-invoices.pdf",
    "statements/bank-statement.pdf",
    "receipts/expense-receipts.xlsx",
])

# Extract key financial data
for doc in documents:
    # Lexa preserves financial formatting
    print(f"Document: {doc.title}")
    print(f"Tables found: {len(doc.tables)}")

    # Tables contain actual structured data; skip documents without any
    if doc.tables:
        table = doc.tables[0]  # First table
        # NOTE(review): assumes table.rows / table.columns are counts; if the
        # SDK exposes them as sequences, wrap them in len() - confirm.
        print(f"Financial data: {table.rows} rows x {table.columns} columns")
        # Each table has real data, not placeholders
        # Ready for accounting software integration
Research & Analysis
Copy
from cerevox import Lexa

client = Lexa()

# Parse research documents of several formats in one call
documents = client.parse([
    "papers/ai-research-2024.pdf",
    "reports/market-analysis.docx",
    "data/survey-results.xlsx",
])

# Get structured research data: one summary per parsed document
for doc in documents:
    print(f"📚 Paper: {doc.title}")
    print(f"📖 Content: {len(doc.content)} chars")
    print(f"📊 Data tables: {len(doc.tables)} tables")
    print(f"🖼️ Figures: {len(doc.images)} images")
    # All formatting and structure preserved
Vector Database Integration
RAG Application Ready
Copy
from cerevox import Lexa

client = Lexa()

# Parse documents for RAG
documents = client.parse([
    "knowledge-base/product-docs.pdf",
    "support/troubleshooting.docx",
])

# Get perfectly sized chunks
chunks = documents.get_all_text_chunks(
    target_size=500,        # Perfect for most embedding models
    overlap_size=50,        # Prevents context loss
    include_metadata=True,  # Rich metadata included
)
print(f"🔗 Ready for embedding: {len(chunks)} chunks")

# Each chunk is optimized for vector databases
for chunk in chunks[:2]:  # Show first 2
    print(f"\nChunk preview: {chunk.content[:100]}...")
    print(f"Metadata: page={chunk.page_number}, source={chunk.source_file}")
    # Rich metadata for better retrieval
Different Input Methods
File Processing
Copy
from cerevox import Lexa
from pathlib import Path

client = Lexa()

# Single file: the parse result is indexable, so take its first entry
parsed_report = client.parse("reports/annual-report.pdf")
doc = parsed_report[0]
print(f"✅ Parsed: {len(doc.content)} characters")

# Multiple files with Path objects: every PDF directly inside documents/
docs_folder = Path("documents")
pdf_files = [*docs_folder.glob("*.pdf")]
documents = client.parse(pdf_files)
pdf_count = len(documents)
print(f"✅ Processed {pdf_count} PDF files")

# Mixed file types - Lexa handles them all
mixed_files = [
    "data.xlsx",     # Excel spreadsheet
    "report.docx",   # Word document
    "slides.pptx",   # PowerPoint
    "data.csv",      # CSV file
    "webpage.html",  # HTML file
]
documents = client.parse(mixed_files)
mixed_count = len(documents)
print(f"✅ Processed {mixed_count} mixed format files")
URL Processing
Copy
from cerevox import Lexa

client = Lexa()

# Parse from web URLs
url = "https://www.sec.gov/Archives/edgar/data/320193/000032019323000077/aapl-20230930.htm"
documents = client.parse_urls(url)

# Plain string: this message has no placeholders, so no f-prefix is needed
print("✅ Downloaded and parsed SEC filing")
print(f"📄 Content: {len(documents[0].content)} characters")
print(f"📊 Tables: {len(documents[0].tables)} financial tables")
# Real SEC data, properly structured
Processing Modes & Options
Performance Optimization
Copy
from cerevox import Lexa, ProcessingMode

client = Lexa()

# Default mode - fast and efficient (recommended)
default_mode = ProcessingMode.DEFAULT  # Fast processing for most use cases
documents = client.parse("standard-document.pdf", mode=default_mode)
print("✅ Fast processing complete")

# Advanced mode - maximum accuracy
# Use for complex documents requiring maximum accuracy
advanced_mode = ProcessingMode.ADVANCED
documents = client.parse("complex-report.pdf", mode=advanced_mode)
print("✅ Advanced processing complete")
Error Handling
Copy
from cerevox import Lexa, LexaError

client = Lexa()


def safe_parse(files):
    """Parse *files* with the module-level client, reporting failures instead of raising.

    Args:
        files: Passed unchanged to ``client.parse``.

    Returns:
        The parsed documents on success, or ``None`` if any exception was
        raised while parsing.
    """
    try:
        documents = client.parse(files)
        print(f"✅ Successfully parsed {len(documents)} documents")
        return documents
    except LexaError as e:
        print(f"❌ Lexa API error: {e.message}")
        # Lower-case once so both hint checks are case-insensitive
        reason = e.message.lower()
        if "authentication" in reason:
            print("💡 Check your API key")
        elif "timeout" in reason:
            print("💡 Try smaller batches or increase timeout")
        return None
    except Exception as e:
        # Catch-all boundary for the example: report and signal failure
        print(f"❌ Unexpected error: {e}")
        return None


# Use with any files
documents = safe_parse(["document1.pdf", "document2.docx"])
Ready for more? Check out async processing for handling multiple documents concurrently, or cloud integrations for S3, SharePoint, and more.
Assistant
Responses are generated using AI and may contain mistakes.