Copy and run this code to parse your first document:
Copy
from cerevox import Lexa

# Initialize client (uses CEREVOX_API_KEY from environment)
client = Lexa()

# Parse a local file - replace with your file path
documents = client.parse(["path/to/your/document.pdf"])

# See what you got back
doc = documents[0]
print(f"✅ Success! Extracted {len(doc.content)} characters")
print(f"📊 Found {len(doc.tables)} tables")
print(f"📄 Content preview: {doc.content[:200]}...")

# That's it! Your document is now structured data
Copy
from cerevox import Lexa

# Initialize client (uses CEREVOX_API_KEY from environment)
client = Lexa()

# Parse a local file - replace with your file path
documents = client.parse(["path/to/your/document.pdf"])

# See what you got back
doc = documents[0]
print(f"✅ Success! Extracted {len(doc.content)} characters")
print(f"📊 Found {len(doc.tables)} tables")
print(f"📄 Content preview: {doc.content[:200]}...")

# That's it! Your document is now structured data
Copy
from cerevox import Lexa

# Parse raw text content (perfect for testing)
client = Lexa()
content = b"Invoice #12345\nTotal: $1,299.99\nVendor: Acme Corp"
documents = client.parse(content)

doc = documents[0]
print(f"✅ Parsed: {doc.content}")
# Returns: "Invoice #12345\nTotal: $1,299.99\nVendor: Acme Corp"
# Ready for your application!
Copy
from cerevox import Lexa

client = Lexa()

# Parse documents directly from URLs
documents = client.parse_urls([
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
])

print(f"✅ Parsed {len(documents)} documents from URL")
print(f"📄 Content: {documents[0].content[:100]}...")
Run this verification script to confirm everything works:
Copy
from cerevox import Lexa, LexaError


def verify_setup():
    """Confirm the Lexa client can authenticate and parse a sample document.

    Returns:
        True when a test parse succeeds and yields at least one document,
        False on any API error, unexpected exception, or empty result.
    """
    try:
        client = Lexa()
        # Quick test with sample content
        test_content = b"Test document for Lexa API verification."
        documents = client.parse(test_content)
        if documents and len(documents) > 0:
            print("🎉 Perfect! Lexa is working correctly.")
            print(f"📄 Test result: {documents[0].content}")
            return True
        # Parse succeeded but returned nothing — treat as a failed check
        # (the original fell through and implicitly returned None here).
        return False
    except LexaError as e:
        print(f"❌ API Error: {e.message}")
        print("💡 Check your API key and try again")
        return False
    except Exception as e:
        print(f"❌ Error: {e}")
        return False


# Run verification
if verify_setup():
    print("\n✅ You're ready to parse documents with Lexa!")
You’re Ready! 🎉 Lexa is configured and working. Start parsing your documents!
import asyncio

from cerevox import AsyncLexa


async def parse_multiple():
    """Parse several files concurrently and produce embedding-ready chunks."""
    async with AsyncLexa() as client:
        # Process multiple files concurrently - much faster!
        documents = await client.parse([
            "report1.pdf",
            "report2.docx",
            "data.xlsx",
        ])
        print(f"✅ Processed {len(documents)} documents")

        # Get vector DB ready chunks
        chunks = documents.get_all_text_chunks(target_size=500)
        print(f"🔗 Ready for embedding: {len(chunks)} chunks")


asyncio.run(parse_multiple())
Cloud Storage Integration
Copy
# Parse from Amazon S3 (assumes an initialized `client = Lexa()` in scope)
documents = client.parse_s3_folder(
    bucket_name="my-documents",
    folder_path="invoices/",
)

# Parse from SharePoint
documents = client.parse_sharepoint_folder(
    site_id="your-site-id",
    drive_id="your-drive-id",
    folder_path="Documents",
)

print(f"✅ Processed {len(documents)} documents from cloud storage")
Vector Database Ready
Copy
# Parse and get RAG-optimized chunks (assumes `client = Lexa()` in scope)
documents = client.parse(["document.pdf"])
chunks = documents.get_all_text_chunks(
    target_size=500,        # Perfect for most embeddings
    overlap_size=50,        # Prevents context loss
    include_metadata=True,  # Rich metadata included
)

# Each chunk is ready for your vector database
for chunk in chunks[:3]:
    print(f"Chunk: {chunk.content[:100]}...")
    print(f"Metadata: {chunk.metadata}")
Copy and run this code to parse your first document:
Copy
from cerevox import Lexa

# Initialize client (uses CEREVOX_API_KEY from environment)
client = Lexa()

# Parse a local file - replace with your file path
documents = client.parse(["path/to/your/document.pdf"])

# See what you got back
doc = documents[0]
print(f"✅ Success! Extracted {len(doc.content)} characters")
print(f"📊 Found {len(doc.tables)} tables")
print(f"📄 Content preview: {doc.content[:200]}...")

# That's it! Your document is now structured data
Copy
from cerevox import Lexa

# Initialize client (uses CEREVOX_API_KEY from environment)
client = Lexa()

# Parse a local file - replace with your file path
documents = client.parse(["path/to/your/document.pdf"])

# See what you got back
doc = documents[0]
print(f"✅ Success! Extracted {len(doc.content)} characters")
print(f"📊 Found {len(doc.tables)} tables")
print(f"📄 Content preview: {doc.content[:200]}...")

# That's it! Your document is now structured data
Copy
from cerevox import Lexa

# Parse raw text content (perfect for testing)
client = Lexa()
content = b"Invoice #12345\nTotal: $1,299.99\nVendor: Acme Corp"
documents = client.parse(content)

doc = documents[0]
print(f"✅ Parsed: {doc.content}")
# Returns: "Invoice #12345\nTotal: $1,299.99\nVendor: Acme Corp"
# Ready for your application!
Copy
from cerevox import Lexa

client = Lexa()

# Parse documents directly from URLs
documents = client.parse_urls([
    "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
])

print(f"✅ Parsed {len(documents)} documents from URL")
print(f"📄 Content: {documents[0].content[:100]}...")
Run this verification script to confirm everything works:
Copy
from cerevox import Lexa, LexaError


def verify_setup():
    """Confirm the Lexa client can authenticate and parse a sample document.

    Returns:
        True when a test parse succeeds and yields at least one document,
        False on any API error, unexpected exception, or empty result.
    """
    try:
        client = Lexa()
        # Quick test with sample content
        test_content = b"Test document for Lexa API verification."
        documents = client.parse(test_content)
        if documents and len(documents) > 0:
            print("🎉 Perfect! Lexa is working correctly.")
            print(f"📄 Test result: {documents[0].content}")
            return True
        # Parse succeeded but returned nothing — treat as a failed check
        # (the original fell through and implicitly returned None here).
        return False
    except LexaError as e:
        print(f"❌ API Error: {e.message}")
        print("💡 Check your API key and try again")
        return False
    except Exception as e:
        print(f"❌ Error: {e}")
        return False


# Run verification
if verify_setup():
    print("\n✅ You're ready to parse documents with Lexa!")
You’re Ready! 🎉 Lexa is configured and working. Start parsing your documents!
import asyncio

from cerevox import AsyncLexa


async def parse_multiple():
    """Parse several files concurrently and produce embedding-ready chunks."""
    async with AsyncLexa() as client:
        # Process multiple files concurrently - much faster!
        documents = await client.parse([
            "report1.pdf",
            "report2.docx",
            "data.xlsx",
        ])
        print(f"✅ Processed {len(documents)} documents")

        # Get vector DB ready chunks
        chunks = documents.get_all_text_chunks(target_size=500)
        print(f"🔗 Ready for embedding: {len(chunks)} chunks")


asyncio.run(parse_multiple())
Cloud Storage Integration
Copy
# Parse from Amazon S3 (assumes an initialized `client = Lexa()` in scope)
documents = client.parse_s3_folder(
    bucket_name="my-documents",
    folder_path="invoices/",
)

# Parse from SharePoint
documents = client.parse_sharepoint_folder(
    site_id="your-site-id",
    drive_id="your-drive-id",
    folder_path="Documents",
)

print(f"✅ Processed {len(documents)} documents from cloud storage")
Vector Database Ready
Copy
# Parse and get RAG-optimized chunks (assumes `client = Lexa()` in scope)
documents = client.parse(["document.pdf"])
chunks = documents.get_all_text_chunks(
    target_size=500,        # Perfect for most embeddings
    overlap_size=50,        # Prevents context loss
    include_metadata=True,  # Rich metadata included
)

# Each chunk is ready for your vector database
for chunk in chunks[:3]:
    print(f"Chunk: {chunk.content[:100]}...")
    print(f"Metadata: {chunk.metadata}")