Core Parsing Methods

parse()

Parse local files, file-like objects, or raw bytes content.
def parse(
    files: Union[str, Path, bytes, BinaryIO, List[Union[str, Path, bytes, BinaryIO]]],
    mode: ProcessingMode = ProcessingMode.DEFAULT,
    progress_callback: Optional[Callable[[JobStatus], None]] = None,
    timeout: float = 60.0,
    poll_interval: float = 2.0
) -> DocumentBatch
Parameters:
files
Union[str, Path, bytes, BinaryIO, List[...]]
required
Files to parse. Can be:
  • File path (string or Path object)
  • Raw bytes content
  • File-like object (BinaryIO)
  • List of any of the above
mode
ProcessingMode
default:"ProcessingMode.DEFAULT"
Processing mode: DEFAULT or ADVANCED
progress_callback
Optional[Callable]
default:"None"
Callback function that receives a JobStatus, used to monitor parsing progress
timeout
float
default:"60.0"
Maximum time to wait for parsing completion (seconds)
poll_interval
float
default:"2.0"
Interval between status checks (seconds)
Returns: DocumentBatch - Collection of parsed documents

parse_urls()

Parse documents from URLs.
def parse_urls(
    urls: Union[str, List[str]],
    mode: ProcessingMode = ProcessingMode.DEFAULT,
    progress_callback: Optional[Callable[[JobStatus], None]] = None,
    timeout: float = 120.0,
    poll_interval: float = 2.0
) -> DocumentBatch
Parameters:
urls
Union[str, List[str]]
required
URLs to parse. Can be a single URL string or list of URLs
mode
ProcessingMode
default:"ProcessingMode.DEFAULT"
Processing mode: DEFAULT or ADVANCED
progress_callback
Optional[Callable]
default:"None"
Callback function that receives a JobStatus, used to monitor parsing progress
timeout
float
default:"120.0"
Maximum time to wait for parsing completion (seconds)
poll_interval
float
default:"2.0"
Interval between status checks (seconds)
Returns: DocumentBatch - Collection of parsed documents

get_job_status()

Get the current status of a parsing job.
def get_job_status(job_id: str) -> JobStatus
Parameters:
job_id
str
required
The job ID to check status for
Returns: JobStatus - Current job status information

Amazon S3 Methods

list_s3_buckets()

List available S3 buckets.
def list_s3_buckets() -> S3BucketList
Returns: S3BucketList - List of available S3 buckets

list_s3_folder()

List contents of an S3 folder.
def list_s3_folder(
    bucket: str,
    folder_path: str = "",
    max_items: int = 1000
) -> S3FolderContents
Parameters:
bucket
str
required
S3 bucket name
folder_path
str
default:""
Path within the bucket (empty for root)
max_items
int
default:"1000"
Maximum number of items to return
Returns: S3FolderContents - Contents of the S3 folder

parse_s3_folder()

Parse all documents in an S3 folder.
def parse_s3_folder(
    bucket: str,
    folder_path: str = "",
    mode: ProcessingMode = ProcessingMode.DEFAULT,
    progress_callback: Optional[Callable[[JobStatus], None]] = None,
    timeout: float = 300.0,
    poll_interval: float = 5.0
) -> DocumentBatch
Parameters:
bucket
str
required
S3 bucket name
folder_path
str
default:""
Path within the bucket to parse
mode
ProcessingMode
default:"ProcessingMode.DEFAULT"
Processing mode: DEFAULT or ADVANCED
progress_callback
Optional[Callable]
default:"None"
Callback function that receives a JobStatus, used to monitor parsing progress
timeout
float
default:"300.0"
Maximum time to wait for parsing completion (seconds)
poll_interval
float
default:"5.0"
Interval between status checks (seconds)
Returns: DocumentBatch - Collection of parsed documents

Microsoft SharePoint Methods

list_sharepoint_sites()

List available SharePoint sites.
def list_sharepoint_sites() -> SharePointSiteList
Returns: SharePointSiteList - List of available SharePoint sites

list_sharepoint_drives()

List drives in a SharePoint site.
def list_sharepoint_drives(site_id: str) -> SharePointDriveList
Parameters:
site_id
str
required
SharePoint site ID
Returns: SharePointDriveList - List of drives in the site

parse_sharepoint_folder()

Parse documents in a SharePoint folder.
def parse_sharepoint_folder(
    site_id: str,
    drive_id: str,
    folder_path: str = "",
    mode: ProcessingMode = ProcessingMode.DEFAULT,
    progress_callback: Optional[Callable[[JobStatus], None]] = None,
    timeout: float = 300.0,
    poll_interval: float = 5.0
) -> DocumentBatch
Parameters:
site_id
str
required
SharePoint site ID
drive_id
str
required
SharePoint drive ID
folder_path
str
default:""
Path within the drive to parse
mode
ProcessingMode
default:"ProcessingMode.DEFAULT"
Processing mode: DEFAULT or ADVANCED
progress_callback
Optional[Callable]
default:"None"
Callback function that receives a JobStatus, used to monitor parsing progress
timeout
float
default:"300.0"
Maximum time to wait for parsing completion (seconds)
poll_interval
float
default:"5.0"
Interval between status checks (seconds)
Returns: DocumentBatch - Collection of parsed documents

Box Methods

list_box_folders()

List folders in Box.
def list_box_folders(parent_folder_id: str = "0") -> BoxFolderList
Parameters:
parent_folder_id
str
default:"0"
Parent folder ID ("0" for the root folder)
Returns: BoxFolderList - List of folders

parse_box_folder()

Parse documents in a Box folder.
def parse_box_folder(
    folder_id: str,
    mode: ProcessingMode = ProcessingMode.DEFAULT,
    progress_callback: Optional[Callable[[JobStatus], None]] = None,
    timeout: float = 300.0,
    poll_interval: float = 5.0
) -> DocumentBatch
Parameters:
folder_id
str
required
Box folder ID to parse
mode
ProcessingMode
default:"ProcessingMode.DEFAULT"
Processing mode: DEFAULT or ADVANCED
progress_callback
Optional[Callable]
default:"None"
Callback function that receives a JobStatus, used to monitor parsing progress
timeout
float
default:"300.0"
Maximum time to wait for parsing completion (seconds)
poll_interval
float
default:"5.0"
Interval between status checks (seconds)
Returns: DocumentBatch - Collection of parsed documents

Dropbox Methods

list_dropbox_folders()

List folders in Dropbox.
def list_dropbox_folders(folder_path: str = "") -> DropboxFolderList
Parameters:
folder_path
str
default:""
Dropbox folder path (empty for root)
Returns: DropboxFolderList - List of folders

parse_dropbox_folder()

Parse documents in a Dropbox folder.
def parse_dropbox_folder(
    folder_path: str,
    mode: ProcessingMode = ProcessingMode.DEFAULT,
    progress_callback: Optional[Callable[[JobStatus], None]] = None,
    timeout: float = 300.0,
    poll_interval: float = 5.0
) -> DocumentBatch
Parameters:
folder_path
str
required
Dropbox folder path to parse
mode
ProcessingMode
default:"ProcessingMode.DEFAULT"
Processing mode: DEFAULT or ADVANCED
progress_callback
Optional[Callable]
default:"None"
Callback function that receives a JobStatus, used to monitor parsing progress
timeout
float
default:"300.0"
Maximum time to wait for parsing completion (seconds)
poll_interval
float
default:"5.0"
Interval between status checks (seconds)
Returns: DocumentBatch - Collection of parsed documents

Async Methods

Every method is also available as an async version on the AsyncLexa client:
import asyncio
from cerevox import AsyncLexa, ProcessingMode

async def main():
    """Parse one document, then issue three parse calls concurrently.

    Returns a single flat list built by iterating each batch that the
    concurrent calls produced.
    """
    async with AsyncLexa(api_key="your-api-key") as client:
        # Any client method can be awaited directly.
        documents = await client.parse(["document.pdf"])

        # Create the coroutines up front, then let gather() run them together.
        pending = (
            client.parse(["doc1.pdf"]),
            client.parse(["doc2.pdf"]),
            client.parse_urls(["https://example.com/doc.pdf"]),
        )
        batches = await asyncio.gather(*pending)

        # Flatten the per-call batches into one list.
        flattened = []
        for batch in batches:
            flattened.extend(batch)
        return flattened

asyncio.run(main())

Next Steps

Learn about error handling and client configuration for production use.