Python document processing library for parsing PDF, DOCX, and 10+ formats with advanced layout understanding, unified document representation, and AI ecosystem integrations (LangChain, LlamaIndex, and Model Context Protocol servers).
Docling is a powerful Python library developed by IBM Research that simplifies document processing for generative AI applications. With 48,400+ GitHub stars and 159 contributors, Docling excels at parsing diverse document formats—including advanced PDF understanding with layout analysis—and provides seamless integrations with AI frameworks like LangChain, LlamaIndex, and Model Context Protocol (MCP) servers.
The langchain-docling package provides document loaders for LangChain. Processing pipeline: Input Document → Format Detection → Backend Selection → Pipeline Execution → Docling Document
↓
[Export] → Markdown, HTML, JSON, Text
[Serialize] → Chunking, Embedding
# Standard installation
pip install docling
# Verify installation
python -c "import docling; print(docling.__version__)"
# Prefetch models
# (downloads the layout/TableFormer/OCR weights so the first conversion
# can run fully offline)
docling-tools models download
# Or specify artifacts path
# (points the converter at an existing local model cache)
export DOCLING_ARTIFACTS_PATH=/path/to/models
# Download custom HuggingFace models
docling-tools download-hf-repo --repo-id <model_id>
from docling.document_converter import DocumentConverter

# A default converter auto-detects the input format from the file.
doc_converter = DocumentConverter()

# Run the full conversion pipeline on a local document.
conversion = doc_converter.convert("document.pdf")

# Markdown export: human- and LLM-friendly text.
md_text = conversion.document.export_to_markdown()
print(md_text)

# HTML export for web rendering.
html_text = conversion.document.export_to_html()

# Dict export keeps the complete document structure (lossless).
doc_dict = conversion.document.export_to_dict()
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
)

# Configure PDF processing: recover table structure with TableFormer in its
# slower-but-better ACCURATE mode, and run OCR for scanned pages.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
pipeline_options.do_ocr = True

# FIX: DocumentConverter has no bare `pipeline_options` keyword; pipeline
# options are attached per input format through `format_options`.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)

# Convert with advanced features
result = converter.convert("complex_report.pdf")

# Access structured content. Table/picture items are exported through the
# DoclingDocument item API, which takes the parent document as argument.
for table in result.document.tables:
    print(f"Table: {table.export_to_markdown(doc=result.document)}")
for figure in result.document.pictures:
    print(f"Figure caption: {figure.caption_text(doc=result.document)}")
from docling.document_converter import DocumentConverter

# Configure resource constraints
converter = DocumentConverter(
    max_file_size=50_000_000,  # 50 MB limit
    max_num_pages=100,         # First 100 pages only
)

# Process multiple documents of mixed formats.
documents = ["doc1.pdf", "doc2.docx", "doc3.xlsx"]
for doc_path in documents:
    try:
        result = converter.convert(doc_path)
        # FIX: the original used doc_path.replace(".pdf", ".md"), which
        # leaves non-PDF inputs unchanged — writing to "doc2.docx" would
        # overwrite the source document with markdown. Strip whatever
        # extension is actually present instead.
        output_path = doc_path.rsplit(".", 1)[0] + ".md"
        with open(output_path, "w") as f:
            f.write(result.document.export_to_markdown())
        print(f"✅ Converted: {doc_path}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the rest.
        print(f"❌ Failed: {doc_path} - {e}")
from langchain_docling import DoclingLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

# Load documents with Docling.
# NOTE(review): recent langchain-docling releases take an ExportType enum
# (ExportType.MARKDOWN / ExportType.DOC_CHUNKS) rather than a plain string —
# confirm against the installed version.
loader = DoclingLoader(
    file_path="technical_manual.pdf",
    export_type="markdown"  # or "json" for lossless
)
documents = loader.load()

# Split documents into retrieval-sized, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)
splits = text_splitter.split_documents(documents)

# Create vector store
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(splits, embeddings)

# Query the documents.
# FIX: Retriever.get_relevant_documents() is deprecated (LangChain 0.1.46+);
# retrievers are Runnables and are queried with .invoke().
retriever = vectorstore.as_retriever()
relevant_docs = retriever.invoke("What is the installation process?")
for doc in relevant_docs:
    print(doc.page_content)
from llama_index.readers.docling import DoclingReader
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.core import VectorStoreIndex
# Load documents with Docling Reader
# NOTE(review): current llama-index-readers-docling exposes export_type as
# an enum (DoclingReader.ExportType.JSON) — confirm a plain "json" string
# is accepted by the installed version.
reader = DoclingReader(export_type="json") # Lossless serialization
documents = reader.load_data(file_path="research_paper.pdf")
# Parse into nodes
# DoclingNodeParser understands Docling's JSON serialization and yields
# nodes that retain document-structure metadata.
node_parser = DoclingNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)
# Build index
index = VectorStoreIndex(nodes)
# Query
query_engine = index.as_query_engine()
response = query_engine.query("Summarize the methodology section")
print(response)
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.backend.pdf_backend import PyPdfiumBackend
# NOTE(review): current docling ships PyPdfiumDocumentBackend in
# docling.backend.pypdfium2_backend — confirm this import path and name.
import os
# Configure threading (for performance)
# Must be set before model inference starts to take effect.
os.environ["OMP_NUM_THREADS"] = "8" # Use 8 CPU threads
# Custom pipeline options
pipeline_options = PdfPipelineOptions()
pipeline_options.do_cell_matching = True # Enable table cell matching
pipeline_options.generate_page_images = True # Extract page images
pipeline_options.generate_picture_images = True # Extract figures
# Use specific backend
# NOTE(review): allowed_formats/format_options normally take InputFormat
# enum keys and PdfFormatOption values (wrapping the backend class), not
# plain strings / bare backend classes — confirm against the installed API.
converter = DocumentConverter(
allowed_formats=["pdf"],
format_options={"pdf": PyPdfiumBackend}
)
# Convert with custom settings
# NOTE(review): convert() may not accept a pipeline_options kwarg; options
# are usually bound at converter construction via format_options — confirm.
result = converter.convert(
"scientific_paper.pdf",
pipeline_options=pipeline_options
)
# Save extracted images
# NOTE(review): picture items typically expose the PIL image via
# picture.image.pil_image — confirm .save() exists on the item itself.
for i, image in enumerate(result.document.pictures):
image.save(f"figure_{i}.png")
from docling.document_converter import DocumentConverter, DocumentStream
# NOTE(review): in current docling, DocumentStream is declared in
# docling.datamodel.base_models — confirm this re-export exists.
from io import BytesIO
# Load PDF as binary stream
with open("document.pdf", "rb") as f:
pdf_bytes = BytesIO(f.read())
# Create document stream
# `name` supplies the filename the converter uses for format detection.
doc_stream = DocumentStream(
name="document.pdf",
stream=pdf_bytes
)
# Convert from stream
converter = DocumentConverter()
result = converter.convert(doc_stream)
markdown = result.document.export_to_markdown()
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

# IMPORTANT: Explicit opt-in required for remote services
# Main purpose of Docling is local execution
pipeline_options = PdfPipelineOptions()
pipeline_options.enable_remote_services = True  # Explicit consent

# Configure cloud OCR (if needed).
# FIX: DocumentConverter has no `pipeline_options` keyword; per-format
# pipeline options are passed through `format_options`.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)

# Process document (may use cloud services)
result = converter.convert("scanned_document.pdf")
from docling.datamodel.pipeline_options import TableFormerMode
# Assumes a PdfPipelineOptions instance named `pipeline_options` already
# exists (see the configuration examples above); these two assignments are
# mutually exclusive alternatives.
# FAST mode (faster processing)
pipeline_options.table_structure_options.mode = TableFormerMode.FAST
# ACCURATE mode (better quality)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
Trade-offs:
Markdown Export:
# With embedded images
# NOTE(review): current docling takes an ImageRefMode enum
# (image_mode=ImageRefMode.EMBEDDED / .REFERENCED) rather than a plain
# string — confirm against the installed version.
markdown = result.document.export_to_markdown(image_mode="embedded")
# With image references
markdown = result.document.export_to_markdown(image_mode="referenced")
HTML Export:
# With custom CSS
# NOTE(review): confirm that export_to_html() accepts include_styles /
# custom_css keywords on the installed docling version.
html = result.document.export_to_html(
include_styles=True,
custom_css="body { font-family: Arial; }"
)
JSON Export (Lossless):
# Complete document structure
# export_to_dict() is the lossless form; json.dumps(json_data) serializes
# it for storage or interchange.
json_data = result.document.export_to_dict()
# Includes:
# - Full layout information
# - Reading order
# - Bounding boxes
# - Confidence scores
# - Metadata
from docling.chunking import HybridChunker
# Configure chunker
# NOTE(review): current HybridChunker is tokenizer-aware and takes
# tokenizer / max_tokens / merge_peers; confirm chunk_size, chunk_overlap
# and respect_boundaries are accepted by the installed version.
chunker = HybridChunker(
chunk_size=1000, # Target chunk size
chunk_overlap=200, # Overlap between chunks
respect_boundaries=True # Respect document structure
)
# Chunk document
chunks = chunker.chunk(result.document)
for i, chunk in enumerate(chunks):
print(f"Chunk {i}:")
print(chunk.text)
# NOTE(review): docling chunks expose metadata as `chunk.meta` — confirm
# a `metadata` attribute exists here.
print(f"Metadata: {chunk.metadata}")
# Access confidence scores for extracted content
# NOTE(review): DoclingDocument exposes texts/tables/pictures and
# iterate_items(); confirm an `elements` attribute exists.
for element in result.document.elements:
if hasattr(element, 'confidence'):
print(f"Element: {element.text[:50]}")
print(f"Confidence: {element.confidence}")
Docling provides a Model Context Protocol (MCP) server for integration with agentic applications like Claude Desktop.
# Install MCP server
# NOTE(review): confirm the package name — the official Docling MCP
# package is published on PyPI as "docling-mcp".
pip install docling-mcp-server
# Start server
docling-mcp-server --port 3000
{
"mcpServers": {
"docling": {
"command": "docling-mcp-server",
"args": ["--port", "3000"],
"env": {
"DOCLING_ARTIFACTS_PATH": "/path/to/models"
}
}
}
}
# Set number of threads (default: 4)
# OMP_NUM_THREADS bounds the CPU threads used by model inference.
export OMP_NUM_THREADS=8
# Or in Python
# (must be set before docling / its model runtime is imported to take effect)
import os
os.environ["OMP_NUM_THREADS"] = "8"
# Process documents in batches to manage memory
import gc


def process_batch(file_paths, batch_size=10, *, handler=None):
    """Convert documents in fixed-size batches to bound peak memory use.

    Args:
        file_paths: Paths of the documents to convert.
        batch_size: Number of documents converted between gc passes.
        handler: Optional callable invoked with each conversion result.
            When omitted, results are discarded (original behavior).
    """
    converter = DocumentConverter()
    for start in range(0, len(file_paths), batch_size):
        for file_path in file_paths[start:start + batch_size]:
            result = converter.convert(file_path)
            if handler is not None:
                handler(result)
        # Encourage release of per-batch allocations (page images, model
        # buffers) before starting the next batch. Import hoisted out of
        # the loop — the original re-imported gc every batch.
        gc.collect()
# Download all models in advance
docling-tools models download
# Verify models
# (default cache location; overridden by DOCLING_ARTIFACTS_PATH)
ls $HOME/.cache/docling/models
| Format | Extension | Notes |
|---|---|---|
| PDF | .pdf | Advanced layout understanding, table extraction |
| Microsoft Word | .docx | Office 2007+ (Open XML) |
| Excel | .xlsx | Spreadsheet data extraction |
| PowerPoint | .pptx | Slide content and structure |
| HTML | .html, .xhtml | Web page content |
| Markdown | .md | Plain text markup |
| AsciiDoc | .adoc, .asciidoc | Technical documentation |
| CSV | .csv | Tabular data |
| Images | .png, .jpg, .tiff, .bmp, .webp | OCR processing |
| USPTO XML | .xml | Patent documents |
| JATS XML | .xml | Journal articles |
| WebVTT | .vtt | Video subtitle files |
| Format | Use Case | Lossless |
|---|---|---|
| Markdown | Human-readable, AI-friendly | No |
| HTML | Web rendering | No |
| JSON | Complete structure preservation | Yes |
| Plain Text | Simple text extraction | No |
| Doctags | Layout-aware markup | Partial |
from docling.document_converter import DocumentConverter

# RAG ingestion: convert a whitepaper and persist it as Markdown,
# which keeps headings and tables in a retriever-friendly form.
doc_converter = DocumentConverter()
conversion = doc_converter.convert("whitepaper.pdf")

# Export to Markdown
md_text = conversion.document.export_to_markdown()

# Save for RAG ingestion
with open("whitepaper.md", "w") as out:
    out.write(md_text)
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert("financial_report.xlsx")
# Extract all tables
# NOTE(review): in current docling, table items are exported with
# table.export_to_markdown(doc=result.document) and
# table.export_to_dataframe() — confirm to_markdown()/to_dataframe()
# exist on the installed version.
for table in result.document.tables:
print(table.to_markdown())
# Or: table.to_dataframe() for pandas integration
from docling.document_converter import DocumentConverter
from pathlib import Path

converter = DocumentConverter()

# Process every supported document at the top level of a directory.
input_dir = Path("documents/")
output_dir = Path("processed/")
# FIX: ensure the target directory exists before writing into it.
output_dir.mkdir(parents=True, exist_ok=True)

SUPPORTED_SUFFIXES = {".pdf", ".docx", ".xlsx", ".pptx"}
for file_path in input_dir.glob("*"):
    # Compare case-insensitively so "REPORT.PDF" is picked up as well.
    if file_path.suffix.lower() in SUPPORTED_SUFFIXES:
        result = converter.convert(str(file_path))
        output_file = output_dir / f"{file_path.stem}.md"
        output_file.write_text(result.document.export_to_markdown())
# Manually download models
docling-tools models download
# Check model cache
ls ~/.cache/docling/models
# Set custom cache location
# (DOCLING_ARTIFACTS_PATH makes the converter load models from this path)
export DOCLING_ARTIFACTS_PATH=/custom/path
# Limit pages processed
# (two alternative constructions — pick one; each caps resource usage)
converter = DocumentConverter(max_num_pages=50)
# Or limit file size
converter = DocumentConverter(max_file_size=20_000_000) # 20 MB
# Ensure OCR is enabled for scanned / image-only PDFs.
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
# FIX: pipeline options are supplied per input format via `format_options`;
# DocumentConverter has no `pipeline_options` keyword.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
    }
)
# Try ACCURATE mode
# Assumes `pipeline_options` (PdfPipelineOptions) and TableFormerMode are
# already in scope from the earlier configuration examples.
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE
# Enable cell matching
pipeline_options.do_cell_matching = True
# MUST explicitly enable
# (remote/cloud services stay off unless this consent flag is set)
pipeline_options.enable_remote_services = True
Only use remote services when:
Related packages: langchain-docling (LangChain integration); llama-index-readers-docling and llama-index-node-parser-docling (LlamaIndex integration); docling-mcp-server (MCP server). Use the Docling skill when:
Skill Type: Document Processing Library Complexity Level: Intermediate to Advanced Maintenance Status: ✅ Active (v2.66.0, December 2025) Community Health: ✅ Excellent (48.4k stars, 159 contributors)