Convert PDF pages to editable Word documents (.docx) while preserving layout structure.
Convert PDF pages to editable Word documents while preserving layout structure.
# Step 1: run the page-extraction script on the PDF. The positional `1` is
# presumably the page number and `-o` the output directory — confirm against
# the script's own --help.
python scripts/extract_pdf_page.py /path/to/document.pdf 1 -o /output/dir
# Step 2: build output.docx from the page text file, passing document metadata.
# NOTE(review): /output/dir/page1_text.txt is presumably produced by step 1 — verify.
python scripts/create_two_column_docx.py /output/dir/page1_text.txt output.docx \
--title "Document Title" \
--author "Author Name" \
--page-number 1 \
--total-pages 8
When scripts don't match the exact layout needed, follow this manual process:
import pdfplumber
# Render the first page of the PDF (resolution=200) and save it as a PNG.
with pdfplumber.open("document.pdf") as source_pdf:
    first_page = source_pdf.pages[0]  # pages is 0-indexed, so [0] is the first page
    rendered = first_page.to_image(resolution=200)
    rendered.original.save("page1.png", "PNG")
# Run OCR on the rendered page image; `-l eng` selects the English language
# data and `page1_text` is the output base name.
tesseract page1.png page1_text -l eng
Review the extracted page image to understand the page's layout, then recreate that layout with python-docx:
from docx import Document
from docx.shared import Pt, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
# Create a new document to build the page in.
doc = Document()

# Give every section the same 1.5 cm margin on all four sides.
page_margin = Cm(1.5)
for sect in doc.sections:
    for side in ("top", "bottom", "left", "right"):
        setattr(sect, f"{side}_margin", page_margin)

# Emulate a two-column page with a single-row, two-cell borderless table.
table = doc.add_table(rows=1, cols=2)
# Remove borders from cells
def remove_borders(cell):
    """Hide the four outer borders of a table cell.

    Writes a ``w:tcBorders`` element with every edge set to ``nil`` into the
    cell's ``w:tcPr`` properties element.

    Args:
        cell: A python-docx table cell (an object exposing ``_tc``).

    Calling this more than once on the same cell is safe: an existing
    ``w:tcBorders`` element is replaced rather than duplicated.
    """
    tcPr = cell._tc.get_or_add_tcPr()

    # Drop any previous border definition so only one w:tcBorders is emitted.
    tcPr.remove_all('w:tcBorders')

    tcBorders = OxmlElement('w:tcBorders')
    # Children must follow the schema sequence: top, left, bottom, right.
    for edge in ('top', 'left', 'bottom', 'right'):
        el = OxmlElement(f'w:{edge}')
        el.set(qn('w:val'), 'nil')
        tcBorders.append(el)

    # w:tcPr children are order-sensitive too: w:tcBorders must precede the
    # elements listed below. Insert ahead of the first one present; when none
    # exists this appends, matching the previous behaviour.
    tcPr.insert_element_before(
        tcBorders,
        'w:shd', 'w:noWrap', 'w:tcMar', 'w:textDirection', 'w:tcFitText',
        'w:vAlign', 'w:hideMark', 'w:headers', 'w:cellIns', 'w:cellDel',
        'w:cellMerge', 'w:tcPrChange',
    )
# Strip the borders from both cells and give each column a fixed width.
row_cells = table.rows[0].cells
for cell in row_cells:
    remove_borders(cell)
    cell.width = Cm(8.5)

# Add content to left column
left_cell = row_cells[0]
# A newly created cell already contains one empty paragraph. Write into it
# rather than calling add_paragraph(), which would leave a blank line above
# the content. Use left_cell.add_paragraph() for any further paragraphs.
p = left_cell.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
run = p.add_run("Content here...")
run.font.size = Pt(9)

doc.save("output.docx")
Placeholder for figures that are not reproduced: [Figure X - See original PDF]

Required:
Tesseract OCR (install with `brew install tesseract`)

Install Python packages:
# Install the Python packages used by the snippets in this guide.
pip install pdfplumber pillow python-docx
# Or use uvx, passing the same packages with --with:
uvx --with pdfplumber --with pillow --with python-docx python script.py