# PDF 处理指南
## 概览
本指南介绍如何使用 Python 库和命令行工具完成基本的 PDF 处理操作。有关高级功能、JavaScript 库和详细示例,请参阅 REFERENCE.md。如果您需要填写 PDF 表单,请阅读 FORMS.md 并按其中的说明操作。
## 快速入门
python
from pypdf import PdfReader, PdfWriter

# Open the PDF and report how many pages it contains.
reader = PdfReader("document.pdf")
print(f"页数: {len(reader.pages)}")

# Gather the text of every page. Joining once avoids growing a string
# with += on each loop iteration.
text = "".join(page.extract_text() for page in reader.pages)
## Python 库
### pypdf - 基础操作
#### 合并 PDF
python
from pypdf import PdfWriter, PdfReader

# Copy every page of each input file, in list order, into one writer.
writer = PdfWriter()
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
    reader = PdfReader(pdf_file)
    for page in reader.pages:
        writer.add_page(page)

# PDF output is binary, so the target file is opened in "wb" mode.
with open("merged.pdf", "wb") as output:
    writer.write(output)
#### 拆分 PDF
python
# Write each page of the input document to its own single-page PDF.
reader = PdfReader("input.pdf")
for i, page in enumerate(reader.pages):
    # A fresh writer per page keeps every output file to exactly one page.
    writer = PdfWriter()
    writer.add_page(page)
    # enumerate() counts from 0; the file names are numbered from 1.
    with open(f"page_{i+1}.pdf", "wb") as output:
        writer.write(output)
#### 提取元数据
python
# Print the document-information fields of a PDF.
reader = PdfReader("document.pdf")
meta = reader.metadata
if meta is None:
    # pypdf yields None when the file carries no document-information
    # dictionary; reading .title on None would raise AttributeError.
    print("未找到元数据")
else:
    print(f"标题: {meta.title}")
    print(f"作者: {meta.author}")
    print(f"主题: {meta.subject}")
    print(f"创建者: {meta.creator}")
#### 旋转页面
python
# Rotate the first page of the input and save it as a new one-page PDF.
reader = PdfReader("input.pdf")
writer = PdfWriter()

page = reader.pages[0]
page.rotate(90)  # rotate 90 degrees clockwise
writer.add_page(page)

with open("rotated.pdf", "wb") as output:
    writer.write(output)
### pdfplumber - 文本与表格提取
#### 提取带布局的文本
python
import pdfplumber

# Print the extracted text of every page; the context manager closes the
# PDF when the block exits.
with pdfplumber.open("document.pdf") as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        print(text)
#### 提取表格
python
# Print every table found on every page, row by row.
with pdfplumber.open("document.pdf") as pdf:
    for i, page in enumerate(pdf.pages):
        tables = page.extract_tables()
        for j, table in enumerate(tables):
            # Both counters start at 0; the heading shows 1-based numbers.
            print(f"第 {i+1} 页的第 {j+1} 个表格:")
            for row in table:
                print(row)
#### 高级表格提取
python
import pandas as pd

# Convert every non-empty table in the document into a DataFrame.
with pdfplumber.open("document.pdf") as pdf:
    all_tables = []
    for page in pdf.pages:
        tables = page.extract_tables()
        for table in tables:
            if table:  # skip empty tables
                # The first row supplies the column names; the remaining
                # rows are the data.
                df = pd.DataFrame(table[1:], columns=table[0])
                # Keep the frame; otherwise all_tables would stay empty.
                all_tables.append(df)
数据来源:claude-code-templates(MIT),中文翻译由 AI 生成。详见关于我们。