Model Training and Fine-tuning
[ SKILL_DOCUMENTATION ]
# Training and Fine-tuning
## Overview
Fine-tune pretrained models on custom datasets with the Trainer API. Trainer handles the training loop, gradient accumulation, mixed precision, logging, and checkpointing.
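These features are switched on through configuration rather than custom code. As a rough illustration (the flag values below are arbitrary, not recommendations), the relevant TrainingArguments look like this:

```python
from transformers import TrainingArguments

# Illustrative flags only; pick values for your hardware and task.
args = TrainingArguments(
    output_dir="./results",
    fp16=True,                      # mixed precision (needs a CUDA GPU)
    gradient_accumulation_steps=4,  # effective batch = 4 x per-device batch size
    logging_steps=50,               # log metrics every 50 optimizer steps
    save_strategy="epoch",          # write a checkpoint each epoch
)
```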
## Basic Fine-tuning Workflow
### Step 1: Load and preprocess the data
```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset
dataset = load_dataset("yelp_review_full")
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Tokenize
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
```
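The full Yelp splits are large, so for quick iteration it is common to work on a small shuffled subset first. A sketch using the datasets library's shuffle/select methods (the subset sizes are arbitrary):

```python
# Optional: iterate on a small shuffled subset before a full run.
# The sizes (1000/500) are arbitrary, for illustration only.
small_train = train_dataset.shuffle(seed=42).select(range(1000))
small_eval = eval_dataset.shuffle(seed=42).select(range(500))
```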
### Step 2: Load the model
```python
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,  # number of classes
)
```
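By default the checkpoint reports classes as LABEL_0 through LABEL_4. If you want readable names in outputs and on the Hub, from_pretrained also accepts label mappings; a sketch with illustrative names for the five Yelp star ratings:

```python
from transformers import AutoModelForSequenceClassification

# Illustrative names for the 5 Yelp star ratings.
id2label = {0: "1 star", 1: "2 stars", 2: "3 stars", 3: "4 stars", 4: "5 stars"}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=5,
    id2label=id2label,  # stored in the model config
    label2id=label2id,
)
```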
### Step 3: Define evaluation metrics
```python
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
```
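compute_metrics can return any dict of scalars, so tracking several metrics is just a matter of merging their results. A sketch adding F1 alongside accuracy ("macro" averaging is one arbitrary choice for the multiclass case):

```python
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    results = accuracy.compute(predictions=predictions, references=labels)
    # Multiclass F1 needs an averaging strategy; "macro" is one option.
    results.update(f1.compute(predictions=predictions, references=labels, average="macro"))
    return results
```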
### Step 4: Configure the training arguments
```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
```
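Because load_best_model_at_end and metric_for_best_model are set, this configuration also supports early stopping through a callback. A minimal sketch (the patience value is arbitrary), to be handed to the Trainer in the next step via its callbacks argument:

```python
from transformers import EarlyStoppingCallback

# Stop if "accuracy" fails to improve for 2 consecutive evaluations;
# requires load_best_model_at_end=True and metric_for_best_model, as above.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
# Pass to the Trainer as: Trainer(..., callbacks=[early_stopping])
```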
### Step 5: Create the Trainer and start training
```python
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Start training
trainer.train()

# Evaluate
results = trainer.evaluate()
print(results)
```
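Besides evaluate(), the Trainer can return raw predictions for any tokenized dataset via predict(); a sketch reusing eval_dataset as a stand-in test set:

```python
import numpy as np

# predict() returns logits, label ids, and computed metrics.
output = trainer.predict(eval_dataset)
predicted_classes = np.argmax(output.predictions, axis=-1)
print(output.metrics)  # keys are prefixed with "test_", e.g. "test_accuracy"
```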
### Step 6: Save the model
```python
trainer.save_model("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Or push to the Hub. Note: trainer.push_to_hub() takes a commit message,
# not a repo id (the target repo is set via TrainingArguments.hub_model_id),
# so push the model and tokenizer directly when naming the repo here.
model.push_to_hub("username/my-finetuned-model")
tokenizer.push_to_hub("username/my-finetuned-model")
```
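To sanity-check the saved artifacts, the directory can be loaded straight into an inference pipeline; a sketch (the input sentence is illustrative):

```python
from transformers import pipeline

# Loads both model and tokenizer from the saved directory.
classifier = pipeline("text-classification", model="./fine_tuned_model")
print(classifier("The food was great and the service was fast!"))
```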
## TrainingArguments Parameters
### Core parameters
**output_dir**: directory where checkpoints are saved