[ PROMPT_NODE_22682 ]
tracking
[ SKILL_DOCUMENTATION ]
# 综合跟踪指南
使用 MLflow 进行实验跟踪的完整指南。
## 目录
- 记录参数
- 记录指标
- 记录产出物
- 记录模型
- 自动记录
- 运行和实验
- 搜索和比较
## 记录参数
### 基础参数记录
python
import mlflow

# Open a run; everything logged inside the block is attached to that run.
with mlflow.start_run():
    # Log parameters one call at a time.
    mlflow.log_param("learning_rate", 0.001)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("optimizer", "Adam")

    # Log several parameters with a single call.
    bulk_params = {
        "epochs": 50,
        "dropout": 0.2,
        "weight_decay": 1e-4,
        "momentum": 0.9,
    }
    mlflow.log_params(bulk_params)
### 结构化参数
python
# Nested configuration, grouped by section.
config = {
    "model": {
        "architecture": "ResNet50",
        "pretrained": True,
        "num_classes": 10,
    },
    "training": {
        "lr": 0.001,
        "batch_size": 32,
        "epochs": 50,
    },
    "data": {
        "dataset": "ImageNet",
        "augmentation": True,
    },
}

with mlflow.start_run():
    # Flatten to "section.key" names and log them in one batched call
    # rather than issuing a separate log_param call for every key.
    flat_params = {
        f"{section}.{key}": value
        for section, params in config.items()
        for key, value in params.items()
    }
    mlflow.log_params(flat_params)

    # Alternatively, store the whole configuration as a JSON artifact.
    mlflow.log_dict(config, "config.json")
### 参数最佳实践
python
import torch

# NOTE: train_dataset and val_dataset are not defined in this snippet; they are
# assumed to come from the surrounding training code.
with mlflow.start_run():
    # ✅ Recommended: log every hyperparameter.
    mlflow.log_params({
        "learning_rate": 0.001,
        "batch_size": 32,
        "optimizer": "Adam",
        "scheduler": "CosineAnnealing",
        "weight_decay": 1e-4,
    })

    # ✅ Recommended: log information about the data.
    mlflow.log_params({
        "dataset": "ImageNet",
        "train_samples": len(train_dataset),
        "val_samples": len(val_dataset),
        "num_classes": 1000,
    })

    # ✅ Recommended: log information about the environment.
    # Read the framework version from the installed package so the recorded
    # value cannot drift from what actually ran.
    # get_device_name(0) raises when no CUDA device is present, so fall back
    # to "cpu" on CPU-only machines.
    gpu_name = (
        torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"
    )
    mlflow.log_params({
        "framework": f"PyTorch {torch.__version__}",
        "cuda_version": torch.version.cuda,
        "gpu": gpu_name,
    })
## 记录指标
### 时间序列指标
python
with mlflow.start_run():
for epoch in range(num_epochs):
# 训练
train_loss, train_acc = train_epoch()
# 验证
val_loss, val_acc = validate()
# 按步长记录指标
mlflow.log_metric("train_loss", train_loss, step=epoch)
mlflow.log_metric("train_accuracy", train_acc, step=epoch)
mlflow.log_metric("val_loss", val_loss, step=epoch)
mlflow.log_metric("val_accura