[ PROMPT_NODE_22746 ]
Multimodal AudioCraft 高级用法
[ SKILL_DOCUMENTATION ]
# AudioCraft 高级用法指南
## 微调 MusicGen
### 自定义数据集准备
python
import os
import json
from pathlib import Path
import torchaudio
def prepare_dataset(audio_dir, output_dir, metadata_file):
    """Prepare a dataset for MusicGen fine-tuning.

    Loads each source clip, resamples it to 32 kHz (MusicGen's native
    rate), down-mixes to mono, and writes it under ``output_dir/audio``
    with a zero-padded sequential filename, then emits a
    ``metadata.json`` describing every processed sample.

    Directory layout produced:
        output_dir/
        ├── audio/
        │   ├── 0001.wav
        │   ├── 0002.wav
        │   └── ...
        └── metadata.json

    Args:
        audio_dir: Directory containing the source audio files.
        output_dir: Destination directory for processed audio and metadata.
        metadata_file: JSON file holding a list of
            ``{"path": "...", "description": "..."}`` entries, with
            ``path`` relative to ``audio_dir``.

    Returns:
        List of dicts with ``path`` (relative to ``output_dir``),
        ``description``, and ``duration`` (seconds) per sample.
    """
    output_dir = Path(output_dir)
    audio_output = output_dir / "audio"
    audio_output.mkdir(parents=True, exist_ok=True)

    # Load metadata (format: {"path": "...", "description": "..."}).
    # Explicit UTF-8 so non-ASCII descriptions load correctly regardless
    # of the platform's default encoding.
    with open(metadata_file, encoding="utf-8") as f:
        metadata = json.load(f)

    # Cache one Resample transform per distinct source sample rate
    # instead of constructing a new one for every file.
    resamplers = {}
    processed = []
    for idx, item in enumerate(metadata):
        audio_path = Path(audio_dir) / item["path"]
        # Load and resample to 32kHz.
        wav, sr = torchaudio.load(str(audio_path))
        if sr != 32000:
            if sr not in resamplers:
                resamplers[sr] = torchaudio.transforms.Resample(sr, 32000)
            wav = resamplers[sr](wav)
        # Down-mix multi-channel audio to mono by averaging channels.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        # Save the processed audio.
        output_path = audio_output / f"{idx:04d}.wav"
        torchaudio.save(str(output_path), wav, sample_rate=32000)
        processed.append({
            "path": str(output_path.relative_to(output_dir)),
            "description": item["description"],
            # wav is (1, samples) after the mono down-mix, so shape[1]
            # is the sample count; divide by the 32 kHz rate for seconds.
            "duration": wav.shape[1] / 32000,
        })

    # Save the processed metadata. ensure_ascii=False keeps non-ASCII
    # descriptions (e.g. Chinese) human-readable in the JSON file.
    with open(output_dir / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(processed, f, indent=2, ensure_ascii=False)

    print(f"已处理 {len(processed)} 个样本")
    return processed
### 使用 dora 进行微调
bash
# AudioCraft uses dora for experiment management.
# Install dora
pip install dora-search
# Clone AudioCraft
git clone https://github.com/facebookresearch/audiocraft.git
cd audiocraft
# Create the fine-tuning config file. The heredoc delimiter is quoted
# ('EOF'), so the YAML below is written verbatim with no shell expansion.
# NOTE(review): the YAML's indentation was lost in extraction; the nesting
# below is reconstructed from the key names — confirm against the
# audiocraft solver config schema before use.
cat > config/solver/musicgen/finetune.yaml << 'EOF'
defaults:
  - musicgen/musicgen_base
  - /model: lm/musicgen_lm
  - /conditioner: cond_base
solver: musicgen
autocast: true
autocast_dtype: float16
optim:
  epochs: 100
  batch_size: 4
  lr: 1e-4
  ema: 0.999
  optimizer: adamw
dataset:
  batch_size: 4
  num_workers: 4
  train:
    - dset: your_dataset
      root: /path/to/dataset
  valid:
    - dset: your_dataset
      root: /path/to/dataset
checkpoint:
  save_every: 10
  keep_every_states: null
EOF
# Run fine-tuning
dora run solver=musicgen/finetune
#