[ PROMPT_NODE_22642 ]
Mechanistic Interpretability SAELens 教程
[ SKILL_DOCUMENTATION ]
# SAELens 教程
## 教程 1:加载和分析预训练的 SAE
### 目标
加载预训练的 SAE 并分析哪些特征在特定输入上激活。
### 分步指南
python
from transformer_lens import HookedTransformer
from sae_lens import SAE
import torch
# 1. Load the model and the SAE.
# Fall back to CPU when no GPU is available so the tutorial still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = HookedTransformer.from_pretrained("gpt2-small", device=device)
sae, cfg_dict, sparsity = SAE.from_pretrained(
    release="gpt2-small-res-jb",
    sae_id="blocks.8.hook_resid_pre",
    device=device,
)
print(f"SAE 输入维度: {sae.cfg.d_in}")
print(f"SAE 隐藏层维度: {sae.cfg.d_sae}")
print(f"扩展因子: {sae.cfg.d_sae / sae.cfg.d_in:.1f}x")

# 2. Get the model activations at the hook point the SAE was trained on.
prompt = "The capital of France is Paris"
tokens = model.to_tokens(prompt)
# This is analysis only, so skip building an autograd graph.
with torch.no_grad():
    _, cache = model.run_with_cache(tokens)
    activations = cache["resid_pre", 8]  # [1, seq_len, 768]

    # 3. Encode the activations into SAE features.
    features = sae.encode(activations)  # [1, seq_len, d_sae]

# 4. Analyse sparsity: count the strictly positive features per token.
active_per_token = (features > 0).sum(dim=-1)
print(f"每个 token 的平均激活特征数: {active_per_token.float().mean():.1f}")

# 5. Find the top features for each token.
str_tokens = model.to_str_tokens(prompt)
for pos, str_token in enumerate(str_tokens):
    top_features = features[0, pos].topk(5)
    print(f"\nToken '{str_token}':")
    # Convert to plain Python numbers once instead of calling .item() per element.
    top_pairs = zip(top_features.indices.tolist(), top_features.values.tolist())
    for feat_idx, feat_val in top_pairs:
        print(f" 特征 {feat_idx}: {feat_val:.3f}")

# 6. Check reconstruction quality (mean squared error against the input activations).
with torch.no_grad():
    reconstructed = sae.decode(features)
mse = ((activations - reconstructed) ** 2).mean()
print(f"\n重构 MSE: {mse.item():.6f}")
---
## 教程 2:训练自定义 SAE
### 目标
在 GPT-2 激活上训练稀疏自编码器。
### 分步指南
python
from sae_lens import LanguageModelSAERunnerConfig, SAETrainingRunner
# 1. 配置训练
cfg = LanguageModelSAERunnerConfig(
# 模型
model_name="gpt2-small",
hook_name="blocks.6.hook_resid_pre",
hook_layer=6,
d_in=768,
# SAE 架构
architecture="standard",
d_sae=768 * 8, # 8 倍扩展
activation_fn="relu",
# 训练
lr=4e-4,
l1_coefficient=8e-5,
l1_warm_up_steps=1000,
train_batch_size_tokens=4096,
training_tokens=10_000_000, # 演示用的小规模运行
# 数据
dataset_path="monology/pile-uncopyrighted",
streaming=True,
context_size=128,
# 防止死特征
use_ghost_grads=True,
dead_feature_window=5000,
# 日志
log_to_wandb