[ PROMPT_NODE_27074 ]
PufferLib 策略
[ SKILL_DOCUMENTATION ]
# PufferLib 策略指南
## 概述
PufferLib 策略是标准的 PyTorch 模块,并带有用于观测处理和 LSTM 集成的可选工具。该框架提供了默认架构和工具,同时在策略设计上保持完全的灵活性。
## 策略架构
### 基础策略结构
python
import torch
import torch.nn as nn
from pufferlib.pytorch import layer_init
class BasicPolicy(nn.Module):
    """Actor-critic policy built on a shared two-layer MLP encoder.

    The encoder output feeds two linear heads: ``actor`` produces one logit
    per action (``action_space.n`` of them) and ``critic`` produces a single
    value output.
    """

    def __init__(self, observation_space, action_space):
        """Build the encoder and both heads.

        Args:
            observation_space: Space object; ``shape[0]`` is used as the
                encoder's input width.
            action_space: Space object; ``n`` is used as the number of
                action logits.
        """
        super().__init__()
        self.observation_space = observation_space
        self.action_space = action_space

        hidden_size = 256
        input_size = observation_space.shape[0]

        # Encoder network: two initialised linear layers, each followed by ReLU.
        first_linear = layer_init(nn.Linear(input_size, hidden_size))
        second_linear = layer_init(nn.Linear(hidden_size, hidden_size))
        self.encoder = nn.Sequential(
            first_linear,
            nn.ReLU(),
            second_linear,
            nn.ReLU(),
        )

        # Policy head (actor), initialised with std=0.01.
        self.actor = layer_init(
            nn.Linear(hidden_size, action_space.n), std=0.01
        )

        # Value head (critic), initialised with std=1.0.
        self.critic = layer_init(nn.Linear(hidden_size, 1), std=1.0)

    def forward(self, observations):
        """Run the policy forward pass.

        Args:
            observations: Input passed directly to the encoder.

        Returns:
            A ``(logits, value)`` tuple taken from the actor and critic heads.
        """
        # Encode the observations once and share the result between heads.
        hidden = self.encoder(observations)
        return self.actor(hidden), self.critic(hidden)

    def get_action(self, observations, deterministic=False):
        """Pick an action from the policy.

        Args:
            observations: Input forwarded to :meth:`forward`.
            deterministic: When true, take the arg-max logit instead of
                sampling from the categorical distribution.

        Returns:
            An ``(action, value)`` tuple.
        """
        logits, value = self.forward(observations)

        if deterministic:
            return logits.argmax(dim=-1), value

        sampler = torch.distributions.Categorical(logits=logits)
        return sampler.sample(), value
### 层初始化
PufferLib 提供 `layer_init` 用于正确的权重初始化:
python
from pufferlib.pytorch import layer_init
# Default orthogonal initialization (no std argument supplied)
layer = layer_init(nn.Linear(256, 256))
# Custom standard deviation: 0.01 for the actor head, 1.0 for the critic head
actor_head = layer_init(nn.Linear(256, num_actions), std=0.01)
critic_head = layer_init(nn.Linear(256, 1), std=1.0)
# Works with any layer type, e.g. a convolution
conv = layer_init(nn.Conv2d(3, 32, kernel_size=8, stride=4))
## CNN 策略
针对基于图像的观测:
python
class CNNPolicy(nn.Module):
def __init__(self, observation_space, action_space):
super().__init__()
# 用于图像的 CNN 编码器
self.encoder = nn.Sequential(
layer_init(nn.Conv2d(3, 32, kernel_size=8, stride=4)),