Unitree RL GYM is an open-source reinforcement learning (RL) control example project built around Unitree robots, used to train, test, and deploy control policies for legged robots. The repository supports several Unitree robot models, including Go2, H1, H1_2, and G1. Repository address:
This series sets out to dissect the repository's core code, algorithm implementations, and training workflow. It assumes the reader already has some background in reinforcement learning and programming, so basic theory and routine code logic are not re-explained; readers interested in RL fundamentals can start with my introductory series.

Prerequisites for this series:

- Python syntax, including object-oriented encapsulation
- Basic PyTorch usage
- Policy Gradient, Actor-Critic, and PPO

This installment covers the Python implementation of the ActorCritic and ActorCriticRecurrent networks in the rsl_rl repository.
Let's quickly review the core of the ActorCritic network. First clone the rsl_rl repository:

```bash
git clone https://github.com/leggedrobotics/rsl_rl.git
cd rsl_rl
git checkout v1.0.2
```

actor_critic.py can be found in the modules folder:

```python
# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# (BSD 3-Clause license text omitted for brevity)
#
# Copyright (c) 2021 ETH Zurich, Nikita Rudin
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal
from torch.nn.modules import rnn
class ActorCritic(nn.Module):
    is_recurrent = False
    def __init__(self, num_actor_obs,
                 num_critic_obs,
                 num_actions,
                 actor_hidden_dims=[256, 256, 256],
                 critic_hidden_dims=[256, 256, 256],
                 activation='elu',
                 init_noise_std=1.0,
                 **kwargs):
        if kwargs:
            print("ActorCritic.__init__ got unexpected arguments, which will be ignored: " + str([key for key in kwargs.keys()]))
        super(ActorCritic, self).__init__()

        activation = get_activation(activation)

        mlp_input_dim_a = num_actor_obs
        mlp_input_dim_c = num_critic_obs

        # Policy
        actor_layers = []
        actor_layers.append(nn.Linear(mlp_input_dim_a, actor_hidden_dims[0]))
        actor_layers.append(activation)
        for l in range(len(actor_hidden_dims)):
            if l == len(actor_hidden_dims) - 1:
                actor_layers.append(nn.Linear(actor_hidden_dims[l], num_actions))
            else:
                actor_layers.append(nn.Linear(actor_hidden_dims[l], actor_hidden_dims[l + 1]))
                actor_layers.append(activation)
        self.actor = nn.Sequential(*actor_layers)

        # Value function
        critic_layers = []
        critic_layers.append(nn.Linear(mlp_input_dim_c, critic_hidden_dims[0]))
        critic_layers.append(activation)
        for l in range(len(critic_hidden_dims)):
            if l == len(critic_hidden_dims) - 1:
                critic_layers.append(nn.Linear(critic_hidden_dims[l], 1))
            else:
                critic_layers.append(nn.Linear(critic_hidden_dims[l], critic_hidden_dims[l + 1]))
                critic_layers.append(activation)
        self.critic = nn.Sequential(*critic_layers)

        print(f"Actor MLP: {self.actor}")
        print(f"Critic MLP: {self.critic}")

        # Action noise
        self.std = nn.Parameter(init_noise_std * torch.ones(num_actions))
        self.distribution = None
        # disable args validation for speedup
        Normal.set_default_validate_args = False

        # seems that we get better performance without init
        # self.init_memory_weights(self.memory_a, 0.001, 0.)
        # self.init_memory_weights(self.memory_c, 0.001, 0.)

    @staticmethod
    # not used at the moment
    def init_weights(sequential, scales):
        [torch.nn.init.orthogonal_(module.weight, gain=scales[idx]) for idx, module in
         enumerate(mod for mod in sequential if isinstance(mod, nn.Linear))]

    def reset(self, dones=None):
        pass

    def forward(self):
        raise NotImplementedError

    @property
    def action_mean(self):
        return self.distribution.mean

    @property
    def action_std(self):
        return self.distribution.stddev

    @property
    def entropy(self):
        return self.distribution.entropy().sum(dim=-1)

    def update_distribution(self, observations):
        mean = self.actor(observations)
        self.distribution = Normal(mean, mean*0. + self.std)

    def act(self, observations, **kwargs):
        self.update_distribution(observations)
        return self.distribution.sample()

    def get_actions_log_prob(self, actions):
        return self.distribution.log_prob(actions).sum(dim=-1)

    def act_inference(self, observations):
        actions_mean = self.actor(observations)
        return actions_mean

    def evaluate(self, critic_observations, **kwargs):
        value = self.critic(critic_observations)
        return value

def get_activation(act_name):
    if act_name == "elu":
        return nn.ELU()
    elif act_name == "selu":
        return nn.SELU()
    elif act_name == "relu":
        return nn.ReLU()
    elif act_name == "crelu":
        return nn.ReLU()
    elif act_name == "lrelu":
        return nn.LeakyReLU()
    elif act_name == "tanh":
        return nn.Tanh()
    elif act_name == "sigmoid":
        return nn.Sigmoid()
    else:
        print("invalid activation function!")
        return None
```
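One quirk worth noticing in this version of `get_activation()`: the 'crelu' option is mapped to a plain `nn.ReLU()`, so requesting it silently falls back to ReLU. A quick check (assuming the listing above is in scope):

```python
print(get_activation('elu'))    # ELU(alpha=1.0)
print(get_activation('crelu'))  # ReLU() (not a true CReLU)
act = get_activation('gelu')    # prints "invalid activation function!"; act is None
```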
That is the whole file. Now let's walk through it piece by piece, starting with the constructor signature:
```python
def __init__(self,
             num_actor_obs,
             num_critic_obs,
             num_actions,
             actor_hidden_dims=[256, 256, 256],
             critic_hidden_dims=[256, 256, 256],
             activation='elu',
             init_noise_std=1.0,
             **kwargs):
```
- `num_actor_obs`: input dimension of the actor network, i.e., how many observation values the policy network sees
- `num_critic_obs`: input dimension of the critic network, i.e., how many observation values the value network sees
- `num_actions`: dimension of the action space
- `actor_hidden_dims`: number of neurons in each hidden layer of the actor network
- `critic_hidden_dims`: number of neurons in each hidden layer of the critic network
- `activation`: hidden-layer activation function, one of the options handled by `get_activation()`: "elu", "selu", "relu", "crelu", "lrelu", "tanh", "sigmoid"
- `init_noise_std`: initial standard deviation $\sigma$ of the action noise

The constructor then builds the two MLPs:

```python
# Policy
actor_layers = []
actor_layers.append(nn.Linear(mlp_input_dim_a, actor_hidden_dims[0]))
actor_layers.append(activation)
for l in range(len(actor_hidden_dims)):
    if l == len(actor_hidden_dims) - 1:
        actor_layers.append(nn.Linear(actor_hidden_dims[l], num_actions))
    else:
        actor_layers.append(nn.Linear(actor_hidden_dims[l], actor_hidden_dims[l + 1]))
        actor_layers.append(activation)
self.actor = nn.Sequential(*actor_layers)

# Value function
critic_layers = []
critic_layers.append(nn.Linear(mlp_input_dim_c, critic_hidden_dims[0]))
critic_layers.append(activation)
for l in range(len(critic_hidden_dims)):
    if l == len(critic_hidden_dims) - 1:
        critic_layers.append(nn.Linear(critic_hidden_dims[l], 1))
    else:
        critic_layers.append(nn.Linear(critic_hidden_dims[l], critic_hidden_dims[l + 1]))
        critic_layers.append(activation)
self.critic = nn.Sequential(*critic_layers)

print(f"Actor MLP: {self.actor}")
print(f"Critic MLP: {self.critic}")
```
| Feature | Actor network | Critic network |
|---|---|---|
| Input | num_actor_obs | num_critic_obs (may differ) |
| Output | action vector $\mu_\theta(s)$ | scalar state value $V_\phi(s)$ |
| Output-layer dimension | action-space dimension | 1 |
| Role | selects actions (policy) | evaluates states (value function) |
| Used for | policy-gradient updates | computing advantages to guide the actor |
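To make the shapes concrete, here is a minimal usage sketch. The dimensions below are made up for illustration, not taken from any particular robot config:

```python
import torch

# Hypothetical sizes: 48 actor observations, 60 critic observations, 12 joint actions.
model = ActorCritic(num_actor_obs=48, num_critic_obs=60, num_actions=12)

obs = torch.randn(4096, 48)          # batch of actor observations
critic_obs = torch.randn(4096, 60)   # batch of (possibly privileged) critic observations

actions = model.act(obs)             # stochastic actions, shape (4096, 12)
values = model.evaluate(critic_obs)  # state values, shape (4096, 1)
```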
```python
# Action noise
self.std = nn.Parameter(init_noise_std * torch.ones(num_actions))
self.distribution = None
# disable args validation for speedup
Normal.set_default_validate_args = False
```

- `self.std`: the standard deviation $\sigma$ of the action distribution. It is a learnable `nn.Parameter` and controls how much the policy explores.
- `self.distribution`: the action distribution object `Normal(mean, std)`, used for sampling and for computing `log_prob` and entropy.

```python
@property
def action_mean(self):
    return self.distribution.mean

@property
def action_std(self):
    return self.distribution.stddev

@property
def entropy(self):
    return self.distribution.entropy().sum(dim=-1)
```
- `action_mean`: returns the mean $\mu_\theta(s)$ of the action distribution for the current state
- `action_std`: returns the standard deviation $\sigma$ of the action distribution
- `entropy`: returns the entropy $H[\pi_\theta]$ of the action distribution

These properties are read in `algorithms/ppo.py` when the loss is computed:

```python
mu_batch = self.actor_critic.action_mean
sigma_batch = self.actor_critic.action_std
entropy_batch = self.actor_critic.entropy
```
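For intuition about what these return, here is a small standalone sketch of the same diagonal Gaussian (sizes invented for the example):

```python
import torch
from torch.distributions import Normal

mean = torch.zeros(2, 3)              # batch of 2 states, 3 action dimensions
std = 0.5 * torch.ones(3)             # plays the role of self.std
dist = Normal(mean, mean * 0. + std)  # same broadcasting trick as update_distribution

a = dist.sample()                     # shape (2, 3)
print(dist.log_prob(a).sum(dim=-1))   # joint log-prob per sample, shape (2,)
print(dist.entropy().sum(dim=-1))     # entropy per sample, shape (2,)
```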
act():

```python
def update_distribution(self, observations):
    mean = self.actor(observations)
    self.distribution = Normal(mean, mean*0. + self.std)

def act(self, observations, **kwargs):
    self.update_distribution(observations)
    return self.distribution.sample()
```

`mean*0.` simply produces a zero tensor with the same shape as `mean`, so that `self.std` broadcasts correctly over the batch. In `algorithms/ppo.py`, the first step of each mini-batch update is exactly this actor forward pass, recomputing the action distribution under the current policy:

```python
self.actor_critic.act(obs_batch, masks=masks_batch, hidden_states=hid_states_batch[0])
```
get_actions_log_prob():

```python
def get_actions_log_prob(self, actions):
    return self.distribution.log_prob(actions).sum(dim=-1)
```

Called directly in `algorithms/ppo.py`:

```python
actions_log_prob_batch = self.actor_critic.get_actions_log_prob(actions_batch)
```
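Because the action dimensions are modeled as independent Gaussians, summing the per-dimension log-probabilities along `dim=-1` gives the log-probability of the whole action vector:

$$\log \pi_\theta(a \mid s) = \sum_{i=1}^{d} \log \mathcal{N}\big(a_i;\ \mu_{\theta,i}(s),\ \sigma_i\big)$$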
evaluate():

```python
def evaluate(self, critic_observations, **kwargs):
    value = self.critic(critic_observations)
    return value
```

Also called directly in `algorithms/ppo.py`:

```python
value_batch = self.actor_critic.evaluate(critic_obs_batch)
```
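The critic's value estimate $V_\phi(s)$ is what PPO uses to build the advantage estimates that guide the actor update; in its simplest one-step TD form:

$$A_t = r_t + \gamma V_\phi(s_{t+1}) - V_\phi(s_t)$$

(rsl_rl's PPO uses the GAE generalization of this idea.)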
Before moving on to the recurrent version, it helps to distinguish RNN memory from a replay buffer:

| Feature | RNN memory | Replay buffer |
|---|---|---|
| Data type | hidden states (internal network vectors) | complete state/action/reward trajectories |
| Role | captures temporal dependencies for sequential decision-making | stores past experience for training |
| Update | updated dynamically at every forward time step | sampled at random, offline |
| Lifetime | valid within a single trajectory/episode | can persist across many episodes |
ActorCriticRecurrent is the RNN version of ActorCritic: it introduces a recurrent neural network so that both the actor and the critic can remember historical information, which suits partially observable environments (POMDPs) and tasks with temporal dependencies. When an episode ends, `reset(dones)` zeroes the hidden states at the corresponding positions. The implementation lives in actor_critic_recurrent.py:

```python
# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# (BSD 3-Clause license text omitted for brevity)
#
# Copyright (c) 2021 ETH Zurich, Nikita Rudin
import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Normal
from torch.nn.modules import rnn
from .actor_critic import ActorCritic, get_activation
from rsl_rl.utils import unpad_trajectories
class ActorCriticRecurrent(ActorCritic):
    is_recurrent = True
    def __init__(self, num_actor_obs,
                 num_critic_obs,
                 num_actions,
                 actor_hidden_dims=[256, 256, 256],
                 critic_hidden_dims=[256, 256, 256],
                 activation='elu',
                 rnn_type='lstm',
                 rnn_hidden_size=256,
                 rnn_num_layers=1,
                 init_noise_std=1.0,
                 **kwargs):
        if kwargs:
            print("ActorCriticRecurrent.__init__ got unexpected arguments, which will be ignored: " + str(kwargs.keys()),)

        super().__init__(num_actor_obs=rnn_hidden_size,
                         num_critic_obs=rnn_hidden_size,
                         num_actions=num_actions,
                         actor_hidden_dims=actor_hidden_dims,
                         critic_hidden_dims=critic_hidden_dims,
                         activation=activation,
                         init_noise_std=init_noise_std)

        activation = get_activation(activation)

        self.memory_a = Memory(num_actor_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_size)
        self.memory_c = Memory(num_critic_obs, type=rnn_type, num_layers=rnn_num_layers, hidden_size=rnn_hidden_size)

        print(f"Actor RNN: {self.memory_a}")
        print(f"Critic RNN: {self.memory_c}")

    def reset(self, dones=None):
        self.memory_a.reset(dones)
        self.memory_c.reset(dones)

    def act(self, observations, masks=None, hidden_states=None):
        input_a = self.memory_a(observations, masks, hidden_states)
        return super().act(input_a.squeeze(0))

    def act_inference(self, observations):
        input_a = self.memory_a(observations)
        return super().act_inference(input_a.squeeze(0))

    def evaluate(self, critic_observations, masks=None, hidden_states=None):
        input_c = self.memory_c(critic_observations, masks, hidden_states)
        return super().evaluate(input_c.squeeze(0))

    def get_hidden_states(self):
        return self.memory_a.hidden_states, self.memory_c.hidden_states

class Memory(torch.nn.Module):
    def __init__(self, input_size, type='lstm', num_layers=1, hidden_size=256):
        super().__init__()
        # RNN
        rnn_cls = nn.GRU if type.lower() == 'gru' else nn.LSTM
        self.rnn = rnn_cls(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.hidden_states = None

    def forward(self, input, masks=None, hidden_states=None):
        batch_mode = masks is not None
        if batch_mode:
            # batch mode (policy update): need saved hidden states
            if hidden_states is None:
                raise ValueError("Hidden states not passed to memory module during policy update")
            out, _ = self.rnn(input, hidden_states)
            out = unpad_trajectories(out, masks)
        else:
            # inference mode (collection): use hidden states of last step
            out, self.hidden_states = self.rnn(input.unsqueeze(0), self.hidden_states)
        return out

    def reset(self, dones=None):
        # When the RNN is an LSTM, self.hidden_states_a is a list with hidden_state and cell_state
        for hidden_state in self.hidden_states:
            hidden_state[..., dones, :] = 0.0
```
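A quick usage sketch (sizes invented for illustration). Note that `super().__init__` is called with `num_actor_obs=rnn_hidden_size`: the MLP heads consume the RNN output, not the raw observations.

```python
import torch

# Hypothetical sizes: 48 observations, 12 actions, 4096 parallel environments.
model = ActorCriticRecurrent(num_actor_obs=48, num_critic_obs=48, num_actions=12,
                             rnn_type='lstm', rnn_hidden_size=256, rnn_num_layers=1)

obs = torch.randn(4096, 48)
actions = model.act(obs)       # inference mode: the LSTM hidden state is updated internally
values = model.evaluate(obs)   # same for the critic's memory

# Zero the memory of environments whose episode just ended:
dones = torch.zeros(4096, dtype=torch.bool)
model.reset(dones)
```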
Where the recurrent version differs from ActorCritic is mainly the Memory class.
Memory is a module that wraps the RNN. It subclasses torch.nn.Module, and its main job is to own the recurrent hidden state and serve both training and inference:

```python
def __init__(self, input_size, type='lstm', num_layers=1, hidden_size=256):
    super().__init__()
    rnn_cls = nn.GRU if type.lower() == 'gru' else nn.LSTM
    self.rnn = rnn_cls(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
    self.hidden_states = None
```
- `input_size`: the RNN's per-step input dimension (e.g., the observation vector's dimension)
- `type`: RNN type, either 'lstm' or 'gru'
- `num_layers`: number of stacked RNN layers
- `hidden_size`: dimension of the RNN hidden state
- `self.hidden_states`: stores the RNN hidden state, so that inference mode can keep updating it step after step

forward():

```python
def forward(self, input, masks=None, hidden_states=None):
    batch_mode = masks is not None
    if batch_mode:
        # batch mode (policy update)
        if hidden_states is None:
            raise ValueError("Hidden states not passed to memory module during policy update")
        out, _ = self.rnn(input, hidden_states)
        out = unpad_trajectories(out, masks)
    else:
        # inference mode (collection)
        out, self.hidden_states = self.rnn(input.unsqueeze(0), self.hidden_states)
    return out
```
When `masks` is not None, we are in the training phase, processing a whole batch of trajectories in one pass:

- `hidden_states` are the hidden states saved during rollout collection (the state at the start of each trajectory)
- `unpad_trajectories` strips the padded time steps so every sequence keeps its true length

When `masks` is None, we are in inference / experience collection:

- `input.unsqueeze(0)` adds a time dimension of length 1
- the new hidden state is written back to `self.hidden_states` and reused at the next step

```python
def reset(self, dones=None):
    for hidden_state in self.hidden_states:
        hidden_state[..., dones, :] = 0.0
```
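A small sketch of the inference path, assuming the Memory class above is in scope (sizes invented):

```python
import torch

mem = Memory(input_size=48, type='lstm', num_layers=1, hidden_size=256)

obs = torch.randn(4096, 48)        # one observation per parallel environment
out = mem(obs)                     # internally: self.rnn(obs.unsqueeze(0), None)
print(out.shape)                   # torch.Size([1, 4096, 256])

out2 = mem(torch.randn(4096, 48))  # hidden state carried over from the previous call

# Wipe the memory of environments that just finished an episode:
dones = torch.zeros(4096, dtype=torch.bool)
dones[:10] = True
mem.reset(dones)                   # zeroes hidden and cell state for those 10 envs
```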
`dones` is a boolean vector indicating which environments/samples have finished their episode; their hidden (and, for an LSTM, cell) states are zeroed so that the next episode starts from a clean memory.
This installment walked through the Python implementations of ActorCritic and ActorCriticRecurrent in the rsl_rl repository. We reviewed the core Actor-Critic idea, focused on how ActorCriticRecurrent adds an RNN-based Memory module so the networks can retain historical information, distinguished how hidden states are handled in training versus inference mode, and dissected the functions for network construction, action sampling, and value evaluation, laying the groundwork for understanding policy and value networks in complex robot-control tasks.