Skip to main content

HumanoidGym

1. 复现结果

先上结果。

episodes: 3000

(此处为两张训练曲线截图)
Episode/rew_action_smoothness:-0.052391108
Episode/rew_base_acc:0.030835867
Episode/rew_base_height:0.025896015
Episode/rew_collision:-0.00023611114
Episode/rew_default_joint_pos:0.034609184
Episode/rew_dof_acc:-0.055873536
Episode/rew_dof_vel:-0.022730388
Episode/rew_feet_air_time:0.0022297923
Episode/rew_feet_clearance:0.01163889
Episode/rew_feet_contact_forces:-0.21839587
Episode/rew_feet_contact_number:0.39704362
Episode/rew_feet_distance:0.14127402
Episode/rew_foot_slip:-0.021376245
Episode/rew_joint_pos:0.46255687
Episode/rew_knee_distance:0.108565286
Episode/rew_low_speed:-0.05085073
Episode/rew_orientation:0.30785814
Episode/rew_torques:-0.1378756
Episode/rew_track_vel_hard:-0.035626132
Episode/rew_tracking_ang_vel:0.29886228
Episode/rew_tracking_lin_vel:0.4788447
Episode/rew_vel_mismatch_exp:0.15188387
global_step:9,255
Loss/learning_rate:0.00011390625
Loss/surrogate:-0.0014583393
Loss/value_function:3.0858552
Perf/collection time:14.409651
Perf/learning_time:0.4499824
Perf/total_fps:1,033
Policy/mean_noise_std:0.7732231
Train/mean_episode_length:1,743.2
Train/mean_episode_length/time:1,743.2
Train/mean_reward:49.464386
Train/mean_reward/time:49.464386

事实上我的电脑在训练中途(大约 2900 轮)没电了,CLI 的输出结果我没来得及保存。

2. 代码分析

humanoid_env.py:

# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2024 Beijing RobotEra TECHNOLOGY CO.,LTD. All rights reserved.


from humanoid.envs.base.legged_robot_config import LeggedRobotCfg

from isaacgym.torch_utils import *
from isaacgym import gymtorch, gymapi

import torch
from humanoid.envs import LeggedRobot

from humanoid.utils.terrain import HumanoidTerrain


class XBotLFreeEnv(LeggedRobot):
'''
XBotLFreeEnv is a class that represents a custom environment for a legged robot.

Args:
cfg (LeggedRobotCfg): Configuration object for the legged robot.
sim_params: Parameters for the simulation.
physics_engine: Physics engine used in the simulation.
sim_device: Device used for the simulation.
headless: Flag indicating whether the simulation should be run in headless mode.

Attributes:
last_feet_z (float): The z-coordinate of the last feet position.
feet_height (torch.Tensor): Tensor representing the height of the feet.
sim (gymtorch.GymSim): The simulation object.
terrain (HumanoidTerrain): The terrain object.
up_axis_idx (int): The index representing the up axis.
command_input (torch.Tensor): Tensor representing the command input.
privileged_obs_buf (torch.Tensor): Tensor representing the privileged observations buffer.
obs_buf (torch.Tensor): Tensor representing the observations buffer.
obs_history (collections.deque): Deque containing the history of observations.
critic_history (collections.deque): Deque containing the history of critic observations.

Methods:
_push_robots(): Randomly pushes the robots by setting a randomized base velocity.
_get_phase(): Calculates the phase of the gait cycle.
_get_gait_phase(): Calculates the gait phase.
compute_ref_state(): Computes the reference state.
create_sim(): Creates the simulation, terrain, and environments.
_get_noise_scale_vec(cfg): Sets a vector used to scale the noise added to the observations.
step(actions): Performs a simulation step with the given actions.
compute_observations(): Computes the observations.
reset_idx(env_ids): Resets the environment for the specified environment IDs.
'''
def __init__(self, cfg: LeggedRobotCfg, sim_params, physics_engine, sim_device, headless):
    """Construct the XBot-L environment.

    Args:
        cfg (LeggedRobotCfg): configuration object for the legged robot.
        sim_params: simulation parameters.
        physics_engine: physics engine used in the simulation.
        sim_device: device used for the simulation.
        headless: whether to run without a viewer.
    """
    super().__init__(cfg, sim_params, physics_engine, sim_device, headless)
    self.last_feet_z = 0.05
    self.feet_height = torch.zeros((self.num_envs, 2), device=self.device)
    # torch.arange builds the index tensor directly on-device instead of
    # materializing a Python range through torch.tensor(range(...)).
    self.reset_idx(torch.arange(self.num_envs, device=self.device))
    self.compute_observations()

def _push_robots(self):
    """Emulate a random impulse by overwriting the base linear and angular velocity."""
    lin_cap = self.cfg.domain_rand.max_push_vel_xy
    ang_cap = self.cfg.domain_rand.max_push_ang_vel
    self.rand_push_force[:, :2] = torch_rand_float(
        -lin_cap, lin_cap, (self.num_envs, 2), device=self.device)  # lin vel x/y
    self.rand_push_torque = torch_rand_float(
        -ang_cap, ang_cap, (self.num_envs, 3), device=self.device)
    # root state layout: indices 7:9 = linear velocity x/y, 10:13 = angular velocity
    self.root_states[:, 7:9] = self.rand_push_force[:, :2]
    self.root_states[:, 10:13] = self.rand_push_torque
    self.gym.set_actor_root_state_tensor(
        self.sim, gymtorch.unwrap_tensor(self.root_states))

def _get_phase(self):
    """Return the gait-cycle phase (in cycles) for every environment."""
    return self.episode_length_buf * self.dt / self.cfg.rewards.cycle_time

def _get_gait_phase(self):
    """Return a float mask per foot: 1 = stance, 0 = swing."""
    sin_phase = torch.sin(2 * torch.pi * self._get_phase())
    # left foot is in stance for the non-negative half of the sine cycle,
    # right foot for the strictly negative half (exact complements)
    left_stance = (sin_phase >= 0).float()
    stance_mask = torch.stack((left_stance, 1.0 - left_stance), dim=1)
    # near the zero crossing both feet are planted (double-support phase)
    stance_mask[torch.abs(sin_phase) < 0.1] = 1
    return stance_mask


def compute_ref_state(self):
    """Build the phase-based reference joint trajectory for the current step.

    Writes:
        self.ref_dof_pos: per-env target joint positions driven by the gait sine.
        self.ref_action: 2x the reference positions (optionally added in step()).
    """
    phase = self._get_phase()
    sin_pos = torch.sin(2 * torch.pi * phase)
    sin_pos_l = sin_pos.clone()
    sin_pos_r = sin_pos.clone()
    self.ref_dof_pos = torch.zeros_like(self.dof_pos)
    scale_1 = self.cfg.rewards.target_joint_pos_scale
    # knee swings twice as far as hip/ankle pitch
    scale_2 = 2 * scale_1
    # left foot stance phase set to default joint pos (only swing half-cycle is non-zero)
    sin_pos_l[sin_pos_l > 0] = 0
    # columns 2/3/4 — presumably left leg pitch / knee / ankle pitch; verify against URDF joint order
    self.ref_dof_pos[:, 2] = sin_pos_l * scale_1
    self.ref_dof_pos[:, 3] = sin_pos_l * scale_2
    self.ref_dof_pos[:, 4] = sin_pos_l * scale_1
    # right foot stance phase set to default joint pos
    sin_pos_r[sin_pos_r < 0] = 0
    # columns 8/9/10 — presumably right leg pitch / knee / ankle pitch; verify against URDF joint order
    self.ref_dof_pos[:, 8] = sin_pos_r * scale_1
    self.ref_dof_pos[:, 9] = sin_pos_r * scale_2
    self.ref_dof_pos[:, 10] = sin_pos_r * scale_1
    # Double support phase: zero the whole reference row near the sine zero crossing
    self.ref_dof_pos[torch.abs(sin_pos) < 0.1] = 0

    self.ref_action = 2 * self.ref_dof_pos


def create_sim(self):
    """Create the simulation, the terrain, and the environments."""
    self.up_axis_idx = 2  # 2 for z, 1 for y -> adapt gravity accordingly
    self.sim = self.gym.create_sim(
        self.sim_device_id, self.graphics_device_id, self.physics_engine, self.sim_params)
    mesh_type = self.cfg.terrain.mesh_type
    # heightfield/trimesh need the procedural terrain generator built first
    if mesh_type in ('heightfield', 'trimesh'):
        self.terrain = HumanoidTerrain(self.cfg.terrain, self.num_envs)
    builders = {
        'plane': self._create_ground_plane,
        'heightfield': self._create_heightfield,
        'trimesh': self._create_trimesh,
    }
    if mesh_type is not None:
        builder = builders.get(mesh_type)
        if builder is None:
            raise ValueError(
                "Terrain mesh type not recognised. Allowed types are [None, plane, heightfield, trimesh]")
        builder()
    self._create_envs()


def _get_noise_scale_vec(self, cfg):
    """ Sets a vector used to scale the noise added to the observations.
    [NOTE]: Must be adapted when changing the observations structure

    Args:
        cfg (Dict): Environment config file

    Returns:
        [torch.Tensor]: Vector of scales used to multiply a uniform distribution in [-1, 1]
    """
    # layout mirrors the single-frame actor observation built in compute_observations()
    noise_vec = torch.zeros(
        self.cfg.env.num_single_obs, device=self.device)
    self.add_noise = self.cfg.noise.add_noise
    noise_scales = self.cfg.noise.noise_scales
    noise_vec[0: 5] = 0.  # commands (sin, cos, vel_x, vel_y, ang_vel_yaw) — no noise
    noise_vec[5: 17] = noise_scales.dof_pos * self.obs_scales.dof_pos    # 12 joint positions
    noise_vec[17: 29] = noise_scales.dof_vel * self.obs_scales.dof_vel   # 12 joint velocities
    noise_vec[29: 41] = 0.  # previous actions — no noise
    noise_vec[41: 44] = noise_scales.ang_vel * self.obs_scales.ang_vel  # ang vel
    noise_vec[44: 47] = noise_scales.quat * self.obs_scales.quat  # euler x,y
    return noise_vec


def step(self, actions):
    """Apply reference-action bias, clipping, and dynamic randomization, then step.

    Args:
        actions: raw policy actions, shape (num_envs, num_actions).

    Returns:
        Whatever the base-class step() returns for the processed actions.
    """
    if self.cfg.env.use_ref_actions:
        # BUGFIX: `actions += self.ref_action` mutated the caller's tensor in place
        # (the policy/rollout buffer would see its own actions silently changed).
        # Use an out-of-place add instead.
        actions = actions + self.ref_action
    actions = torch.clip(actions, -self.cfg.normalization.clip_actions, self.cfg.normalization.clip_actions)
    # dynamic randomization: blend with the previous actions by a random per-env delay
    delay = torch.rand((self.num_envs, 1), device=self.device) * self.cfg.domain_rand.action_delay
    actions = (1 - delay) * actions + delay * self.actions
    # multiplicative noise proportional to action magnitude
    actions += self.cfg.domain_rand.action_noise * torch.randn_like(actions) * actions
    return super().step(actions)


def compute_observations(self):
    """Build the frame-stacked actor (obs_buf) and critic (privileged_obs_buf) observations."""

    phase = self._get_phase()
    self.compute_ref_state()

    sin_pos = torch.sin(2 * torch.pi * phase).unsqueeze(1)
    cos_pos = torch.cos(2 * torch.pi * phase).unsqueeze(1)

    stance_mask = self._get_gait_phase()
    # 5 N of vertical force is the contact threshold used throughout this file
    contact_mask = self.contact_forces[:, self.feet_indices, 2] > 5.

    self.command_input = torch.cat(
        (sin_pos, cos_pos, self.commands[:, :3] * self.commands_scale), dim=1)

    q = (self.dof_pos - self.default_dof_pos) * self.obs_scales.dof_pos
    dq = self.dof_vel * self.obs_scales.dof_vel

    # tracking error w.r.t. the phase-based reference trajectory (critic only)
    diff = self.dof_pos - self.ref_dof_pos

    # single-frame privileged (critic) observation, 73 dims per config
    self.privileged_obs_buf = torch.cat((
        self.command_input,  # 2 + 3
        (self.dof_pos - self.default_joint_pd_target) * \
            self.obs_scales.dof_pos,  # 12
        self.dof_vel * self.obs_scales.dof_vel,  # 12
        self.actions,  # 12
        diff,  # 12
        self.base_lin_vel * self.obs_scales.lin_vel,  # 3
        self.base_ang_vel * self.obs_scales.ang_vel,  # 3
        self.base_euler_xyz * self.obs_scales.quat,  # 3
        self.rand_push_force[:, :2],  # 2
        self.rand_push_torque,  # 3
        self.env_frictions,  # 1
        self.body_mass / 30.,  # 1
        stance_mask,  # 2
        contact_mask,  # 2
    ), dim=-1)

    # single-frame actor observation, 47 dims per config (no privileged signals)
    obs_buf = torch.cat((
        self.command_input,  # 5 = 2D(sin cos) + 3D(vel_x, vel_y, aug_vel_yaw)
        q,  # 12D
        dq,  # 12D
        self.actions,  # 12D
        self.base_ang_vel * self.obs_scales.ang_vel,  # 3
        self.base_euler_xyz * self.obs_scales.quat,  # 3
    ), dim=-1)

    if self.cfg.terrain.measure_heights:
        heights = torch.clip(self.root_states[:, 2].unsqueeze(1) - 0.5 - self.measured_heights, -1, 1.) * self.obs_scales.height_measurements
        # NOTE(review): this concatenates self.obs_buf — still the *previous* step's
        # stacked actor observation — rather than the privileged buffer built above.
        # Looks unintended, but it is dead code with the shipped config
        # (terrain.measure_heights = False); confirm against upstream before changing.
        self.privileged_obs_buf = torch.cat((self.obs_buf, heights), dim=-1)

    # actor noise is applied per-frame, before stacking
    if self.add_noise:
        obs_now = obs_buf.clone() + torch.randn_like(obs_buf) * self.noise_scale_vec * self.cfg.noise.noise_level
    else:
        obs_now = obs_buf.clone()
    self.obs_history.append(obs_now)
    self.critic_history.append(self.privileged_obs_buf)

    obs_buf_all = torch.stack([self.obs_history[i]
                               for i in range(self.obs_history.maxlen)], dim=1)  # N,T,K

    # flatten the time dimension into the feature dimension
    self.obs_buf = obs_buf_all.reshape(self.num_envs, -1)  # N, T*K
    self.privileged_obs_buf = torch.cat([self.critic_history[i] for i in range(self.cfg.env.c_frame_stack)], dim=1)

def reset_idx(self, env_ids):
    """Reset the given environments and blank their stacked-observation histories."""
    super().reset_idx(env_ids)
    # multiply rows by zero in place (keeps dtype/device and the stored tensors)
    for frame in self.obs_history:
        frame[env_ids] *= 0
    for frame in self.critic_history:
        frame[env_ids] *= 0

# ================================================ Rewards ================================================== #
def _reward_joint_pos(self):
    """
    Calculates the reward based on the difference between the current joint positions
    and the phase-based target joint positions.
    """
    # the subtraction already produces a fresh tensor, so the original .clone()
    # calls were redundant; the norm is now computed once instead of twice
    diff = self.dof_pos - self.ref_dof_pos
    dist = torch.norm(diff, dim=1)
    return torch.exp(-2 * dist) - 0.2 * dist.clamp(0, 0.5)

def _reward_feet_distance(self):
    """Penalize feet that drift too close together or too far apart (xy plane)."""
    feet_xy = self.rigid_state[:, self.feet_indices, :2]
    gap = torch.norm(feet_xy[:, 0, :] - feet_xy[:, 1, :], dim=1)
    # negative when closer than min_dist, positive when wider than max_dist
    under = torch.clamp(gap - self.cfg.rewards.min_dist, -0.5, 0.)
    over = torch.clamp(gap - self.cfg.rewards.max_dist, 0, 0.5)
    return 0.5 * (torch.exp(-torch.abs(under) * 100) + torch.exp(-torch.abs(over) * 100))


def _reward_knee_distance(self):
    """Penalize knees that drift too close together or too far apart (xy plane)."""
    knee_xy = self.rigid_state[:, self.knee_indices, :2]
    gap = torch.norm(knee_xy[:, 0, :] - knee_xy[:, 1, :], dim=1)
    # knees use the same lower bound as the feet but half the upper bound
    under = torch.clamp(gap - self.cfg.rewards.min_dist, -0.5, 0.)
    over = torch.clamp(gap - self.cfg.rewards.max_dist / 2, 0, 0.5)
    return 0.5 * (torch.exp(-torch.abs(under) * 100) + torch.exp(-torch.abs(over) * 100))


def _reward_foot_slip(self):
    """Penalize horizontal foot speed while the foot is load-bearing."""
    in_contact = self.contact_forces[:, self.feet_indices, 2] > 5.
    # rigid_state[..., 7:9] holds the xy linear velocity of each foot body
    xy_speed = torch.norm(self.rigid_state[:, self.feet_indices, 7:9], dim=2)
    return torch.sum(torch.sqrt(xy_speed) * in_contact, dim=1)

def _reward_feet_air_time(self):
    """
    Calculates the reward for feet air time, promoting longer steps. This is achieved by
    checking the first contact with the ground after being in the air. The air time is
    limited to a maximum value for reward calculation.
    """
    contact = self.contact_forces[:, self.feet_indices, 2] > 5.
    stance_mask = self._get_gait_phase()
    # a foot counts as "in contact" if it touches now, is scheduled for stance,
    # or touched on the previous step (debounces flickering contacts)
    self.contact_filt = torch.logical_or(torch.logical_or(contact, stance_mask), self.last_contacts)
    self.last_contacts = contact
    # credit is granted only on the first filtered contact after an airborne phase
    first_contact = (self.feet_air_time > 0.) * self.contact_filt
    self.feet_air_time += self.dt
    # cap the credited air time at 0.5 s
    air_time = self.feet_air_time.clamp(0, 0.5) * first_contact
    # reset the timer while the foot is (filtered) in contact
    self.feet_air_time *= ~self.contact_filt
    return air_time.sum(dim=1)

def _reward_feet_contact_number(self):
    """Reward feet whose measured contact state matches the commanded gait phase."""
    measured = self.contact_forces[:, self.feet_indices, 2] > 5.
    expected = self._get_gait_phase()
    # +1 per matching foot, -0.3 per mismatching foot, averaged over both feet
    per_foot = torch.where(measured == expected, 1.0, -0.3)
    return per_foot.mean(dim=1)

def _reward_orientation(self):
    """Reward a level base, using both euler roll/pitch and projected gravity."""
    euler_term = torch.exp(-10 * torch.sum(torch.abs(self.base_euler_xyz[:, :2]), dim=1))
    gravity_term = torch.exp(-20 * torch.norm(self.projected_gravity[:, :2], dim=1))
    return 0.5 * (euler_term + gravity_term)

def _reward_feet_contact_forces(self):
    """Penalize foot contact-force magnitudes above the configured maximum."""
    force_mag = torch.norm(self.contact_forces[:, self.feet_indices, :], dim=-1)
    # only the excess above max_contact_force is penalized, capped at 400 per foot
    excess = (force_mag - self.cfg.rewards.max_contact_force).clip(0, 400)
    return excess.sum(dim=1)

def _reward_default_joint_pos(self):
    """Keep joints near their defaults, penalizing leg yaw/roll deviation hardest."""
    joint_diff = self.dof_pos - self.default_joint_pd_target
    # yaw & roll joints occupy columns 0-1 (left leg) and 6-7 (right leg)
    yaw_roll_err = torch.norm(joint_diff[:, :2], dim=1) + torch.norm(joint_diff[:, 6: 8], dim=1)
    yaw_roll_err = torch.clamp(yaw_roll_err - 0.1, 0, 50)  # 0.1 rad dead-band
    return torch.exp(-yaw_roll_err * 100) - 0.01 * torch.norm(joint_diff, dim=1)

def _reward_base_height(self):
    """Reward tracking the target base height, measured relative to the stance feet."""
    stance = self._get_gait_phase()
    feet_z = self.rigid_state[:, self.feet_indices, 2]
    # average z of the feet currently scheduled for stance
    ground_z = torch.sum(feet_z * stance, dim=1) / torch.sum(stance, dim=1)
    # 0.05 compensates the foot-body offset used throughout this file
    height = self.root_states[:, 2] - (ground_z - 0.05)
    return torch.exp(-torch.abs(height - self.cfg.rewards.base_height_target) * 100)

def _reward_base_acc(self):
    """Reward small step-to-step changes of the base velocity (smooth base motion)."""
    vel_change = self.last_root_vel - self.root_states[:, 7:13]
    return torch.exp(-3 * torch.norm(vel_change, dim=1))


def _reward_vel_mismatch_exp(self):
    """Penalize vertical linear velocity and roll/pitch angular velocity of the base."""
    z_vel_term = torch.exp(-10 * torch.square(self.base_lin_vel[:, 2]))
    rp_vel_term = torch.exp(-5. * torch.norm(self.base_ang_vel[:, :2], dim=1))
    return 0.5 * (z_vel_term + rp_vel_term)

def _reward_track_vel_hard(self):
    """Strict combined tracking reward for linear (xy) and yaw velocity commands."""
    # tracking of linear velocity commands (xy axes)
    lin_err = torch.norm(self.commands[:, :2] - self.base_lin_vel[:, :2], dim=1)
    # tracking of angular velocity command (yaw)
    yaw_err = torch.abs(self.commands[:, 2] - self.base_ang_vel[:, 2])
    exp_bonus = 0.5 * (torch.exp(-lin_err * 10) + torch.exp(-yaw_err * 10))
    # small linear penalty keeps a gradient alive far from the target
    return exp_bonus - 0.2 * (lin_err + yaw_err)

def _reward_tracking_lin_vel(self):
    """Exponential reward for tracking the commanded xy linear velocity."""
    sq_err = torch.sum(torch.square(self.commands[:, :2] - self.base_lin_vel[:, :2]), dim=1)
    return torch.exp(-self.cfg.rewards.tracking_sigma * sq_err)

def _reward_tracking_ang_vel(self):
    """Exponential reward for tracking the commanded yaw angular velocity."""
    sq_err = torch.square(self.commands[:, 2] - self.base_ang_vel[:, 2])
    return torch.exp(-self.cfg.rewards.tracking_sigma * sq_err)

def _reward_feet_clearance(self):
    """
    Calculates reward based on the clearance of the swing leg from the ground during movement.
    Encourages appropriate lift of the feet during the swing phase of the gait.
    """
    # Compute feet contact mask (5 N vertical-force threshold)
    contact = self.contact_forces[:, self.feet_indices, 2] > 5.

    # Get the z-position of the feet (0.05 foot-body offset) and integrate the
    # change in z into a running swing-height accumulator
    feet_z = self.rigid_state[:, self.feet_indices, 2] - 0.05
    delta_z = feet_z - self.last_feet_z
    self.feet_height += delta_z
    self.last_feet_z = feet_z

    # Compute swing mask (complement of the commanded stance mask)
    swing_mask = 1 - self._get_gait_phase()

    # feet height should be close to the target height at the peak (±1 cm band)
    rew_pos = torch.abs(self.feet_height - self.cfg.rewards.target_feet_height) < 0.01
    rew_pos = torch.sum(rew_pos * swing_mask, dim=1)
    # reset the accumulator once the foot touches down again
    self.feet_height *= ~contact
    return rew_pos

def _reward_low_speed(self):
    """Score forward speed against the command; a wrong direction dominates everything."""
    speed = torch.abs(self.base_lin_vel[:, 0])
    target = torch.abs(self.commands[:, 0])

    # partition every env into exactly one speed bucket
    too_slow = speed < 0.5 * target
    too_fast = speed > 1.2 * target
    in_range = ~(too_slow | too_fast)

    wrong_way = torch.sign(self.base_lin_vel[:, 0]) != torch.sign(self.commands[:, 0])

    reward = torch.zeros_like(speed)
    # NOTE: later assignments override earlier ones, so sign mismatch wins
    reward[too_slow] = -1.0
    reward[too_fast] = 0.
    reward[in_range] = 1.2
    reward[wrong_way] = -2.0
    # only graded when a meaningful forward command is active
    return reward * (self.commands[:, 0].abs() > 0.1)

def _reward_torques(self):
    """Penalize squared joint torques (effort/energy proxy)."""
    return self.torques.pow(2).sum(dim=1)

def _reward_dof_vel(self):
    """Penalize squared joint velocities, encouraging smoother motion."""
    return self.dof_vel.pow(2).sum(dim=1)

def _reward_dof_acc(self):
    """Penalize squared joint accelerations (finite difference of joint velocity)."""
    accel = (self.last_dof_vel - self.dof_vel) / self.dt
    return accel.pow(2).sum(dim=1)

def _reward_collision(self):
    """Count penalized bodies whose contact-force magnitude indicates a collision."""
    body_forces = torch.norm(self.contact_forces[:, self.penalised_contact_indices, :], dim=-1)
    return (body_forces > 0.1).float().sum(dim=1)

def _reward_action_smoothness(self):
    """Penalize first and second differences of the action sequence plus its magnitude."""
    first_diff = (self.last_actions - self.actions).pow(2).sum(dim=1)
    second_diff = (self.actions + self.last_last_actions - 2 * self.last_actions).pow(2).sum(dim=1)
    magnitude = 0.05 * self.actions.abs().sum(dim=1)
    return first_diff + second_diff + magnitude

humanoid_config.py:

# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2024 Beijing RobotEra TECHNOLOGY CO.,LTD. All rights reserved.


from humanoid.envs.base.legged_robot_config import LeggedRobotCfg, LeggedRobotCfgPPO


class XBotLCfg(LeggedRobotCfg):
    """
    Configuration class for the XBotL humanoid robot.
    """
    class env(LeggedRobotCfg.env):
        # change the observation dim
        frame_stack = 15        # actor observation history length
        c_frame_stack = 3       # critic observation history length
        num_single_obs = 47     # single-frame actor observation size
        num_observations = int(frame_stack * num_single_obs)
        single_num_privileged_obs = 73  # single-frame critic observation size
        num_privileged_obs = int(c_frame_stack * single_num_privileged_obs)
        num_actions = 12
        num_envs = 4096
        episode_length_s = 24  # episode length in seconds
        use_ref_actions = False  # speed up training by using reference actions

    class safety:
        # safety factors
        pos_limit = 1.0
        vel_limit = 1.0
        torque_limit = 0.85

    class asset(LeggedRobotCfg.asset):
        file = '{LEGGED_GYM_ROOT_DIR}/resources/robots/XBot/urdf/XBot-L.urdf'

        name = "XBot-L"
        foot_name = "ankle_roll"
        knee_name = "knee"

        terminate_after_contacts_on = ['base_link']
        penalize_contacts_on = ["base_link"]
        self_collisions = 0  # 1 to disable, 0 to enable...bitwise filter
        flip_visual_attachments = False
        replace_cylinder_with_capsule = False
        fix_base_link = False

    class terrain(LeggedRobotCfg.terrain):
        mesh_type = 'plane'
        # mesh_type = 'trimesh'
        curriculum = False
        # rough terrain only:
        measure_heights = False
        static_friction = 0.6
        dynamic_friction = 0.6
        terrain_length = 8.
        terrain_width = 8.
        num_rows = 20  # number of terrain rows (levels)
        num_cols = 20  # number of terrain cols (types)
        max_init_terrain_level = 10  # starting curriculum state
        # plane; obstacles; uniform; slope_up; slope_down, stair_up, stair_down
        terrain_proportions = [0.2, 0.2, 0.4, 0.1, 0.1, 0, 0]
        restitution = 0.

    class noise:
        add_noise = True
        noise_level = 0.6  # scales other values

        class noise_scales:
            dof_pos = 0.05
            dof_vel = 0.5
            ang_vel = 0.1
            lin_vel = 0.05
            quat = 0.03
            height_measurements = 0.1

    class init_state(LeggedRobotCfg.init_state):
        pos = [0.0, 0.0, 0.95]  # initial base position [m]

        default_joint_angles = {  # = target angles [rad] when action = 0.0
            'left_leg_roll_joint': 0.,
            'left_leg_yaw_joint': 0.,
            'left_leg_pitch_joint': 0.,
            'left_knee_joint': 0.,
            'left_ankle_pitch_joint': 0.,
            'left_ankle_roll_joint': 0.,
            'right_leg_roll_joint': 0.,
            'right_leg_yaw_joint': 0.,
            'right_leg_pitch_joint': 0.,
            'right_knee_joint': 0.,
            'right_ankle_pitch_joint': 0.,
            'right_ankle_roll_joint': 0.,
        }

    class control(LeggedRobotCfg.control):
        # PD Drive parameters (matched by joint-name substring):
        stiffness = {'leg_roll': 200.0, 'leg_pitch': 350.0, 'leg_yaw': 200.0,
                     'knee': 350.0, 'ankle': 15}
        damping = {'leg_roll': 10, 'leg_pitch': 10, 'leg_yaw':
                   10, 'knee': 10, 'ankle': 10}

        # action scale: target angle = actionScale * action + defaultAngle
        action_scale = 0.25
        # decimation: Number of control action updates @ sim DT per policy DT
        decimation = 10  # 100hz

    class sim(LeggedRobotCfg.sim):
        dt = 0.001  # 1000 Hz
        substeps = 1
        up_axis = 1  # 0 is y, 1 is z

        class physx(LeggedRobotCfg.sim.physx):
            num_threads = 10
            solver_type = 1  # 0: pgs, 1: tgs
            num_position_iterations = 4
            num_velocity_iterations = 1
            contact_offset = 0.01  # [m]
            rest_offset = 0.0  # [m]
            bounce_threshold_velocity = 0.1  # [m/s]
            max_depenetration_velocity = 1.0
            max_gpu_contact_pairs = 2**23  # 2**24 -> needed for 8000 envs and more
            default_buffer_size_multiplier = 5
            # 0: never, 1: last sub-step, 2: all sub-steps (default=2)
            contact_collection = 2

    class domain_rand:
        randomize_friction = True
        friction_range = [0.1, 2.0]
        randomize_base_mass = True
        added_mass_range = [-5., 5.]
        push_robots = True
        push_interval_s = 4
        max_push_vel_xy = 0.2
        max_push_ang_vel = 0.4
        # dynamic randomization (see XBotLFreeEnv.step)
        action_delay = 0.5
        action_noise = 0.02

    class commands(LeggedRobotCfg.commands):
        # Vers: lin_vel_x, lin_vel_y, ang_vel_yaw, heading (in heading mode ang_vel_yaw is recomputed from heading error)
        num_commands = 4
        resampling_time = 8.  # time before command are changed[s]
        heading_command = True  # if true: compute ang vel command from heading error

        class ranges:
            lin_vel_x = [-0.3, 0.6]  # min max [m/s]
            lin_vel_y = [-0.3, 0.3]  # min max [m/s]
            ang_vel_yaw = [-0.3, 0.3]  # min max [rad/s]
            heading = [-3.14, 3.14]

    class rewards:
        base_height_target = 0.89
        min_dist = 0.2
        max_dist = 0.5
        # put some settings here for LLM parameter tuning
        target_joint_pos_scale = 0.17  # rad
        target_feet_height = 0.06  # m
        cycle_time = 0.64  # sec
        # if true negative total rewards are clipped at zero (avoids early termination problems)
        only_positive_rewards = True
        # tracking reward = exp(error*sigma)
        tracking_sigma = 5
        max_contact_force = 700  # Forces above this value are penalized

        class scales:
            # reference motion tracking
            joint_pos = 1.6
            feet_clearance = 1.
            feet_contact_number = 1.2
            # gait
            feet_air_time = 1.
            foot_slip = -0.05
            feet_distance = 0.2
            knee_distance = 0.2
            # contact
            feet_contact_forces = -0.01
            # vel tracking
            tracking_lin_vel = 1.2
            tracking_ang_vel = 1.1
            vel_mismatch_exp = 0.5  # lin_z; ang x,y
            low_speed = 0.2
            track_vel_hard = 0.5
            # base pos
            default_joint_pos = 0.5
            orientation = 1.
            base_height = 0.2
            base_acc = 0.2
            # energy
            action_smoothness = -0.002
            torques = -1e-5
            dof_vel = -5e-4
            dof_acc = -1e-7
            collision = -1.

    class normalization:
        class obs_scales:
            lin_vel = 2.
            ang_vel = 1.
            dof_pos = 1.
            dof_vel = 0.05
            quat = 1.
            height_measurements = 5.0
        clip_observations = 18.
        clip_actions = 18.


class XBotLCfgPPO(LeggedRobotCfgPPO):
    """PPO training configuration for the XBot-L humanoid."""
    seed = 5
    runner_class_name = 'OnPolicyRunner'  # DWLOnPolicyRunner

    class policy:
        init_noise_std = 1.0
        actor_hidden_dims = [512, 256, 128]
        critic_hidden_dims = [768, 256, 128]

    class algorithm(LeggedRobotCfgPPO.algorithm):
        entropy_coef = 0.001
        learning_rate = 1e-5
        num_learning_epochs = 2
        gamma = 0.994
        lam = 0.9
        num_mini_batches = 4

    class runner:
        policy_class_name = 'ActorCritic'
        algorithm_class_name = 'PPO'
        num_steps_per_env = 60  # per iteration
        max_iterations = 3001  # number of policy updates

        # logging
        save_interval = 100  # save a checkpoint every `save_interval` iterations
        experiment_name = 'XBot_ppo'
        run_name = ''
        # Load and resume
        resume = False
        load_run = -1  # -1 = last run
        checkpoint = -1  # -1 = last saved model
        resume_path = None  # updated from load_run and chkpt

PPO.py:

# SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2021 ETH Zurich, Nikita Rudin
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2024 Beijing RobotEra TECHNOLOGY CO.,LTD. All rights reserved.

import torch
import torch.nn as nn
import torch.optim as optim

from .actor_critic import ActorCritic
from .rollout_storage import RolloutStorage

class PPO:
    """Proximal Policy Optimization (clipped-surrogate variant).

    Wraps an ``ActorCritic`` model and a ``RolloutStorage`` rollout buffer.
    The training driver is expected to call, per iteration:

    1. ``act()`` / ``process_env_step()`` once per environment step to fill
       the rollout storage,
    2. ``compute_returns()`` once after the rollout to bootstrap the final
       value and build returns/advantages inside the storage,
    3. ``update()`` to run the PPO gradient phase over the collected data.
    """

    # Assigned in __init__; annotated here for readability/tooling.
    actor_critic: ActorCritic

    def __init__(self,
                 actor_critic,
                 num_learning_epochs=1,
                 num_mini_batches=1,
                 clip_param=0.2,
                 gamma=0.998,
                 lam=0.95,
                 value_loss_coef=1.0,
                 entropy_coef=0.0,
                 learning_rate=1e-3,
                 max_grad_norm=1.0,
                 use_clipped_value_loss=True,
                 schedule="fixed",
                 desired_kl=0.01,
                 device='cpu',
                 ):
        """Configure the PPO algorithm.

        Args:
            actor_critic: policy/value network (moved onto ``device``).
            num_learning_epochs: passes over the rollout data per update().
            num_mini_batches: mini-batches per epoch (storage generator).
            clip_param: PPO clipping epsilon for both ratio and value clipping.
            gamma: discount factor (also used for time-out bootstrapping).
            lam: GAE lambda, forwarded to the storage's return computation.
            value_loss_coef: weight of the value loss in the total loss.
            entropy_coef: weight of the (subtracted) entropy bonus.
            learning_rate: initial Adam learning rate; mutated in-place when
                ``schedule == 'adaptive'``.
            max_grad_norm: global gradient-norm clip applied before each step.
            use_clipped_value_loss: if True, use the PPO clipped value loss.
            schedule: 'fixed' or 'adaptive' (KL-driven LR adjustment).
            desired_kl: target KL for the adaptive schedule; ``None`` disables it.
            device: torch device string for model and storage tensors.
        """
        self.device = device

        # Adaptive learning-rate bookkeeping (used only when schedule == 'adaptive').
        self.desired_kl = desired_kl
        self.schedule = schedule
        self.learning_rate = learning_rate

        # PPO components
        self.actor_critic = actor_critic
        self.actor_critic.to(self.device)
        self.storage = None  # initialized later via init_storage()
        self.optimizer = optim.Adam(self.actor_critic.parameters(), lr=learning_rate)
        # Scratch transition, reused (and cleared) every environment step.
        self.transition = RolloutStorage.Transition()

        # PPO parameters
        self.clip_param = clip_param
        self.num_learning_epochs = num_learning_epochs
        self.num_mini_batches = num_mini_batches
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.gamma = gamma
        self.lam = lam
        self.max_grad_norm = max_grad_norm
        self.use_clipped_value_loss = use_clipped_value_loss

    def init_storage(self, num_envs, num_transitions_per_env, actor_obs_shape, critic_obs_shape, action_shape):
        """Allocate the rollout buffer; must be called before collecting data."""
        self.storage = RolloutStorage(num_envs, num_transitions_per_env, actor_obs_shape, critic_obs_shape, action_shape, self.device)

    def test_mode(self):
        """Put the actor-critic into evaluation mode."""
        self.actor_critic.test()

    def train_mode(self):
        """Put the actor-critic into training mode."""
        self.actor_critic.train()

    def act(self, obs, critic_obs):
        """Sample actions for one step and cache everything PPO needs later.

        Stores detached actions, values, log-probs and the policy's
        mean/std into ``self.transition`` so the ratio and KL can be
        computed against the behavior policy during update().
        Returns the sampled actions to be sent to the environment.
        """
        # Compute the actions and values
        self.transition.actions = self.actor_critic.act(obs).detach()
        self.transition.values = self.actor_critic.evaluate(critic_obs).detach()
        self.transition.actions_log_prob = self.actor_critic.get_actions_log_prob(self.transition.actions).detach()
        self.transition.action_mean = self.actor_critic.action_mean.detach()
        self.transition.action_sigma = self.actor_critic.action_std.detach()
        # need to record obs and critic_obs before env.step()
        self.transition.observations = obs
        self.transition.critic_observations = critic_obs
        return self.transition.actions

    def process_env_step(self, rewards, dones, infos):
        """Finish the current transition with the env's outcome and store it."""
        self.transition.rewards = rewards.clone()
        self.transition.dones = dones
        # Bootstrapping on time outs: episodes truncated by the time limit
        # (not true terminations) get gamma * V(s) folded into the reward so
        # the return estimate is not biased toward zero at the cutoff.
        if 'time_outs' in infos:
            self.transition.rewards += self.gamma * torch.squeeze(self.transition.values * infos['time_outs'].unsqueeze(1).to(self.device), 1)

        # Record the transition
        self.storage.add_transitions(self.transition)
        self.transition.clear()
        # Reset any recurrent/internal policy state for environments that ended.
        self.actor_critic.reset(dones)

    def compute_returns(self, last_critic_obs):
        """Bootstrap from the final critic observation and build GAE returns
        and advantages inside the rollout storage."""
        last_values = self.actor_critic.evaluate(last_critic_obs).detach()
        self.storage.compute_returns(last_values, self.gamma, self.lam)

    def update(self):
        """Run the PPO optimization phase over the stored rollout.

        Iterates ``num_learning_epochs`` x ``num_mini_batches`` mini-batches,
        optionally adapting the learning rate to keep the policy KL near
        ``desired_kl``, and clears the storage afterwards.

        Returns:
            (mean_value_loss, mean_surrogate_loss) averaged over all updates.
        """
        mean_value_loss = 0
        mean_surrogate_loss = 0

        generator = self.storage.mini_batch_generator(self.num_mini_batches, self.num_learning_epochs)
        for obs_batch, critic_obs_batch, actions_batch, target_values_batch, advantages_batch, returns_batch, old_actions_log_prob_batch, \
            old_mu_batch, old_sigma_batch, hid_states_batch, masks_batch in generator:

            # Re-evaluate the current policy on the stored observations.
            self.actor_critic.act(obs_batch, masks=masks_batch, hidden_states=hid_states_batch[0])
            actions_log_prob_batch = self.actor_critic.get_actions_log_prob(actions_batch)
            value_batch = self.actor_critic.evaluate(critic_obs_batch, masks=masks_batch, hidden_states=hid_states_batch[1])
            mu_batch = self.actor_critic.action_mean
            sigma_batch = self.actor_critic.action_std
            entropy_batch = self.actor_critic.entropy

            # KL: closed-form KL(old || new) between diagonal Gaussians,
            # summed over action dimensions (1e-5 guards the log argument).
            # Used to steer the learning rate toward desired_kl.
            if self.desired_kl != None and self.schedule == 'adaptive':
                with torch.inference_mode():
                    kl = torch.sum(
                        torch.log(sigma_batch / old_sigma_batch + 1.e-5) + (torch.square(old_sigma_batch) + torch.square(old_mu_batch - mu_batch)) / (2.0 * torch.square(sigma_batch)) - 0.5, axis=-1)
                    kl_mean = torch.mean(kl)

                    # Policy moved too fast -> shrink LR; too slow -> grow it.
                    # Bounds [1e-5, 1e-2] keep the schedule from running away.
                    if kl_mean > self.desired_kl * 2.0:
                        self.learning_rate = max(1e-5, self.learning_rate / 1.5)
                    elif kl_mean < self.desired_kl / 2.0 and kl_mean > 0.0:
                        self.learning_rate = min(1e-2, self.learning_rate * 1.5)

                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = self.learning_rate

            # Surrogate loss: clipped PPO objective (negated for minimization).
            ratio = torch.exp(actions_log_prob_batch - torch.squeeze(old_actions_log_prob_batch))
            surrogate = -torch.squeeze(advantages_batch) * ratio
            surrogate_clipped = -torch.squeeze(advantages_batch) * torch.clamp(ratio, 1.0 - self.clip_param,
                                                                               1.0 + self.clip_param)
            surrogate_loss = torch.max(surrogate, surrogate_clipped).mean()

            # Value function loss: optionally clip the value update around the
            # old value estimate and take the pessimistic (max) of the two.
            if self.use_clipped_value_loss:
                value_clipped = target_values_batch + (value_batch - target_values_batch).clamp(-self.clip_param,
                                                                                                self.clip_param)
                value_losses = (value_batch - returns_batch).pow(2)
                value_losses_clipped = (value_clipped - returns_batch).pow(2)
                value_loss = torch.max(value_losses, value_losses_clipped).mean()
            else:
                value_loss = (returns_batch - value_batch).pow(2).mean()

            # Total loss: surrogate + weighted value loss - entropy bonus.
            loss = surrogate_loss + self.value_loss_coef * value_loss - self.entropy_coef * entropy_batch.mean()

            # Gradient step (with global grad-norm clipping).
            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.actor_critic.parameters(), self.max_grad_norm)
            self.optimizer.step()

            mean_value_loss += value_loss.item()
            mean_surrogate_loss += surrogate_loss.item()

        num_updates = self.num_learning_epochs * self.num_mini_batches
        mean_value_loss /= num_updates
        mean_surrogate_loss /= num_updates
        self.storage.clear()

        return mean_value_loss, mean_surrogate_loss

本文字数:0

预计阅读时间:0 分钟


统计信息加载中...

有问题?请向我提出issue