From 9a21e81abdecac3212fb1117d79be5b0a5770efe Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Wed, 17 Apr 2024 11:53:36 +0800 Subject: [PATCH 01/18] feat: support cbf methods --- omnisafe/adapter/__init__.py | 1 + omnisafe/adapter/barrier_function_adapter.py | 219 ++++++++ .../adapter/beta_barrier_function_adapter.py | 245 +++++++++ .../offpolicy_barrier_function_adapter.py | 151 ++++++ .../robust_barrier_function_adapter.py | 174 ++++++ omnisafe/algorithms/__init__.py | 4 + omnisafe/algorithms/off_policy/__init__.py | 15 +- omnisafe/algorithms/off_policy/ddpg_cbf.py | 93 ++++ omnisafe/algorithms/off_policy/sac_rcbf.py | 175 ++++++ omnisafe/algorithms/on_policy/__init__.py | 3 + .../on_policy/barrier_function/__init__.py | 24 + .../on_policy/barrier_function/ppo_cbf.py | 106 ++++ .../on_policy/barrier_function/trpo_cbf.py | 117 ++++ omnisafe/common/barrier_comp.py | 86 +++ omnisafe/common/barrier_solver.py | 251 +++++++++ omnisafe/common/buffer/onpolicy_buffer.py | 12 +- .../common/buffer/vector_onpolicy_buffer.py | 17 + omnisafe/common/robust_barrier_solver.py | 428 +++++++++++++++ omnisafe/common/robust_gp_model.py | 498 ++++++++++++++++++ omnisafe/common/utils.py | 215 ++++++++ omnisafe/configs/off-policy/DDPGCBF.yaml | 171 ++++++ omnisafe/configs/off-policy/SACRCBF.yaml | 148 ++++++ omnisafe/configs/on-policy/IPO.yaml | 20 +- omnisafe/configs/on-policy/PPOBetaCBF.yaml | 120 +++++ omnisafe/configs/on-policy/TRPO.yaml | 32 ++ omnisafe/configs/on-policy/TRPOCBF.yaml | 139 +++++ omnisafe/envs/__init__.py | 2 + omnisafe/envs/barrier_function_env.py | 209 ++++++++ omnisafe/envs/robust_barrier_function_env.py | 224 ++++++++ omnisafe/envs/unicycle_env.py | 366 +++++++++++++ omnisafe/models/actor/actor_builder.py | 9 + omnisafe/models/actor/beta_learning_actor.py | 144 +++++ omnisafe/typing.py | 2 +- 33 files changed, 4386 insertions(+), 34 deletions(-) create mode 100644 omnisafe/adapter/barrier_function_adapter.py create mode 100644 
omnisafe/adapter/beta_barrier_function_adapter.py create mode 100644 omnisafe/adapter/offpolicy_barrier_function_adapter.py create mode 100644 omnisafe/adapter/robust_barrier_function_adapter.py create mode 100644 omnisafe/algorithms/off_policy/ddpg_cbf.py create mode 100644 omnisafe/algorithms/off_policy/sac_rcbf.py create mode 100644 omnisafe/algorithms/on_policy/barrier_function/__init__.py create mode 100644 omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py create mode 100644 omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py create mode 100644 omnisafe/common/barrier_comp.py create mode 100644 omnisafe/common/barrier_solver.py create mode 100644 omnisafe/common/robust_barrier_solver.py create mode 100644 omnisafe/common/robust_gp_model.py create mode 100644 omnisafe/common/utils.py create mode 100644 omnisafe/configs/off-policy/DDPGCBF.yaml create mode 100644 omnisafe/configs/off-policy/SACRCBF.yaml create mode 100644 omnisafe/configs/on-policy/PPOBetaCBF.yaml create mode 100644 omnisafe/configs/on-policy/TRPOCBF.yaml create mode 100644 omnisafe/envs/barrier_function_env.py create mode 100644 omnisafe/envs/robust_barrier_function_env.py create mode 100644 omnisafe/envs/unicycle_env.py create mode 100644 omnisafe/models/actor/beta_learning_actor.py diff --git a/omnisafe/adapter/__init__.py b/omnisafe/adapter/__init__.py index ba768a7eb..75d4539ba 100644 --- a/omnisafe/adapter/__init__.py +++ b/omnisafe/adapter/__init__.py @@ -22,3 +22,4 @@ from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter from omnisafe.adapter.saute_adapter import SauteAdapter from omnisafe.adapter.simmer_adapter import SimmerAdapter +from omnisafe.adapter.beta_barrier_function_adapter import BetaBarrierFunctionAdapter diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py new file mode 100644 index 000000000..47fa9b871 --- /dev/null +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -0,0 +1,219 @@ +# Copyright 
2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BarrierFunction Adapter for OmniSafe.""" + +from __future__ import annotations + +import torch +from rich.progress import track + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils.config import Config +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.barrier_comp import BarrierCompensator + +from omnisafe.envs.wrapper import ( + AutoReset, + CostNormalize, + RewardNormalize, + TimeLimit, + Unsqueeze, +) + +class BarrierFunctionAdapter(OnPolicyAdapter): + """BarrierFunction Adapter for OmniSafe. + + The BarrierFunction Adapter is used to establish the logic of interaction between agents and the + environment based on control barrier functions. Its key feature is the introduction of action + compensators and barrier function solvers. + + Args: + env_id (str): The environment id. + num_envs (int): The number of parallel environments. + seed (int): The random seed. + cfgs (Config): The configuration passed from yaml file. 
+ """ + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + super().__init__(env_id, num_envs, seed, cfgs) + self.solver = None + self.compensator = None + self.first_iter = 1 + + def _wrapper( + self, + obs_normalize: bool = False, + reward_normalize: bool = True, + cost_normalize: bool = True, + ) -> None: + """Wrapper the environment. + + .. warning:: + Since solving the optimization problem requires obtaining physical quantities with practical + significance from state observations, the Barrier Function Adapter does not support + normalization of observations. + + Args: + obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. + reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True. + cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True. + """ + assert not obs_normalize, 'Barrier function does not support observation normalization!' 
+ if self._env.need_time_limit_wrapper: + self._env = TimeLimit(self._env, time_limit=1000, device=self._device) + self._eval_env = TimeLimit(self._eval_env, time_limit=1000, device=self._device) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env, device=self._device) + self._eval_env = AutoReset(self._eval_env, device=self._device) + if reward_normalize: + self._env = RewardNormalize(self._env, device=self._device) + if cost_normalize: + self._env = CostNormalize(self._env, device=self._device) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env, device=self._device) + self._eval_env = Unsqueeze(self._eval_env, device=self._device) + + def set_solver(self, solver: PendulumSolver): + """Set the barrier function solver for Pendulum environment.""" + self.solver: PendulumSolver = solver + + def set_compensator(self, compensator: BarrierCompensator): + """Set the action compensator.""" + self.compensator: BarrierCompensator = compensator + + def reset_gp_model(self): + """Reset the gaussian processing model of barrier function solver.""" + self.solver.GP_model_prev = self.solver.GP_model.copy() + self.solver.build_GP_model() + + def rollout( # pylint: disable=too-many-locals + self, + steps_per_epoch: int, + agent: ConstraintActorCritic, + buffer: VectorOnPolicyBuffer, + logger: Logger, + ) -> None: + """Rollout the environment and store the data in the buffer. + + .. warning:: + As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically, + so the final observation will be stored in ``info['final_observation']``. + + Args: + steps_per_epoch (int): Number of steps per epoch. + agent (ConstraintActorCritic): Constraint actor-critic, including actor , reward critic + and cost critic. + buffer (VectorOnPolicyBuffer): Vector on-policy buffer. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. 
+ """ + self._reset_log() + if not self.first_iter: + self.reset_gp_model() + + obs, _ = self.reset() + while abs(self._env.unwrapped.state[0]) > 1: + obs, _ = self._env.reset() + path_obs = [] + path_act = [] + for step in track( + range(steps_per_epoch), + description=f'Processing rollout for epoch: {logger.current_epoch}...', + ): + with torch.no_grad(): + value_r = agent.reward_critic(obs)[0] + value_c = agent.cost_critic(obs)[0] + act_dist = agent.actor(obs) + act_mean, act_std = act_dist.mean, agent.actor.std + + approx_compensating_act = self.compensator(obs=obs) + compensated_act_mean_raw = act_mean + approx_compensating_act + + if self.first_iter: + [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = False) + else: + [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = True) + + compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) + + compensated_act_mean = compensated_act_mean_raw + compensating_act + final_act = torch.normal(compensated_act_mean, act_std) + + logp = agent.actor.log_prob(final_act).detach() + path_obs.append(obs.detach().cpu().squeeze().numpy()) + path_act.append(final_act.detach().cpu().squeeze().numpy()) + + next_obs, reward, cost, terminated, truncated, info = self.step(final_act) + + self._log_value(reward=reward, cost=cost, info=info) + + if self._cfgs.algo_cfgs.use_cost: + logger.store({'Value/cost': value_c}) + logger.store({'Value/reward': value_r}) + logger.store({'Metrics/angle': cost}) + + buffer.store( + obs=obs, + act=final_act, + reward=reward, + cost=cost, + value_r=value_r, + value_c=value_c, + logp=logp, + approx_compensating_act=approx_compensating_act.detach(), + compensating_act=compensating_act.detach(), + ) + + obs = next_obs + epoch_end = step >= steps_per_epoch + for idx, (done, time_out) in enumerate(zip(terminated, truncated)): + if epoch_end or done or time_out: + last_value_r = torch.zeros(1) + last_value_c = torch.zeros(1) + if not done: + if 
epoch_end: + logger.log( + f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.', + ) + _, last_value_r, last_value_c, _ = agent.step(obs[idx]) + if time_out: + _, last_value_r, last_value_c, _ = agent.step( + obs[idx], + ) + last_value_r = last_value_r.unsqueeze(0) + last_value_c = last_value_c.unsqueeze(0) + + if done or time_out: + self._log_metrics(logger, idx) + self._reset_log(idx) + + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 + + if step < 650: + self.solver.update_GP_dynamics(obs = path_obs, act = path_act) + + path_obs = [] + path_act = [] + obs, _ = self.reset() + while abs(self._env.unwrapped.state[0]) > 1: + obs, _ = self._env.reset() + buffer.finish_path(last_value_r, last_value_c, idx) + self.first_iter = 0 + diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py new file mode 100644 index 000000000..f785c3062 --- /dev/null +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -0,0 +1,245 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""BarrierFunction Adapter for OmniSafe.""" + +from __future__ import annotations + +import torch +import numpy as np +from rich.progress import track + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils.config import Config +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.barrier_comp import BarrierCompensator + +from omnisafe.envs.wrapper import ( + AutoReset, + CostNormalize, + RewardNormalize, + TimeLimit, + Unsqueeze, +) + + +def cbf(state=None, eta: float = 0.99): + """ + Calculates CBF constraint set at a given state. Default is + the current state. + """ + + state = state + g = 9.8 + m = 1 + l = 1 + tau = 5e-2 + theta_safety_bounds = [-1.0, 1.0] + thetadot_safety_bounds = [-np.inf, np.inf] + torque_bounds = [-15.0, 15.0] + if (eta>1-1e-3) or (eta<1e-5): + raise ValueError("eta should be inside (0, 1)") + c1 = ((3 * g)/(2 * l)) + c2 = (3 /(m * (l ** 2))) + + theta, thetadot = state[0], state[1] + theta_min, theta_max = theta_safety_bounds[0], theta_safety_bounds[1] + thetadot_min, thetadot_max = thetadot_safety_bounds[0], thetadot_safety_bounds[1] + u_min1 = (1/c2) * (((1 / (tau **2)) * (-eta * (theta - theta_min) - tau * thetadot)) - c1 * np.sin(theta) ) + u_max1 = (1/c2) * (((1 / (tau **2)) * ( eta * (theta_max - theta) - tau * thetadot)) - c1 * np.sin(theta) ) + + + u_min2 = (1/c2) * (((1 / (tau)) * (-eta * (thetadot - thetadot_min))) - c1 * np.sin(theta) ) + u_max2 = (1/c2) * (((1 / (tau)) * ( eta * (thetadot_max - thetadot))) - c1 * np.sin(theta) ) + + u_min = max(u_min1, u_min2, torque_bounds[0]) + u_max = min(u_max1, u_max2, torque_bounds[1]) + + u_min=torque_bounds[0] + u_max=torque_bounds[1] + if u_min>u_max: + raise 
ValueError("Infeasible") + else: + return [u_min, u_max] + +def vectorize_f(f): #--vipul :added action_dim + """ + Converts a function f defined on 1D numpy arrays and outputting pairs of + scalars into a vectorized function accepting batches of + torch tensorized arrays and output pairs of torch tensors. + """ + + def vectorized_f_(obs): #--vipul :added action_dim + + obs = obs.cpu().detach().numpy() + + if len(obs.shape) == 1: # check to see if obs is a batch or single obs + batch_size = 1 + lbs, ubs = f(obs) + lbs=np.array(lbs) + ubs=np.array(ubs) + #lbs = -5 + #ubs = 5 + + else: + batch_size = obs.shape[0] + lbs = np.zeros([batch_size, 1]) + ubs = np.zeros([batch_size, 1]) + for i in range(batch_size): + lbs[i], ubs[i] = f(obs[i]) + + lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) + ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) + + return lbs, ubs + + return vectorized_f_ + + +class BetaBarrierFunctionAdapter(OnPolicyAdapter): + """BarrierFunction Adapter for OmniSafe. + + The BarrierFunction Adapter is used to establish the logic of interaction between agents and the + environment based on control barrier functions. Its key feature is the introduction of action + compensators and barrier function solvers. + + Args: + env_id (str): The environment id. + num_envs (int): The number of parallel environments. + seed (int): The random seed. + cfgs (Config): The configuration passed from yaml file. + """ + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + super().__init__(env_id, num_envs, seed, cfgs) + self.solver = None + self.compensator = None + self.first_iter = 1 + self.constraint_fn = vectorize_f(cbf) + + def _wrapper( + self, + obs_normalize: bool = False, + reward_normalize: bool = True, + cost_normalize: bool = True, + ) -> None: + """Wrapper the environment. + + .. 
warning:: + Since solving the optimization problem requires obtaining physical quantities with practical + significance from state observations, the Barrier Function Adapter does not support + normalization of observations. + + Args: + obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. + reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True. + cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True. + """ + assert not obs_normalize, 'Barrier function does not support observation normalization!' + if reward_normalize: + self._env = RewardNormalize(self._env, device=self._device) + if cost_normalize: + self._env = CostNormalize(self._env, device=self._device) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env, device=self._device) + self._eval_env = Unsqueeze(self._eval_env, device=self._device) + + def rollout( # pylint: disable=too-many-locals + self, + steps_per_epoch: int, + agent: ConstraintActorCritic, + buffer: VectorOnPolicyBuffer, + logger: Logger, + ) -> None: + """Rollout the environment and store the data in the buffer. + + .. warning:: + As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically, + so the final observation will be stored in ``info['final_observation']``. + + Args: + steps_per_epoch (int): Number of steps per epoch. + agent (ConstraintActorCritic): Constraint actor-critic, including actor , reward critic + and cost critic. + buffer (VectorOnPolicyBuffer): Vector on-policy buffer. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. 
+ """ + self._reset_log() + obs, _ = self.reset() + while abs(self._env.unwrapped.state[0]) > 1: + obs, _ = self._env.reset() + for step in track( + range(steps_per_epoch), + description=f'Processing rollout for epoch: {logger.current_epoch}...', + ): + with torch.no_grad(): + act, value_r, value_c, logp = agent.step(obs) + lb, ub = self.constraint_fn(obs) + final_act = lb + (ub-lb)*act + + next_obs, reward, cost, terminated, truncated, info = self.step(final_act) + + self._log_value(reward=reward, cost=cost, info=info) + + if self._cfgs.algo_cfgs.use_cost: + logger.store({'Value/cost': value_c}) + logger.store({'Value/reward': value_r}) + logger.store({'Metrics/angle': info.get('original_cost', cost).cpu()}) + + buffer.store( + obs=obs, + act=act, + reward=reward, + cost=cost, + value_r=value_r, + value_c=value_c, + logp=logp, + ) + + obs = next_obs + epoch_end = step >= steps_per_epoch + for idx, (done, time_out) in enumerate(zip(terminated, truncated)): + if epoch_end or done or time_out: + last_value_r = torch.zeros(1) + last_value_c = torch.zeros(1) + if not done: + if epoch_end: + logger.log( + f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.', + ) + _, last_value_r, last_value_c, _ = agent.step(obs[idx]) + if time_out: + _, last_value_r, last_value_c, _ = agent.step( + obs[idx], + ) + last_value_r = last_value_r.unsqueeze(0) + last_value_c = last_value_c.unsqueeze(0) + + if done or time_out: + self._log_metrics(logger, idx) + self._reset_log(idx) + + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 + obs, _ = self.reset() + while abs(self._env.unwrapped.state[0]) > 1: + obs, _ = self._env.reset() + buffer.finish_path(last_value_r, last_value_c, idx) + self.first_iter = 0 + diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py new file mode 100644 index 000000000..b05e950cb --- /dev/null +++ 
b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -0,0 +1,151 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BarrierFunction Adapter for OmniSafe.""" + +from __future__ import annotations + +import torch +import numpy as np + +from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter +from omnisafe.common.buffer import VectorOffPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.utils.config import Config +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.robust_barrier_solver import CBFQPLayer +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic +from omnisafe.common.robust_gp_model import DynamicsModel + +from omnisafe.envs.wrapper import ( + CostNormalize, + RewardNormalize, + Unsqueeze, +) + +class OffPolicyBarrierFunctionAdapter(OffPolicyAdapter): + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + super().__init__(env_id, num_envs, seed, cfgs) + self.solver = None + self.compensator = None + self.first_iter = 1 + self.episode_rollout = {} + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + 
self.episode_rollout['approx_compensating_act'] = [] + self.episode_rollout['compensating_act'] = [] + + def _wrapper( + self, + obs_normalize: bool = False, + reward_normalize: bool = True, + cost_normalize: bool = True, + ) -> None: + assert not obs_normalize, 'Barrier function does not support observation normalization!' + if reward_normalize: + self._env = RewardNormalize(self._env, device=self._device) + if cost_normalize: + self._env = CostNormalize(self._env, device=self._device) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env, device=self._device) + self._eval_env = Unsqueeze(self._eval_env, device=self._device) + + def set_solver(self, solver: PendulumSolver): + """Set the barrier function solver for Pendulum environment.""" + self.solver: PendulumSolver = solver + + def set_compensator(self, compensator: BarrierCompensator): + """Set the action compensator.""" + self.compensator: BarrierCompensator = compensator + + def reset_gp_model(self): + """Reset the gaussian processing model of barrier function solver.""" + self.solver.GP_model_prev = self.solver.GP_model.copy() + self.solver.build_GP_model() + + def rollout( # pylint: disable=too-many-locals + self, + rollout_step: int, + agent: ConstraintActorQCritic, + buffer: VectorOffPolicyBuffer, + logger: Logger, + use_rand_action: bool, + ) -> None: + for _ in range(rollout_step): + if use_rand_action: + act = torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)).unsqueeze(0) + else: + act = agent.actor.predict(self._current_obs, deterministic=False) + + final_act = self.get_safe_action(obs=self._current_obs, act=act) + + self.episode_rollout['obs'].append(self._current_obs) + self.episode_rollout['final_act'].append(final_act) + + next_obs, reward, cost, terminated, truncated, info = self.step(final_act) + logger.store({'Metrics/angle': cost}) + + self._log_value(reward=reward, cost=cost, info=info) + + buffer.store( + obs=self._current_obs, + act=act, + 
reward=reward, + cost=cost, + done=torch.logical_and(terminated, torch.logical_xor(terminated, truncated)), + next_obs=next_obs, + ) + + self._current_obs = next_obs + for idx, done in enumerate(torch.logical_or(terminated, truncated)): + if done: + self._log_metrics(logger, idx) + compensator_loss = self.compensator.train( + torch.cat(self.episode_rollout['obs']), + torch.cat(self.episode_rollout['approx_compensating_act']), + torch.cat(self.episode_rollout['compensating_act']), + ) + logger.store({'Value/Loss_compensator': compensator_loss.item()}) + self.solver.update_GP_dynamics(obs=torch.cat(self.episode_rollout['obs']), act=torch.cat(self.episode_rollout['final_act'])) + + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + self.episode_rollout['approx_compensating_act'] = [] + self.episode_rollout['compensating_act'] = [] + + self._reset_log(idx) + self._current_obs, _ = self._env.reset() + self.first_iter = 0 + if not self.first_iter: + self.reset_gp_model() + + @torch.no_grad + def get_safe_action(self, obs, act): + approx_compensating_act = self.compensator(obs=self._current_obs) + compensated_act_mean_raw = act + approx_compensating_act + + if self.first_iter: + [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = False) + else: + [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = True) + + compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) + safe_act = compensated_act_mean_raw + compensating_act + + self.episode_rollout['compensating_act'].append(compensating_act) + self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) + return safe_act \ No newline at end of file diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py new file mode 100644 index 000000000..f58f1e176 --- /dev/null +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -0,0 +1,174 @@ +# Copyright 2023 
OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BarrierFunction Adapter for OmniSafe.""" + +from __future__ import annotations + +import torch +import numpy as np + +from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter +from omnisafe.common.buffer import VectorOffPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.utils.config import Config +from omnisafe.common.robust_barrier_solver import CBFQPLayer +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic +from omnisafe.typing import OmnisafeSpace +from omnisafe.common.robust_gp_model import DynamicsModel + + +from omnisafe.envs.wrapper import ( + CostNormalize, + RewardNormalize, + Unsqueeze, +) + +class RobustBarrierFunctionAdapter(OffPolicyAdapter): + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + super().__init__(env_id, num_envs, seed, cfgs) + self.solver = None + self.compensator = None + self._current_steps = 0 + self._num_episodes = 0 + + def _wrapper( + self, + obs_normalize: bool = False, + reward_normalize: bool = True, + cost_normalize: bool = True, + ) -> None: + """Wrapper the environment. + + .. 
warning:: + Since solving the optimization problem requires obtaining physical quantities with practical + significance from state observations, the Barrier Function Adapter does not support + normalization of observations. + + Args: + obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. + reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True. + cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True. + """ + assert not obs_normalize, 'Barrier function does not support observation normalization!' + if reward_normalize: + self._env = RewardNormalize(self._env, device=self._device) + if cost_normalize: + self._env = CostNormalize(self._env, device=self._device) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env, device=self._device) + self._eval_env = Unsqueeze(self._eval_env, device=self._device) + # self._env = ActionScale(self._env, low=-1.0, high=1.0, device=self._device) + # self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device) + + def set_solver(self, solver: CBFQPLayer): + """Set the barrier function solver for Pendulum environment.""" + self.solver: CBFQPLayer = solver + self.solver.env = self._env + + def set_dynamics_model(self, dynamics_model: DynamicsModel): + """Set the dynamics model.""" + self.dynamics_model = dynamics_model + self.dynamics_model.env = self._env + + def rollout( # pylint: disable=too-many-locals + self, + rollout_step: int, + agent: ConstraintActorQCritic, + buffer: VectorOffPolicyBuffer, + logger: Logger, + use_rand_action: bool, + ) -> None: + """Rollout the environment and store the data in the buffer. + + .. warning:: + As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically, + so the final observation will be stored in ``info['final_observation']``. + + Args: + rollout_step (int): Number of rollout steps. 
+ agent (ConstraintActorCritic): Constraint actor-critic, including actor, reward critic, + and cost critic. + buffer (VectorOnPolicyBuffer): Vector on-policy buffer. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. + use_rand_action (bool): Whether to use random action. + """ + for _ in range(rollout_step): + state = self.dynamics_model.get_state(self._current_obs) # 动态模型将观测转换为状态,状态和观测之间有一个互逆的转换 + self._current_steps += 1 + if use_rand_action: + act = torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)).unsqueeze(0).to(self._device) + else: + act = agent.step(self._current_obs, deterministic=False) + + final_act = self.get_safe_action(obs=self._current_obs, act=act) + next_obs, reward, cost, terminated, truncated, info = self.step(final_act) + self._log_value(reward=reward, cost=cost, info=info) + + buffer.store( + obs=self._current_obs, + act=final_act, + reward=reward, + cost=cost, + done=torch.logical_and(terminated, torch.logical_xor(terminated, truncated)), + next_obs=next_obs, + ) + + if self._ep_len[0] % 2 == 0 and self._num_episodes < self._cfgs.dynamics_model_cfgs.gp_max_episodes: + next_state = self.dynamics_model.get_state(next_obs) + self.dynamics_model.append_transition(state.cpu().detach().numpy(), final_act.cpu().detach().numpy(), next_state.cpu().detach().numpy(), t_batch=np.array([self._ep_len[0]*self._env.dt])) + + self._current_obs = next_obs + for idx, done in enumerate(torch.logical_or(terminated, truncated)): + if done: + self._log_metrics(logger, idx) + self._reset_log(idx) + self._num_episodes += 1 + self._current_obs, _ = self._env.reset() + + @property + def safe_action_space(self) -> OmnisafeSpace: + if hasattr(self._env, 'safe_action_space'): + return self._env.safe_action_space + else: + return self._env.action_space + + def get_safe_action(self, obs, act, modular=False, cbf_info_batch=None): + """Given a nominal action, returns a minimally-altered safe action to take. 
+ + Parameters + ---------- + obs : torch.tensor + act : torch.tensor + dynamics_model : DynamicsModel + + Returns + ------- + safe_act : torch.tensor + Safe actions to be taken (cbf_action + action). + """ + state_batch = self.dynamics_model.get_state(obs) + mean_pred_batch, sigma_pred_batch = self.dynamics_model.predict_disturbance(state_batch) + safe_act = self.solver.get_safe_action(state_batch, act, mean_pred_batch, sigma_pred_batch, modular=modular, cbf_info_batch=cbf_info_batch) + + return safe_act + + def __getattr__(self, name): + try: + return getattr(self._env, name) + except AttributeError: + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") \ No newline at end of file diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index df6832226..f25928ad2 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -35,6 +35,8 @@ DDPGLag, SACLag, TD3Lag, + SACRCBF, + DDPGCBF, ) # Offline Safe @@ -63,6 +65,8 @@ TRPOLag, TRPOSaute, TRPOSimmerPID, + TRPOCBF, + PPOBetaCBF, ) diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py index 80e48e1a0..e87bd82f2 100644 --- a/omnisafe/algorithms/off_policy/__init__.py +++ b/omnisafe/algorithms/off_policy/__init__.py @@ -24,17 +24,8 @@ from omnisafe.algorithms.off_policy.td3 import TD3 from omnisafe.algorithms.off_policy.td3_lag import TD3Lag from omnisafe.algorithms.off_policy.td3_pid import TD3PID +from omnisafe.algorithms.off_policy.sac_rcbf import SACRCBF +from omnisafe.algorithms.off_policy.ddpg_cbf import DDPGCBF -__all__ = [ - 'DDPG', - 'TD3', - 'SAC', - 'DDPGLag', - 'TD3Lag', - 'SACLag', - 'DDPGPID', - 'TD3PID', - 'SACPID', - 'CRABS', -] +__all__ = ['DDPG', 'TD3', 'SAC', 'DDPGLag', 'TD3Lag', 'SACLag', 'DDPGPID', 'TD3PID', 'SACPID', 'SACRCBF', 'DDPGCBF', 'CRABS'] diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py new file mode 100644 index 
@registry.register
# pylint: disable-next=too-many-instance-attributes, too-few-public-methods
class DDPGCBF(DDPG):
    """The DDPG algorithm with a Control Barrier Function safety layer.

    References:
        - Title: End-to-End Safe Reinforcement Learning through Barrier Functions for
            Safety-Critical Continuous Control Tasks
        - Authors: Richard Cheng, Gabor Orosz, Richard M. Murray, Joel W. Burdick.
        - URL: `DDPGCBF <https://arxiv.org/abs/1903.08792>`_
    """

    def _init_env(self) -> None:
        """Initialize the environment adapter, the CBF QP solver, and the compensator."""
        self._env: OffPolicyBarrierFunctionAdapter = OffPolicyBarrierFunctionAdapter(
            self._env_id,
            self._cfgs.train_cfgs.vector_env_nums,
            self._seed,
            self._cfgs,
        )
        solver = PendulumSolver(device=self._cfgs.train_cfgs.device)
        compensator = BarrierCompensator(
            obs_dim=self._env.observation_space.shape[0],
            act_dim=self._env.action_space.shape[0],
            cfgs=self._cfgs.compensator_cfgs,
        )

        self._env.set_compensator(compensator=compensator)
        self._env.set_solver(solver=solver)

        assert (
            self._cfgs.algo_cfgs.steps_per_epoch % self._cfgs.train_cfgs.vector_env_nums == 0
        ), 'The number of steps per epoch is not divisible by the number of environments.'

        assert (
            int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0
        ), 'The total number of steps is not divisible by the number of steps per epoch.'
        self._epochs: int = int(
            self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch,
        )
        self._epoch: int = 0
        self._steps_per_epoch: int = (
            self._cfgs.algo_cfgs.steps_per_epoch // self._cfgs.train_cfgs.vector_env_nums
        )

        self._update_cycle: int = self._cfgs.algo_cfgs.update_cycle
        assert (
            self._steps_per_epoch % self._update_cycle == 0
        ), 'The number of steps per epoch is not divisible by the number of steps per sample.'
        self._samples_per_epoch: int = self._steps_per_epoch // self._update_cycle
        self._update_count: int = 0

    def _init(self) -> None:
        """Extend the replay buffer with fields recording the compensator's actions."""
        super()._init()
        self._buf.add_field(
            name='approx_compensating_act',
            shape=self._env.action_space.shape,
            dtype=torch.float32,
        )
        self._buf.add_field(
            name='compensating_act',
            shape=self._env.action_space.shape,
            dtype=torch.float32,
        )

    def _init_log(self) -> None:
        """Register the DDPGCBF-specific logger keys.

        +------------------------+------------------------------------------+
        | Things to log          | Description                              |
        +========================+==========================================+
        | Metrics/angle          | Pendulum angle (Pendulum-v1 only).       |
        +------------------------+------------------------------------------+
        | Value/Loss_compensator | Loss of the barrier compensator network. |
        +------------------------+------------------------------------------+
        """
        super()._init_log()
        if self._cfgs.env_id == 'Pendulum-v1':
            self._logger.register_key('Metrics/angle', min_and_max=True)
        self._logger.register_key('Value/Loss_compensator')
@registry.register
# pylint: disable-next=too-many-instance-attributes, too-few-public-methods
class SACRCBF(SAC):
    """The Soft Actor-Critic algorithm with Robust Control Barrier Function.

    References:
        - Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning
            with a Stochastic Actor
        - Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine.
        - URL: `SAC <https://arxiv.org/abs/1801.01290>`_
    """

    def _init_env(self) -> None:
        """Initialize the robust barrier adapter, the CBF QP layer, and the GP dynamics model."""
        self._env: RobustBarrierFunctionAdapter = RobustBarrierFunctionAdapter(
            self._env_id,
            self._cfgs.train_cfgs.vector_env_nums,
            self._seed,
            self._cfgs,
        )
        solver = CBFQPLayer(
            env=self._env,
            device=self._cfgs.train_cfgs.device,
            gamma_b=self._cfgs.cbf_cfgs.gamma_b,
            k_d=self._cfgs.cbf_cfgs.k_d,
            l_p=self._cfgs.cbf_cfgs.l_p,
        )
        dynamics_model = DynamicsModel(env=self._env)

        self._env.set_dynamics_model(dynamics_model=dynamics_model)
        self._env.set_solver(solver=solver)

        assert (
            self._cfgs.algo_cfgs.steps_per_epoch % self._cfgs.train_cfgs.vector_env_nums == 0
        ), 'The number of steps per epoch is not divisible by the number of environments.'

        assert (
            int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0
        ), 'The total number of steps is not divisible by the number of steps per epoch.'
        self._epochs: int = int(
            self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch,
        )
        self._epoch: int = 0
        self._steps_per_epoch: int = (
            self._cfgs.algo_cfgs.steps_per_epoch // self._cfgs.train_cfgs.vector_env_nums
        )

        self._update_cycle: int = self._cfgs.algo_cfgs.update_cycle
        assert (
            self._steps_per_epoch % self._update_cycle == 0
        ), 'The number of steps per epoch is not divisible by the number of steps per sample.'
        self._samples_per_epoch: int = self._steps_per_epoch // self._update_cycle
        self._update_count: int = 0

    def _init_log(self) -> None:
        """Register the SACRCBF-specific logger keys (pendulum angle, Pendulum-v1 only)."""
        super()._init_log()
        if self._cfgs.env_id == 'Pendulum-v1':
            self._logger.register_key('Metrics/angle', min_and_max=True)

    def _update_actor(
        self,
        obs: torch.Tensor,
    ) -> None:
        """Update the actor, then re-tune the entropy temperature on the *safe* action.

        Unlike plain SAC, the temperature loss uses the log-probability of the
        action after the CBF safety filter, so the entropy target reflects the
        actions actually executed.

        Args:
            obs (torch.Tensor): The ``observation`` sampled from buffer.
        """
        super()._update_actor(obs)

        if self._cfgs.algo_cfgs.auto_alpha:
            with torch.no_grad():
                action = self._actor_critic.actor.predict(obs, deterministic=False)
                action = self._env.get_safe_action(obs, action)
                log_prob = self._actor_critic.actor.log_prob(action)
            alpha_loss = -self._log_alpha * (log_prob + self._target_entropy).mean()

            self._alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self._alpha_optimizer.step()
            self._logger.store(
                {
                    'Loss/alpha_loss': alpha_loss.mean().item(),
                },
            )
        self._logger.store(
            {
                'Value/alpha': self._alpha,
            },
        )

    def _update_reward_critic(
        self,
        obs: torch.Tensor,
        action: torch.Tensor,
        reward: torch.Tensor,
        done: torch.Tensor,
        next_obs: torch.Tensor,
    ) -> None:
        """Update the reward critic using safe next-actions for the Bellman target.

        Args:
            obs (torch.Tensor): The ``observation`` sampled from buffer.
            action (torch.Tensor): The ``action`` sampled from buffer.
            reward (torch.Tensor): The ``reward`` sampled from buffer.
            done (torch.Tensor): The ``terminated`` sampled from buffer.
            next_obs (torch.Tensor): The ``next observation`` sampled from buffer.
        """
        with torch.no_grad():
            next_action = self._actor_critic.actor.predict(next_obs, deterministic=False)
            # Filter the sampled next-action through the CBF layer before bootstrapping.
            next_action = self._env.get_safe_action(next_obs, next_action)
            next_logp = self._actor_critic.actor.log_prob(next_action)
            next_q1_value_r, next_q2_value_r = self._actor_critic.target_reward_critic(
                next_obs,
                next_action,
            )
            next_q_value_r = torch.min(next_q1_value_r, next_q2_value_r) - next_logp * self._alpha
            target_q_value_r = reward + self._cfgs.algo_cfgs.gamma * (1 - done) * next_q_value_r

        q1_value_r, q2_value_r = self._actor_critic.reward_critic(obs, action)
        loss = nn.functional.mse_loss(q1_value_r, target_q_value_r) + nn.functional.mse_loss(
            q2_value_r,
            target_q_value_r,
        )

        if self._cfgs.algo_cfgs.use_critic_norm:
            for param in self._actor_critic.reward_critic.parameters():
                loss += param.pow(2).sum() * self._cfgs.algo_cfgs.critic_norm_coeff

        self._actor_critic.reward_critic_optimizer.zero_grad()
        loss.backward()

        if self._cfgs.algo_cfgs.max_grad_norm:
            clip_grad_norm_(
                self._actor_critic.reward_critic.parameters(),
                self._cfgs.algo_cfgs.max_grad_norm,
            )
        self._actor_critic.reward_critic_optimizer.step()
        self._logger.store(
            {
                'Loss/Loss_reward_critic': loss.mean().item(),
                'Value/reward_critic': q1_value_r.mean().item(),
            },
        )

    def _loss_pi(
        self,
        obs: torch.Tensor,
    ) -> torch.Tensor:
        """Compute the SAC actor loss on the CBF-filtered action.

        Args:
            obs (torch.Tensor): The ``observation`` sampled from buffer.

        Returns:
            The loss of pi/actor.
        """
        action = self._actor_critic.actor.predict(obs, deterministic=False)
        action = self._env.get_safe_action(obs, action)
        log_prob = self._actor_critic.actor.log_prob(action)
        q1_value_r, q2_value_r = self._actor_critic.reward_critic(obs, action)
        return (self._alpha * log_prob - torch.min(q1_value_r, q2_value_r)).mean()
omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, TRPOEarlyTerminated @@ -36,6 +37,7 @@ from omnisafe.algorithms.on_policy.saute import PPOSaute, TRPOSaute from omnisafe.algorithms.on_policy.second_order import CPO, PCPO from omnisafe.algorithms.on_policy.simmer import PPOSimmerPID, TRPOSimmerPID +from omnisafe.algorithms.on_policy.barrier_function import TRPOCBF, PPOBetaCBF __all__ = [ @@ -49,4 +51,5 @@ *saute.__all__, *second_order.__all__, *simmer.__all__, + *barrier_function.__all__, ] diff --git a/omnisafe/algorithms/on_policy/barrier_function/__init__.py b/omnisafe/algorithms/on_policy/barrier_function/__init__.py new file mode 100644 index 000000000..273ca2831 --- /dev/null +++ b/omnisafe/algorithms/on_policy/barrier_function/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Control Barrier Function Safe Reinforcement Learning algorithms.""" + +from omnisafe.algorithms.on_policy.barrier_function.trpo_cbf import TRPOCBF +from omnisafe.algorithms.on_policy.barrier_function.ppo_cbf import PPOBetaCBF + + +__all__ = [ + 'TRPOCBF', + 'PPOBetaCBF', +] diff --git a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py new file mode 100644 index 000000000..e7711ed3c --- /dev/null +++ b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py @@ -0,0 +1,106 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@registry.register
class PPOBetaCBF(PPO):
    """PPO with a Beta-policy Control Barrier Function safety layer."""

    def _init_env(self) -> None:
        """Initialize the beta barrier-function environment adapter."""
        self._env: BetaBarrierFunctionAdapter = BetaBarrierFunctionAdapter(
            self._env_id,
            self._cfgs.train_cfgs.vector_env_nums,
            self._seed,
            self._cfgs,
        )
        assert (self._cfgs.algo_cfgs.steps_per_epoch) % (
            distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums
        ) == 0, 'The number of steps per epoch is not divisible by the number of environments.'
        self._steps_per_epoch: int = (
            self._cfgs.algo_cfgs.steps_per_epoch
            // distributed.world_size()
            // self._cfgs.train_cfgs.vector_env_nums
        )

    def _init_log(self) -> None:
        """Register the PPOBetaCBF-specific logger keys.

        The class previously defined ``_init_log`` twice and the second definition
        silently overrode the first, so only the effective registration is kept.
        NOTE(review): the overridden duplicate also registered
        ``Value/Loss_compensator`` — confirm whether that key is ever stored before
        re-adding it.
        """
        super()._init_log()
        self._logger.register_key('Metrics/angle', min_and_max=True)

    def _loss_pi(
        self,
        obs: torch.Tensor,
        act: torch.Tensor,
        logp: torch.Tensor,
        adv: torch.Tensor,
    ) -> torch.Tensor:
        r"""Computing pi/actor loss.

        In Proximal Policy Optimization, the loss is defined as:

        .. math::

            L^{CLIP} = \underset{s_t \sim \rho_{\theta}}{\mathbb{E}} \left[
                \min ( r_t A^{R}_{\pi_{\theta}} (s_t, a_t) , \text{clip} (r_t, 1 - \epsilon, 1 + \epsilon)
                A^{R}_{\pi_{\theta}} (s_t, a_t)
            \right]

        where :math:`r_t = \frac{\pi_{\theta}^{'} (a_t|s_t)}{\pi_{\theta} (a_t|s_t)}`,
        :math:`\epsilon` is the clip parameter, and :math:`A^{R}_{\pi_{\theta}} (s_t, a_t)` is the
        advantage.

        Args:
            obs (torch.Tensor): The ``observation`` sampled from buffer.
            act (torch.Tensor): The ``action`` sampled from buffer.
            logp (torch.Tensor): The ``log probability`` of action sampled from buffer.
            adv (torch.Tensor): The ``advantage`` processed. ``reward_advantage`` here.

        Returns:
            The loss of pi/actor.
        """
        distribution = self._actor_critic.actor(obs)
        logp_ = self._actor_critic.actor.log_prob(act)
        ratio = torch.exp(logp_ - logp)
        ratio_cliped = torch.clamp(
            ratio,
            1 - self._cfgs.algo_cfgs.clip,
            1 + self._cfgs.algo_cfgs.clip,
        )
        loss = -torch.min(ratio * adv, ratio_cliped * adv).mean()
        loss -= self._cfgs.algo_cfgs.entropy_coef * distribution.entropy().mean()
        # useful extra info
        entropy = distribution.entropy().mean().item()
        self._logger.store(
            {
                'Train/Entropy': entropy,
                'Train/PolicyRatio': ratio,
                'Loss/Loss_pi': loss.mean().item(),
            },
        )
        return loss
@registry.register
class TRPOCBF(TRPO):
    """TRPO with a Control Barrier Function safety layer and barrier compensator."""

    def _init_log(self) -> None:
        """Register the TRPOCBF-specific logger keys.

        +------------------------+------------------------------------------+
        | Things to log          | Description                              |
        +========================+==========================================+
        | Metrics/angle          | Pendulum angle.                          |
        +------------------------+------------------------------------------+
        | Value/Loss_compensator | Loss of the barrier compensator network. |
        +------------------------+------------------------------------------+
        """
        super()._init_log()
        self._logger.register_key('Metrics/angle', min_and_max=True)
        self._logger.register_key('Value/Loss_compensator')

    def _init_env(self) -> None:
        """Initialize the barrier-function adapter, the CBF solver, and the compensator."""
        self._env: BarrierFunctionAdapter = BarrierFunctionAdapter(
            self._env_id,
            self._cfgs.train_cfgs.vector_env_nums,
            self._seed,
            self._cfgs,
        )
        assert (self._cfgs.algo_cfgs.steps_per_epoch) % (
            distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums
        ) == 0, 'The number of steps per epoch is not divisible by the number of environments.'
        self._steps_per_epoch: int = (
            self._cfgs.algo_cfgs.steps_per_epoch
            // distributed.world_size()
            // self._cfgs.train_cfgs.vector_env_nums
        )
        self.solver = PendulumSolver(device=self._cfgs.train_cfgs.device)
        self.compensator = BarrierCompensator(
            obs_dim=self._env.observation_space.shape[0],
            act_dim=self._env.action_space.shape[0],
            cfgs=self._cfgs.compensator_cfgs,
        )
        self._env.set_solver(solver=self.solver)
        self._env.set_compensator(compensator=self.compensator)

    def _init(self) -> None:
        """Extend the buffer with fields recording the compensator's actions."""
        super()._init()
        self._buf.add_field(
            name='approx_compensating_act',
            shape=self._env.action_space.shape,
            dtype=torch.float32,
        )
        self._buf.add_field(
            name='compensating_act',
            shape=self._env.action_space.shape,
            dtype=torch.float32,
        )

    def _update(self) -> None:
        """Update the actor, the barrier compensator, and the critics.

        The actor is updated once on the whole on-policy batch (TRPO line search),
        then the barrier compensator is trained to predict the compensating actions
        recorded during rollout, and finally the reward (and optionally cost)
        critics are updated for ``update_iters`` epochs over mini-batches.
        """
        data = self._buf.get()

        obs, act, logp, target_value_r, target_value_c, adv_r, adv_c, approx_compensating_act, compensating_act = (
            data['obs'],
            data['act'],
            data['logp'],
            data['target_value_r'],
            data['target_value_c'],
            data['adv_r'],
            data['adv_c'],
            data['approx_compensating_act'],
            data['compensating_act'],
        )

        self._update_actor(obs, act, logp, adv_r, adv_c)
        compensator_loss = self._env.compensator.train(
            observation=obs,
            approx_compensating_act=approx_compensating_act,
            compensating_act=compensating_act,
        )
        dataloader = DataLoader(
            dataset=TensorDataset(obs, target_value_r, target_value_c),
            batch_size=self._cfgs.algo_cfgs.batch_size,
            shuffle=True,
        )

        for _ in range(self._cfgs.algo_cfgs.update_iters):
            for (
                obs,
                target_value_r,
                target_value_c,
            ) in dataloader:
                self._update_reward_critic(obs, target_value_r)
                if self._cfgs.algo_cfgs.use_cost:
                    self._update_cost_critic(obs, target_value_c)

        self._logger.store(
            {
                'Train/StopIter': self._cfgs.algo_cfgs.update_iters,
                'Value/Adv': adv_r.mean().item(),
                'Value/Loss_compensator': compensator_loss.item(),
            },
        )
class BarrierCompensator(torch.nn.Module):
    """A module that represents a barrier compensator using an MLP network.

    The network learns to predict the accumulated compensating action applied by the
    CBF safety layer for a given observation, so that future nominal actions can be
    pre-compensated.

    Attributes:
        model (torch.nn.Module): The MLP network.
        optimizer (torch.optim.Optimizer): The optimizer for training the network.

    Args:
        obs_dim (int): Dimension of the observation space.
        act_dim (int): Dimension of the action space.
        cfgs (Config): Configuration parameters for the network and training.
    """

    def __init__(self, obs_dim: int, act_dim: int, cfgs: Config):
        super().__init__()
        self._cfgs: Config = cfgs
        self.model: torch.nn.Module = build_mlp_network(
            sizes=[obs_dim, *self._cfgs.hidden_sizes, act_dim],
            activation=self._cfgs.activation,
            weight_initialization_mode=self._cfgs.weight_initialization_mode,
        )
        self.optimizer: optim.Adam = optim.Adam(self.parameters(), lr=self._cfgs.lr)

    def forward(self, obs: torch.Tensor) -> torch.Tensor:
        """Estimate the sum of previous compensating actions.

        Args:
            obs (torch.Tensor): The input observation.

        Returns:
            torch.Tensor: The estimation of previous compensating actions.
        """
        return self.model(obs)

    # NOTE(review): this method shadows ``torch.nn.Module.train(mode)``, so
    # ``.train()``/``.eval()`` mode switching is unavailable on this module. The
    # name is kept because callers invoke ``compensator.train(observation=...)``.
    def train(self, observation: torch.Tensor, approx_compensating_act: torch.Tensor, compensating_act: torch.Tensor) -> torch.Tensor:
        """Train the barrier compensator model.

        Minimizes the squared error between the model's output and the target,
        which is the sum of the approximate and the actual compensating action.

        Args:
            observation (torch.Tensor): The observation data.
            approx_compensating_act (torch.Tensor): The approximate compensating action.
            compensating_act (torch.Tensor): The actual compensating action.

        Returns:
            torch.Tensor: The loss of the final update iteration.
        """
        # The regression target does not change across iterations; compute it once.
        target = approx_compensating_act + compensating_act
        for _ in range(self._cfgs.update_iters):
            self.optimizer.zero_grad()
            loss = torch.pow(self(observation) - target, 2).mean()
            loss.backward()
            self.optimizer.step()

        return loss
class PendulumSolver:
    """CBF-QP safety filter for the inverted pendulum with GP disturbance models.

    A quadratic program minimally alters the RL action so the pendulum stays inside
    the barrier-certified safe set; per-dimension Gaussian Process regressors learn
    the mismatch between the nominal dynamics and observed transitions.

    Attributes:
        action_size (int): Size of the action space.
        observation_size (int): Size of the observation space.
        torque_bound (float): Maximum torque bound.
        max_speed (float): Maximum speed of the pendulum.
    """

    def __init__(self, action_size: int = 1, observation_size: int = 3,
                 torque_bound: float = 15., max_speed: float = 60.,
                 device: str = 'cpu') -> None:
        """Initializes the PendulumSolver with specified parameters.

        Args:
            action_size (int): Size of the action space.
            observation_size (int): Size of the observation space.
            torque_bound (float): Maximum torque bound.
            max_speed (float): Maximum speed of the pendulum.
            device (str): Device to place returned tensors on.
        """
        self.action_size = action_size
        self.observation_size = observation_size
        self.torque_bound = torque_bound
        self.max_speed = max_speed
        self.F = 1.0  # barrier decay budget on the QP right-hand side
        self._device = device
        self._gamma_b = 0.5  # CBF decay rate
        self._kd = 1.5  # confidence multiplier on the GP standard deviation
        self._build_barrier()
        self.build_GP_model()
        self.GP_model_prev = None

    def build_GP_model(self) -> None:
        """Builds one Gaussian Process regressor per modeled state dimension."""
        gp_list = []
        noise = 0.01
        for _ in range(self.observation_size - 1):
            kern = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
            gp = GaussianProcessRegressor(kernel=kern, alpha=noise, n_restarts_optimizer=10)
            gp_list.append(gp)
        self.GP_model = gp_list

    def _build_barrier(self) -> None:
        """Builds the QP cost matrices and the four barrier half-planes."""
        # The very large second weight penalizes the slack variable, keeping the
        # QP feasible while strongly discouraging constraint violation.
        self.P = matrix(np.diag([1., 1e16]), tc='d')
        self.q = matrix(np.zeros(self.action_size + 1))
        self.h1 = np.array([1, 0.01])
        self.h2 = np.array([1, -0.01])
        self.h3 = np.array([-1, 0.01])
        self.h4 = np.array([-1, -0.01])

    def control_barrier(self, original_action: torch.Tensor, f: np.ndarray, g: np.ndarray, x: np.ndarray, std: np.ndarray) -> torch.Tensor:
        """
        Adjusts the original action using a control barrier function to ensure
        that the action complies with the system's physical constraints.

        Args:
            original_action (torch.Tensor): The original action proposed by the RL algorithm.
            f (np.ndarray): The drift component of the system's dynamics.
            g (np.ndarray): The control component of the system's dynamics.
            x (np.ndarray): The current state of the system.
            std (np.ndarray): The standard deviation of the system's state.

        Returns:
            torch.Tensor: The adjusted action that respects the system's constraints.
        """
        # Use the values configured in __init__ instead of re-hardcoding them here.
        gamma_b = self._gamma_b
        kd = self._kd
        # .cpu() ensures the conversion also works for CUDA tensors.
        u_rl = original_action.detach().cpu().numpy()

        # Set up Quadratic Program to satisfy the Control Barrier Function:
        # rows are the four barrier constraints, the two torque bounds, and the
        # two speed bounds; columns are [u_bar, slack].
        G = np.array(
            [
                [
                    -np.dot(self.h1, g),
                    -np.dot(self.h2, g),
                    -np.dot(self.h3, g),
                    -np.dot(self.h4, g),
                    1,
                    -1,
                    g[1],
                    -g[1]
                ],
                [
                    -1,
                    -1,
                    -1,
                    -1,
                    0,
                    0,
                    0,
                    0
                ]
            ]
        )
        G = np.transpose(G)
        h = np.array(
            [
                gamma_b * self.F + np.dot(self.h1, f) + np.dot(self.h1, g) * u_rl - (1 - gamma_b) * np.dot(self.h1, x) - kd * np.abs(np.dot(self.h1, std)),
                gamma_b * self.F + np.dot(self.h2, f) + np.dot(self.h2, g) * u_rl - (1 - gamma_b) * np.dot(self.h2, x) - kd * np.abs(np.dot(self.h2, std)),
                gamma_b * self.F + np.dot(self.h3, f) + np.dot(self.h3, g) * u_rl - (1 - gamma_b) * np.dot(self.h3, x) - kd * np.abs(np.dot(self.h3, std)),
                gamma_b * self.F + np.dot(self.h4, f) + np.dot(self.h4, g) * u_rl - (1 - gamma_b) * np.dot(self.h4, x) - kd * np.abs(np.dot(self.h4, std)),
                -u_rl + self.torque_bound,
                u_rl + self.torque_bound,
                -f[1] - g[1] * u_rl + self.max_speed,
                f[1] + g[1] * u_rl + self.max_speed
            ]
        )
        h = np.squeeze(h).astype(np.double)

        # Convert numpy arrays to cvx matrices to set up QP
        G = matrix(G, tc='d')
        h = matrix(h, tc='d')
        solvers.options['show_progress'] = False
        sol = solvers.qp(self.P, self.q, G, h)
        u_bar = sol['x']

        # Clamp numerically-violated solutions back onto the torque bounds.
        if np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) - 0.001 >= self.torque_bound:
            u_bar[0] = self.torque_bound - u_rl
            print("Error in QP")
        elif np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) + 0.001 <= -self.torque_bound:
            u_bar[0] = -self.torque_bound - u_rl
            print("Error in QP")

        return torch.as_tensor(u_bar[0], dtype=torch.float32, device=self._device).unsqueeze(dim=0)

    def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray:
        """
        Calculates the nominal discrete-time dynamics of the pendulum based on the
        current observation and the original action.

        Args:
            obs (list[float]): The current observation of the system state.
            original_action (float): The original action proposed by the RL algorithm.

        Returns:
            np.ndarray: The calculated dynamics of the system.
        """
        dt = 0.05  # Time step
        G = 10  # Gravitational constant
        # NOTE(review): mass/length here (m=2, l=2) differ from the nominal model
        # in get_GP_dynamics (m=1, l=1) — confirm which parameterization is intended.
        m = 2  # Mass
        l = 2  # Length

        theta = np.arctan2(obs[1], obs[0])  # Calculate the angle
        theta_dot = obs[2]  # Angular velocity

        # Dynamics equations
        f = np.array([-3 * G / (2 * l) * np.sin(theta + np.pi) * dt**2 + theta_dot * dt + theta + 3 / (m * l**2) * original_action * dt**2,
                      theta_dot - 3 * G / (2 * l) * np.sin(theta + np.pi) * dt + 3 / (m * l**2) * original_action * dt])

        return np.squeeze(f)

    def update_GP_dynamics(self, obs: torch.Tensor, act: torch.Tensor) -> None:
        """
        Fits the GP models to the residual between observed transitions and the
        nominal dynamics.

        Args:
            obs (torch.Tensor): Observed states along a trajectory.
            act (torch.Tensor): Actions taken along the trajectory.
        """
        obs = obs.detach().cpu().squeeze().numpy()
        act = act.detach().cpu().squeeze().numpy()
        N = self.observation_size
        X = obs
        U = act
        L = len(X)
        err = np.zeros((L-1, N-1))
        S = np.zeros((L-1, 2))
        for i in range(L-1):
            f = self.get_dynamics(X[i], U[i])
            theta_p = np.arctan2(X[i][1], X[i][0])
            theta_dot_p = X[i][2]
            theta = np.arctan2(X[i+1][1], X[i+1][0])
            theta_dot = X[i+1][2]
            S[i, :] = np.array([theta_p, theta_dot_p])
            err[i, :] = np.array([theta, theta_dot]) - f
        self.GP_model[0].fit(S, err[:, 0])
        self.GP_model[1].fit(S, err[:, 1])

    def get_GP_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]:
        """
        Retrieves the GP-corrected dynamics for the current observation.

        Args:
            obs (torch.Tensor): Current state observation.
            use_prev_model (bool): If True, query the previously-saved GP models
                (``GP_model_prev``) instead of the current ones.

        Returns:
            list[np.ndarray]: list containing the GP dynamics [f, g, x, std].
        """
        obs = obs.cpu().detach().numpy()
        u_rl = 0
        dt = 0.05
        G = 10
        m = 1
        l = 1
        obs = np.squeeze(obs)
        theta = np.arctan2(obs[1], obs[0])
        theta_dot = obs[2]
        x = np.array([theta, theta_dot])  # [theta, theta_dot] state corresponding to the observation
        f_nom = np.array(
            [
                -3*G/(2*l)*np.sin(theta + np.pi)*dt**2 + theta_dot*dt + theta + 3/(m*l**2)*u_rl*dt**2,
                theta_dot - 3*G/(2*l)*np.sin(theta + np.pi)*dt + 3/(m*l**2)*u_rl*dt
            ]
        )
        g = np.array([3/(m*l**2)*dt**2, 3/(m*l**2)*dt])
        f_nom = np.squeeze(f_nom)
        f = np.zeros(2)
        if use_prev_model:
            [m1, std1] = self.GP_model_prev[0].predict(x.reshape(1, -1), return_std=True)
            [m2, std2] = self.GP_model_prev[1].predict(x.reshape(1, -1), return_std=True)
        else:
            [m1, std1] = self.GP_model[0].predict(x.reshape(1, -1), return_std=True)
            [m2, std2] = self.GP_model[1].predict(x.reshape(1, -1), return_std=True)
        f[0] = f_nom[0] + m1
        f[1] = f_nom[1] + m2
        return [np.squeeze(f), np.squeeze(g), np.squeeze(x), np.array([np.squeeze(std1), np.squeeze(std2)])]
""" self.ptr, self.path_start_idx = 0, 0 - - data = { - 'obs': self.data['obs'], - 'act': self.data['act'], - 'target_value_r': self.data['target_value_r'], - 'adv_r': self.data['adv_r'], - 'logp': self.data['logp'], - 'discounted_ret': self.data['discounted_ret'], - 'adv_c': self.data['adv_c'], - 'target_value_c': self.data['target_value_c'], - } + data = self.data.copy() adv_mean, adv_std, *_ = distributed.dist_statistics_scalar(data['adv_r']) cadv_mean, *_ = distributed.dist_statistics_scalar(data['adv_c']) diff --git a/omnisafe/common/buffer/vector_onpolicy_buffer.py b/omnisafe/common/buffer/vector_onpolicy_buffer.py index a920d8e6a..a8e2c25a8 100644 --- a/omnisafe/common/buffer/vector_onpolicy_buffer.py +++ b/omnisafe/common/buffer/vector_onpolicy_buffer.py @@ -87,6 +87,23 @@ def __init__( # pylint: disable=super-init-not-called,too-many-arguments ) for _ in range(num_envs) ] + + def add_field(self, name: str, shape: tuple[int, ...], dtype: torch.dtype) -> None: + """Add a field to the buffer. + + Examples: + >>> buffer = BaseBuffer(...) + >>> buffer.add_field('new_field', (2, 3), torch.float32) + >>> buffer.data['new_field'].shape + >>> (buffer.size, 2, 3) + + Args: + name (str): The name of the field. + shape (tuple of int): The shape of the field. + dtype (torch.dtype): The dtype of the field. 
+ """ + for buffer in self.buffers: + buffer.add_field(name=name, shape=shape, dtype=dtype) @property def num_buffers(self) -> int: diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py new file mode 100644 index 000000000..80d8d33b6 --- /dev/null +++ b/omnisafe/common/robust_barrier_solver.py @@ -0,0 +1,428 @@ +import numpy as np +import torch +from cvxopt import matrix +from cvxopt import solvers +from omnisafe.common.utils import to_tensor, prRed, sort_vertices_cclockwise +from qpth.qp import QPFunction + +DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}, # state = [x y θ] + 'SimulatedCars': {'n_s': 10, 'n_u': 1}, # state = [x y θ v ω] + 'Pvtol': {'n_s': 6, 'n_u': 2}, # state = [x y θ v_x v_y thrust] + 'Pendulum-v1': {'n_s': 3, 'n_u': 1} + } + + +class CBFQPLayer: + + def __init__(self, env, device='cpu', gamma_b=20, k_d=3.0, l_p=0.03): + """Constructor of CBFLayer. + + Parameters + ---------- + env : gym.env + Gym environment. + gamma_b : float, optional + gamma of control barrier certificate. + k_d : float, optional + confidence parameter desired (2.0 corresponds to ~95% for example). + """ + + self.device = torch.device(device) + + self.env = env + self.u_min, self.u_max = self.get_control_bounds() + self.gamma_b = gamma_b + + self.k_d = k_d + self.l_p = l_p + + self.action_dim = env.action_space.shape[0] + + def get_safe_action(self, state_batch, action_batch, mean_pred_batch, sigma_batch, modular=False, cbf_info_batch=None): # TODO: 迁移的核心在于此,把它用CBF的方法来改写就好 + """ + + Parameters + ---------- + state_batch : torch.tensor or ndarray + action_batch : torch.tensor or ndarray + State batch + mean_pred_batch : torch.tensor or ndarray + Mean of disturbance + sigma_batch : torch.tensor or ndarray + Standard deviation of disturbance + + Returns + ------- + final_action_batch : torch.tensor + Safe actions to take in the environment. 
+ """ + + # batch form if only a single data point is passed + expand_dims = len(state_batch.shape) == 1 + if expand_dims: + action_batch = action_batch.unsqueeze(0) + state_batch = state_batch.unsqueeze(0) + mean_pred_batch = mean_pred_batch.unsqueeze(0) + sigma_batch = sigma_batch.unsqueeze(0) + if cbf_info_batch is not None: + cbf_info_batch = cbf_info_batch.unsqueeze(0) + + if modular: + final_action = torch.clamp(action_batch, self.u_min.repeat(action_batch.shape[0], 1), self.u_max.repeat(action_batch.shape[0], 1)) + else: + Ps, qs, Gs, hs = self.get_cbf_qp_constraints(state_batch, action_batch, mean_pred_batch, sigma_batch, modular=modular, cbf_info_batch=cbf_info_batch) + + Ps, qs, Gs, hs = Ps.detach().cpu().numpy(), qs.detach().cpu().numpy(), Gs.detach().cpu().numpy(), hs.detach().cpu().numpy() + batch_size = Ps.shape[0] + safe_actions = [] + for i in range(batch_size): + Ps_m = matrix(np.diag([1., 1e16]), tc='d') + qs_m = matrix(np.zeros(2)) + Gs_m = matrix(np.float64(Gs[i]), tc='d') + hs_m = matrix(np.float64(hs[i]), tc='d') + solvers.options['show_progress'] = False + sol = solvers.qp(Ps_m, qs_m, Gs_m, hs_m) + safe_action=torch.as_tensor(sol['x'][0], dtype=torch.float32) + safe_actions.append(safe_action) + safe_action_batch = torch.as_tensor(safe_actions, dtype=torch.float32, device=self.device).unsqueeze(-1) + + # print(action_batch.shape, safe_action_batch.shape) + # safe_action_batch = self.solve_qp(Ps, qs, Gs, hs) + final_action = torch.clamp(action_batch + safe_action_batch, self.u_min.repeat(action_batch.shape[0], 1), self.u_max.repeat(action_batch.shape[0], 1)) + + return final_action if not expand_dims else final_action.squeeze(0) + + def solve_qp(self, Ps: torch.Tensor, qs: torch.Tensor, Gs: torch.Tensor, hs: torch.Tensor): + """Solves: + minimize_{u,eps} 0.5 * u^T P u + q^T u + subject to G[u,eps]^T <= h + + Parameters + ---------- + Ps : torch.Tensor + (batch_size, n_u+1, n_u+1) + qs : torch.Tensor + (batch_size, n_u+1) + Gs : torch.Tensor + 
(batch_size, num_ineq_constraints, n_u+1) + hs : torch.Tensor + (batch_size, num_ineq_constraints) + Returns + ------- + safe_action_batch : torch.tensor + The solution of the qp without the last dimension (the slack). + """ + + Ghs = torch.cat((Gs, hs.unsqueeze(2)), -1) + Ghs_norm = torch.max(torch.abs(Ghs), dim=2, keepdim=True)[0] + Gs /= Ghs_norm + hs = hs / Ghs_norm.squeeze(-1) + sol = self.cbf_layer(Ps, qs, Gs, hs, solver_args={"check_Q_spd": False, "maxIter": 100000, "notImprovedLim": 10, "eps": 1e-4}) + safe_action_batch = sol[:, :self.env.action_space.shape[0]] + return safe_action_batch + + def cbf_layer(self, Qs, ps, Gs, hs, As=None, bs=None, solver_args=None): + """ + + Parameters + ---------- + Qs : torch.Tensor + ps : torch.Tensor + Gs : torch.Tensor + shape (batch_size, num_ineq_constraints, num_vars) + hs : torch.Tensor + shape (batch_size, num_ineq_constraints) + As : torch.Tensor, optional + bs : torch.Tensor, optional + solver_args : dict, optional + + Returns + ------- + result : torch.Tensor + Result of QP + """ + + if solver_args is None: + solver_args = {} + + if As is None or bs is None: + As = torch.Tensor().to(self.device).double() + bs = torch.Tensor().to(self.device).double() + + result = QPFunction(verbose=-1, **solver_args)(Qs.double(), ps.double(), Gs.double(), hs.double(), As, bs).float() + if torch.any(torch.isnan(result)): + prRed('QP Failed to solve - result is nan == {}!'.format(torch.any(torch.isnan(result)))) + raise Exception('QP Failed to solve') + return result + + def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sigma_pred_batch, modular=False, cbf_info_batch=None): # TODO: 解耦合的核心在这里 + """Build up matrices required to solve qp + + Program specifically solves: + minimize_{u,eps} 0.5 * u^T P u + q^T u + subject to G[u,eps]^T <= h + + Each control barrier certificate is of the form: + dh/dx^T (f_out + g_out u) >= -gamma^b h_out^3 where out here is an output of the state. 
+ + In the case of SafetyGym_point dynamics: + state = [x y θ v ω] + state_d = [v*cos(θ) v*sin(θ) omega ω u^v u^ω] + + Quick Note on batch matrix multiplication for matrices A and B: + - Batch size should be first dim + - Everything needs to be 3-dimensional + - E.g. if B is a vec, i.e. shape (batch_size, vec_length) --> .view(batch_size, vec_length, 1) + + Parameters + ---------- + state_batch : torch.tensor + current state (check dynamics.py for details on each dynamics' specifics) + action_batch : torch.tensor + Nominal control input. + mean_pred_batch : torch.tensor + mean disturbance prediction state, dimensions (n_s, n_u) + sigma_pred_batch : torch.tensor + standard deviation in additive disturbance after undergoing the output dynamics. + gamma_b : float, optional + CBF parameter for the class-Kappa function + + Returns + ------- + P : torch.tensor + Quadratic cost matrix in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) + q : torch.tensor + Linear cost vector in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) + G : torch.tensor + Inequality constraint matrix (G[u,eps] <= h) of size (num_constraints, n_u + 1) + h : torch.tensor + Inequality constraint vector (G[u,eps] <= h) of size (num_constraints,) + """ + + assert len(state_batch.shape) == 2 and len(action_batch.shape) == 2 and len(mean_pred_batch.shape) == 2 and len(sigma_pred_batch.shape) == 2, print(state_batch.shape, action_batch.shape, mean_pred_batch.shape, sigma_pred_batch.shape) + + batch_size = state_batch.shape[0] + gamma_b = self.gamma_b + + # Expand dims + state_batch = torch.unsqueeze(state_batch, -1).to(self.device) + action_batch = torch.unsqueeze(action_batch, -1).to(self.device) + mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1).to(self.device) + sigma_pred_batch = torch.unsqueeze(sigma_pred_batch, -1).to(self.device) + + if self.env.dynamics_mode == 'Pendulum': + num_constraints = 8 + n_u = action_batch.shape[1] # dimension of control inputs + # Inequality constraints (G[u, eps] <= h) + G 
= torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible) + h = torch.zeros((batch_size, num_constraints)).to(self.device) + + h1 = torch.FloatTensor([1, 0.01]).unsqueeze(-1).to(self.device) + h2 = torch.FloatTensor([1, -0.01]).unsqueeze(-1).to(self.device) + h3 = torch.FloatTensor([-1, 0.01]).unsqueeze(-1).to(self.device) + h4 = torch.FloatTensor([-1, -0.01]).unsqueeze(-1).to(self.device) + action_batch_scaled=(action_batch*15.0).squeeze(-1).to(self.device) # TODO: 写的好看点 + + theta = state_batch[:,0,:].squeeze(-1) + theta_dot = state_batch[:,1,:].squeeze(-1) + f_norm = torch.zeros(batch_size, 2).to(self.device) + # theta [batch_size, 1] + f_norm[:, 0] = -3*10/2*torch.sin(theta+torch.pi)*self.env.dt + theta + f_norm[: ,1] = theta_dot - 3*10/2*torch.sin(theta+torch.pi) + + g = torch.tensor([3*self.env.dt**2, 3*self.env.dt]).unsqueeze(0).to(self.device) + + f = torch.zeros_like(f_norm).to(self.device) + f[:, 0] = f_norm[:, 0] + mean_pred_batch[:,0,:].squeeze(-1) + f[:, 1] = f_norm[:, 1] + mean_pred_batch[:,1,:].squeeze(-1) + G = torch.tensor( + [ + [ + -torch.matmul(g, h1), + -torch.matmul(g, h2), + -torch.matmul(g, h3), + -torch.matmul(g, h4), + 1, + -1, + g[:, 1], + -g[:, 1] + ], + [ + -1, + -1, + -1, + -1, + 0, + 0, + 0, + 0 + ] + ] + ).transpose(0, 1).repeat(batch_size, 1, 1).to(self.device) + state_batch_squeeze = state_batch.squeeze(-1) + sigma_pred_batch_squeeze = sigma_pred_batch.squeeze(-1) + + h = torch.cat( + [ + self.gamma_b + torch.matmul(f, h1) + torch.matmul(g, h1) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h1) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h1)), + self.gamma_b + torch.matmul(f, h2) + torch.matmul(g, h2) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h2) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h2)), + self.gamma_b + torch.matmul(f, h3) + 
torch.matmul(g, h3) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h3) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h3)), + self.gamma_b + torch.matmul(f, h4) + torch.matmul(g, h4) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h4) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h4)), + -action_batch_scaled + 15.0, + action_batch_scaled + 15.0, + -f[:, 1].unsqueeze(-1) - g[:, 1] * action_batch_scaled + 60.0, + f[:, 1].unsqueeze(-1) + g[:, 1] * action_batch_scaled + 60.0 + ], + dim=1 + ).to(self.device) + P = torch.diag(torch.tensor([1.e0, 1e16])).repeat(batch_size, 1, 1).to(self.device) + q = torch.zeros((batch_size, self.action_dim + 1)).to(self.device) + + elif self.env.dynamics_mode == 'Unicycle': + + num_cbfs = len(self.env.hazards) + l_p = self.l_p + buffer = 0.1 + + thetas = state_batch[:, 2, :].squeeze(-1) + c_thetas = torch.cos(thetas) + s_thetas = torch.sin(thetas) + + # p(x): lookahead output (batch_size, 2) + ps = torch.zeros((batch_size, 2)).to(self.device) + ps[:, 0] = state_batch[:, 0, :].squeeze(-1) + l_p * c_thetas + ps[:, 1] = state_batch[:, 1, :].squeeze(-1) + l_p * s_thetas + + # p_dot(x) = f_p(x) + g_p(x)u + D_p where f_p(x) = 0, g_p(x) = RL and D_p is the disturbance + + # f_p(x) = [0,...,0]^T + f_ps = torch.zeros((batch_size, 2, 1)).to(self.device) + + # g_p(x) = RL where L = diag([1, l_p]) + Rs = torch.zeros((batch_size, 2, 2)).to(self.device) + Rs[:, 0, 0] = c_thetas + Rs[:, 0, 1] = -s_thetas + Rs[:, 1, 0] = s_thetas + Rs[:, 1, 1] = c_thetas + Ls = torch.zeros((batch_size, 2, 2)).to(self.device) + Ls[:, 0, 0] = 1 + Ls[:, 1, 1] = l_p + g_ps = torch.bmm(Rs, Ls) # (batch_size, 2, 2) + + # D_p(x) = g_p [0 D_θ]^T + [D_x1 D_x2]^T + mu_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device) + mu_theta_aug[:, 1, :] = mean_pred_batch[:, 2, :] + mu_ps = torch.bmm(g_ps, mu_theta_aug) + mean_pred_batch[:, :2, :] + sigma_theta_aug = torch.zeros([batch_size, 
2, 1]).to(self.device) + sigma_theta_aug[:, 1, :] = sigma_pred_batch[:, 2, :] + sigma_ps = torch.bmm(torch.abs(g_ps), sigma_theta_aug) + sigma_pred_batch[:, :2, :] + + # Build RCBFs + hs = 1e3 * torch.ones((batch_size, num_cbfs), device=self.device) # the RCBF itself + dhdps = torch.zeros((batch_size, num_cbfs, 2), device=self.device) + hazards = self.env.hazards + for i in range(len(hazards)): + if hazards[i]['type'] == 'circle': # 1/2 * (||ps - x_obs||^2 - r^2) + obs_loc = to_tensor(hazards[i]['location'], torch.FloatTensor, self.device) + hs[:, i] = 0.5 * (torch.sum((ps - obs_loc)**2, dim=1) - (hazards[i]['radius'] + buffer)**2) + dhdps[:, i, :] = (ps - obs_loc) + elif hazards[i]['type'] == 'polygon': # max_j(h_j) where h_j = 1/2 * (dist2seg_j)^2 + vertices = sort_vertices_cclockwise(hazards[i]['vertices']) # (n_v, 2) + segments = np.diff(vertices, axis=0, + append=vertices[[0]]) # (n_v, 2) at row i contains vector from v_i to v_i+1 + segments = to_tensor(segments, torch.FloatTensor, self.device) + vertices = to_tensor(vertices, torch.FloatTensor, self.device) + # Get max RBCF TODO: Can be optimized + for j in range(segments.shape[0]): + # Compute Distances to segment + dot_products = torch.matmul(ps - vertices[j:j + 1], segments[j]) / torch.sum( + segments[j] ** 2) # (batch_size,) + mask0_ = dot_products < 0 # if <0 closest point on segment is vertex j + mask1_ = dot_products > 1 # if >0 closest point on segment is vertex j+1 + mask_ = torch.logical_and(dot_products >= 0, + dot_products <= 1) # Else find distance to line l_{v_j, v_j+1} + # Compute Distances + dists2seg = torch.zeros((batch_size)) + if mask0_.sum() > 0: + dists2seg[mask0_] = torch.linalg.norm(ps[mask0_] - vertices[[j]], dim=1) + if mask1_.sum() > 0: + dists2seg[mask1_] = torch.linalg.norm(ps[mask1_] - vertices[[(j + 1) % segments.shape[0]]], dim=1) + if mask_.sum() > 0: + dists2seg[mask_] = torch.linalg.norm( + dot_products[mask_, None] * segments[j].tile((torch.sum(mask_), 1)) + vertices[[j]] - 
+ ps[mask_], dim=1) + # Compute hs_ for this segment + hs_ = 0.5 * ((dists2seg ** 2) + 0.5*buffer) # (batch_size,) + # Compute dhdps TODO: Can be optimized to only compute for indices that need updating + dhdps_ = torch.zeros((batch_size, 2)) + if mask0_.sum() > 0: + dhdps_[mask0_] = ps[mask0_] - vertices[[j]] + if mask1_.sum() > 0: + dhdps_[mask1_] = ps[mask1_] - vertices[[(j + 1) % segments.shape[0]]] + if mask_.sum() > 0: + normal_vec = torch.tensor([segments[j][1], -segments[j][0]]) + normal_vec /= torch.linalg.norm(normal_vec) + dhdps_[mask_] = (ps[mask_]-vertices[j]).matmul(normal_vec) * normal_vec.view((1,2)).repeat(torch.sum(mask_), 1) # dot products (batch_size, 1) + # Find indices to update (closest segment basically, worst case -> CBF boolean and is a min) + idxs_to_update = torch.nonzero(hs[:, i] - hs_ > 0) + # Update the actual hs to be used in the constraints + if idxs_to_update.shape[0] > 0: + hs[idxs_to_update, i] = hs_[idxs_to_update] + # Compute dhdhps for those indices + dhdps[idxs_to_update, i, :] = dhdps_[idxs_to_update, :] + else: + raise Exception('Only obstacles of type `circle` or `polygon` are supported, got: {}'.format(hazards[i]['type'])) + + n_u = action_batch.shape[1] # dimension of control inputs + num_constraints = num_cbfs + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them) + + # Inequality constraints (G[u, eps] <= h) + G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible) + h = torch.zeros((batch_size, num_constraints)).to(self.device) + ineq_constraint_counter = 0 + + # Add inequality constraints + G[:, :num_cbfs, :n_u] = -torch.bmm(dhdps, g_ps) # h1^Tg(x) + G[:, :num_cbfs, n_u] = -1 # for slack + h[:, :num_cbfs] = gamma_b * (hs ** 3) + (torch.bmm(dhdps, f_ps + mu_ps) - torch.bmm(torch.abs(dhdps), sigma_ps) + torch.bmm(torch.bmm(dhdps, g_ps), action_batch)).squeeze(-1) + ineq_constraint_counter += 
num_cbfs + + # Let's also build the cost matrices, vectors to minimize control effort and penalize slack + P = torch.diag(torch.tensor([1.e0, 1.e-2, 1e5])).repeat(batch_size, 1, 1).to(self.device) + q = torch.zeros((batch_size, n_u + 1)).to(self.device) + + # Add Actuator Constraints + n_u = action_batch.shape[1] # dimension of control inputs + + for c in range(n_u): + + # u_max >= u_nom + u ---> u <= u_max - u_nom + if self.u_max is not None: + G[:, ineq_constraint_counter, c] = 1 + h[:, ineq_constraint_counter] = self.u_max[c] - action_batch[:, c].squeeze(-1) + ineq_constraint_counter += 1 + + # u_min <= u_nom + u ---> -u <= u_min - u_nom + if self.u_min is not None: + G[:, ineq_constraint_counter, c] = -1 + h[:, ineq_constraint_counter] = -self.u_min[c] + action_batch[:, c].squeeze(-1) + ineq_constraint_counter += 1 + + return P, q, G, h + + def get_control_bounds(self): + """ + + Returns + ------- + u_min : torch.tensor + min control input. + u_max : torch.tensor + max control input. + """ + + u_min = torch.tensor(self.env.safe_action_space.low).to(self.device) + u_max = torch.tensor(self.env.safe_action_space.high).to(self.device) + + return u_min, u_max + \ No newline at end of file diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py new file mode 100644 index 000000000..2824faf12 --- /dev/null +++ b/omnisafe/common/robust_gp_model.py @@ -0,0 +1,498 @@ +""" Adapted almost directly from: +https://docs.gpytorch.ai/en/stable/examples/02_Scalable_Exact_GPs/Simple_GP_Regression_CUDA.html + +Training is performed rapidly (and exactly) using GPUs and prediction is done very rapidly using LOVE. 
+""" + +import torch +import numpy as np +import gpytorch +import warnings +warnings.filterwarnings('ignore') +from omnisafe.common.utils import to_tensor, to_numpy + +DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}, # state = [x y θ] + 'SimulatedCars': {'n_s': 10, 'n_u': 1}, # state = [x y θ v ω] + 'Pvtol': {'n_s': 6, 'n_u': 2}, # state = [x y θ v_x v_y thrust] + 'Pendulum': {'n_s': 2, 'n_u': 1} + } +MAX_STD = {'Unicycle': [2e-1, 2e-1, 2e-1], 'SimulatedCars': [0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2], 'Pvtol': [0, 0, 0, 0, 0, 0], 'Pendulum': [0.1, 0.1, 0.1]} + + +class BaseGPy(gpytorch.models.ExactGP): + + def __init__(self, train_x, train_y, prior_std, likelihood): + super().__init__(train_x, train_y, likelihood) + self.mean_module = gpytorch.means.ZeroMean() + self.covar_module = gpytorch.kernels.ScaleKernel( + gpytorch.kernels.RBFKernel(lengthscale_prior=gpytorch.priors.NormalPrior(1e5, 1e-5)), + outputscale_prior=gpytorch.priors.NormalPrior(prior_std + 1e-6, 1e-5)) + # Initialize lengthscale and outputscale to mean of priors + self.covar_module.base_kernel.lengthscale = 1e5 + self.covar_module.outputscale = prior_std + 1e-6 + + def forward(self, x): + mean = self.mean_module(x) + covar = self.covar_module(x) + return gpytorch.distributions.MultivariateNormal(mean, covar) + +class GPyDisturbanceEstimator: + """ + A wrapper around teh BaseGPy model above. 
+ """ + + def __init__(self, train_x, train_y, prior_std, likelihood=None, device=None): + + if device: + self.device = device + else: + self.device = torch.device("cpu") + + if not torch.is_tensor(train_x): + train_x = to_tensor(train_x, torch.FloatTensor, self.device) + if not torch.is_tensor(train_y): + train_y = to_tensor(train_y, torch.FloatTensor, self.device) + self.train_x = train_x + self.train_y = train_y + + if not likelihood: + likelihood = gpytorch.likelihoods.GaussianLikelihood() + self.likelihood = likelihood.to(self.device) + + self.model = BaseGPy(train_x, train_y, prior_std, likelihood) + self.model = self.model.to(self.device) + + def train(self, training_iter, verbose=False): + + # Find optimal model hyperparameters + self.model.train() + self.likelihood.train() + + # Use the adam optimizer + optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) # Includes GaussianLikelihood parameters + + # "Loss" for GPs - the marginal log likelihood + mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model) + + for i in range(training_iter): + # Zero gradients from previous iteration + optimizer.zero_grad() + # Output from model + output = self.model(self.train_x) + # Calc loss and backprop gradients + loss = -mll(output, self.train_y) + loss.backward() + if verbose: + print('\tIter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % ( + i + 1, training_iter, loss.item(), + self.model.covar_module.base_kernel.lengthscale.item(), + self.model.likelihood.noise.item() + )) + optimizer.step() + + def predict(self, test_x): + + # Convert to torch tensor + is_tensor = torch.is_tensor(test_x) + if not is_tensor: + test_x = to_tensor(test_x, torch.FloatTensor, self.device) + + # Get into evaluation (predictive posterior) mode + self.model.eval() + self.likelihood.eval() + + # Test points are regularly spaced along [0,1] + # Make predictions by feeding model through likelihood + with torch.no_grad(), gpytorch.settings.fast_pred_var(): + 
observed_pred = self.likelihood(self.model(test_x)) + pred_dict = dict() + pred_dict['mean'] = observed_pred.mean.cpu() + pred_dict['f_var'] = observed_pred.variance.cpu() + pred_dict['f_covar'] = observed_pred.covariance_matrix.cpu() + lower_ci, upper_ci = observed_pred.confidence_region() + pred_dict['lower_ci'] = lower_ci.cpu() + pred_dict['upper_ci'] = upper_ci.cpu() + + # If they gave us ndarray, we give back ndarray + if not is_tensor: + for key, val in pred_dict.items(): + pred_dict[key] = to_numpy(val) + + return pred_dict + +class DynamicsModel: + + def __init__(self, env, gp_model_size=2000, l_p=0.03, device='cpu'): + """Constructor of DynamicsModel. + + Parameters + ---------- + env : gym.env + Gym environment. + """ + + self.env = env + # Get Dynamics + self.get_f, self.get_g = self.get_dynamics() + self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] + self.n_u = DYNAMICS_MODE[self.env.dynamics_mode]['n_u'] + + # Keep Disturbance History to estimate it using GPs + self.disturb_estimators = None + self.disturbance_history = dict() + self.history_counter = 0 # keeping only max_history_count points in the buffer + self.max_history_count = gp_model_size # How many points we want to have in the GP + self.disturbance_history['state'] = np.zeros((self.max_history_count, self.n_s)) + self.disturbance_history['disturbance'] = np.zeros((self.max_history_count, self.n_s)) + self.train_x = None # x-data used to fit the last GP models + self.train_y = None # y-data used to fit the last GP models + + self.l_p = l_p + + self.device = torch.device(device) + + def predict_next_state(self, state_batch, u_batch, t_batch=None, use_gps=True): + """Given the current state and action, this function predicts the next state. 
+ + Parameters + ---------- + state_batch : ndarray + State + u_batch : ndarray + Action + t_batch: ndarray, optional + Time batch for state dependant dynamics + use_gps : bool, optional + Use GPs to return mean and var + + Returns + ------- + next_state : ndarray + Next state + """ + + expand_dims = len(state_batch.shape) == 1 + if expand_dims: + state_batch = np.expand_dims(state_batch, axis=0) + + # Start with our prior for continuous time system x' = f(x) + g(x)u + if t_batch is not None: + next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1)) + else: + next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch) + (self.get_g(state_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1)) + + if use_gps: # if we want estimate the disturbance, let's do it! + pred_mean, pred_std = self.predict_disturbance(state_batch) + next_state_batch += self.env.dt * pred_mean + else: + pred_std = np.zeros(state_batch.shape) + + if expand_dims: + next_state_batch = next_state_batch.squeeze(0) + if pred_std is not None: + pred_std = pred_std.squeeze(0) + + if t_batch is not None: + next_t_batch = t_batch + self.env.dt + return next_state_batch, self.env.dt * pred_std, next_t_batch + + return next_state_batch, self.env.dt * pred_std, t_batch + + def predict_next_obs(self, state, u): + """Predicts the next observation given the state and u. Note that this only predicts the mean next observation. + + Parameters + ---------- + state : ndarray + u : ndarray + + Returns + ------- + next_obs : ndarray + Next observation + """ + + next_state, _, _ = self.predict_next_state(state, u) + next_obs = self.get_obs(next_state) + return next_obs + + def get_dynamics(self): + """Get affine CBFs for a given environment. 
+ + Parameters + ---------- + + Returns + ------- + get_f : callable + Drift dynamics of the continuous system x' = f(x) + g(x)u + get_g : callable + Control dynamics of the continuous system x' = f(x) + g(x)u + """ + + if self.env.dynamics_mode == 'Unicycle': + + def get_f(state_batch, t_batch=None): + f_x = np.zeros(state_batch.shape) + return f_x + + def get_g(state_batch, t_batch=None): + theta = state_batch[:, 2] + g_x = np.zeros((state_batch.shape[0], 3, 2)) + g_x[:, 0, 0] = np.cos(theta) + g_x[:, 1, 0] = np.sin(theta) + g_x[:, 2, 1] = 1.0 + return g_x + + elif self.env.dynamics_mode == 'Pendulum': + + def get_f(state_batch, t_batch=None): + f_x = np.zeros(state_batch.shape) + theta = state_batch[:, 0] + theta_dot = state_batch[:, 1] + f_x = np.array( + [ + -3*10/2*np.sin(theta+np.pi)*self.env.dt + theta, + theta_dot - 3*10/2*np.sin(theta+np.pi) + ] + ) + return f_x + + def get_g(state_batch, t_batch=None): + g_x = np.zeros((state_batch.shape[0], 2, 1)) + g_x[:, 0, 0] = 3*self.env.dt**2 + g_x[:, 1, 0] = 3*self.env.dt + return g_x + + else: + raise Exception('Unknown Dynamics mode.') + + return get_f, get_g + + def get_state(self, obs): + """Given the observation, this function does the pre-processing necessary and returns the state. + + Parameters + ---------- + obs_batch : ndarray or torch.tensor + Environment observation. + + Returns + ------- + state_batch : ndarray or torch.tensor + State of the system. 
+ + """ + + expand_dims = len(obs.shape) == 1 + is_tensor = torch.is_tensor(obs) + + if is_tensor: + dtype = obs.dtype + device = obs.device + obs = to_numpy(obs) + + if expand_dims: + obs = np.expand_dims(obs, 0) + + if self.env.dynamics_mode == 'Unicycle': + theta = np.arctan2(obs[:, 3], obs[:, 2]) + state_batch = np.zeros((obs.shape[0], 3)) + state_batch[:, 0] = obs[:, 0] + state_batch[:, 1] = obs[:, 1] + state_batch[:, 2] = theta + elif self.env.dynamics_mode == 'Pendulum': + theta = np.arctan2(obs[:, 1], obs[:, 0]) + theta_dot = obs[:, 2] + state_batch = np.zeros((obs.shape[0], 2)) + state_batch[:, 0] = theta + state_batch[:, 1] = theta_dot + else: + raise Exception('Unknown dynamics') + + if expand_dims: + state_batch = state_batch.squeeze(0) + + return to_tensor(state_batch, dtype, device) if is_tensor else state_batch + + def get_obs(self, state_batch): + """Given the state, this function returns it to an observation akin to the one obtained by calling env.step + + Parameters + ---------- + state : ndarray + Environment state batch of shape (batch_size, n_s) + + Returns + ------- + obs : ndarray + Observation batch of shape (batch_size, n_o) + + """ + + if self.env.dynamics_mode == 'Unicycle': + obs = np.zeros((state_batch.shape[0], 4)) + obs[:, 0] = state_batch[:, 0] + obs[:, 1] = state_batch[:, 1] + obs[:, 2] = np.cos(state_batch[:, 2]) + obs[:, 3] = np.sin(state_batch[:, 2]) + else: + raise Exception('Unknown dynamics') + return obs + + def append_transition(self, state_batch, u_batch, next_state_batch, t_batch=None): + """Estimates the disturbance from the current dynamics transition and adds it to buffer. 
+ + Parameters + ---------- + state_batch : ndarray + shape (n_s,) or (batch_size, n_s) + u_batch : ndarray + shape (n_u,) or (batch_size, n_u) + next_state_batch : ndarray + shape (n_s,) or (batch_size, n_s) + t_batch : ndarray, optional + shape (1,) or (batch_size, 1) + + Returns + ------- + + """ + + expand_dims = len(state_batch.shape) == 1 + + if expand_dims: + state_batch = np.expand_dims(state_batch, 0) + next_state_batch = np.expand_dims(next_state_batch, 0) + u_batch = np.expand_dims(u_batch, 0) + + u_batch = np.expand_dims(u_batch, -1) # for broadcasting batch matrix multiplication + disturbance_batch = (next_state_batch - state_batch - self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ u_batch).squeeze(-1))) / self.env.dt + + # Append new data point (state, disturbance) to our dataset + for i in range(state_batch.shape[0]): + + self.disturbance_history['state'][self.history_counter % self.max_history_count] = state_batch[i] + self.disturbance_history['disturbance'][self.history_counter % self.max_history_count] = disturbance_batch[i] + + # Increment how many data points we have + self.history_counter += 1 + + # Update GP models every max_history_count data points + if self.history_counter % (self.max_history_count/10) == 0: + self.fit_gp_model() + + def fit_gp_model(self, training_iter=70): + """ + + Parameters + ---------- + training_iter : int + Number of training iterations for GP model. 
+ + Returns + ------- + + """ + + if self.history_counter < self.max_history_count: # didn't fill the buffer yet + train_x = self.disturbance_history['state'][:self.history_counter] + train_y = self.disturbance_history['disturbance'][:self.history_counter] + else: # buffer filled, use all the data points + train_x = self.disturbance_history['state'] + train_y = self.disturbance_history['disturbance'] + + # Normalize Data + train_x_std = np.std(train_x, axis=0) + train_x_normalized = train_x / (train_x_std + 1e-8) + train_y_std = np.std(train_y, axis=0) + train_y_normalized = train_y / (train_y_std + 1e-8) + + self.disturb_estimators = [] + for i in range(self.n_s): + # self.disturb_estimators.append(GPyDisturbanceEstimator(train_x, train_y[:, i])) + self.disturb_estimators.append(GPyDisturbanceEstimator(train_x_normalized, train_y_normalized[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device)) + self.disturb_estimators[i].train(training_iter) + + # track the data I last used to fit the GPs for saving purposes (need it to initialize before loading weights) + self.train_x = train_x + self.train_y = train_y + + def predict_disturbance(self, test_x): + """Predict the disturbance at the queried states using the GP models. 
+ + Parameters + ---------- + test_x : ndarray or torch.tensor + shape(n_test, n_s) + Returns + ------- + means: ndarray or torch.tensor + Prediction means -- shape(n_test, n_s) + vars: ndarray or torch.tensor + Prediction variances -- shape(n_test, n_s) + """ + + is_tensor = torch.is_tensor(test_x) + + if is_tensor: + dtype = test_x.dtype + device = test_x.device + test_x = to_numpy(test_x) + + expand_dims = len(test_x.shape) == 1 + if expand_dims: + test_x = np.expand_dims(test_x, axis=0) + + means = np.zeros(test_x.shape) + f_std = np.zeros(test_x.shape) # standard deviation + + if self.disturb_estimators: + # Normalize + train_x_std = np.std(self.train_x, axis=0) + train_y_std = np.std(self.train_y, axis=0) + test_x = test_x / train_x_std + for i in range(self.n_s): + prediction_ = self.disturb_estimators[i].predict(test_x) + means[:, i] = prediction_['mean'] * (train_y_std[i] + 1e-8) + f_std[:, i] = np.sqrt(prediction_['f_var']) * (train_y_std[i] + 1e-8) + + else: # zero-mean, max_sigma prior + f_std = np.ones(test_x.shape) + for i in range(self.n_s): + f_std[:, i] *= MAX_STD[self.env.dynamics_mode][i] + + if expand_dims: + means = means.squeeze(0) + f_std = f_std.squeeze(0) + + return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) if is_tensor else (means, f_std) + + def load_disturbance_models(self, output): + + if output is None: + return + + self.disturb_estimators = [] + + weights = torch.load('{}/gp_models.pkl'.format(output), map_location=self.device) + self.train_x = torch.load('{}/gp_models_train_x.pkl'.format(output)) + self.train_y = torch.load('{}/gp_models_train_y.pkl'.format(output)) + for i in range(self.n_s): + self.disturb_estimators.append(GPyDisturbanceEstimator(self.train_x, self.train_y[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device)) + self.disturb_estimators[i].model.load_state_dict(weights[i]) + + def save_disturbance_models(self, output): + + if not self.disturb_estimators or self.train_x is None or 
self.train_y is None: + return + weights = [] + for i in range(len(self.disturb_estimators)): + weights.append(self.disturb_estimators[i].model.state_dict()) + torch.save(weights, '{}/gp_models.pkl'.format(output)) + # Also save data used to fit model (needed for initializing the model before loading weights) + torch.save(self.train_x, '{}/gp_models_train_x.pkl'.format(output)) + torch.save(self.train_y, '{}/gp_models_train_y.pkl'.format(output)) + + def seed(self, seed): + torch.manual_seed(seed) \ No newline at end of file diff --git a/omnisafe/common/utils.py b/omnisafe/common/utils.py new file mode 100644 index 000000000..beee622e5 --- /dev/null +++ b/omnisafe/common/utils.py @@ -0,0 +1,215 @@ +import math +import numpy as np +import os +import torch +from torch.autograd import Variable + +USE_CUDA = torch.cuda.is_available() + + +def prRed(prt): print("\033[91m {}\033[00m".format(prt)) + + +def prGreen(prt): print("\033[92m {}\033[00m".format(prt)) + + +def prYellow(prt): print("\033[93m {}\033[00m".format(prt)) + + +def prLightPurple(prt): print("\033[94m {}\033[00m".format(prt)) + + +def prPurple(prt): print("\033[95m {}\033[00m".format(prt)) + + +def prCyan(prt): print("\033[96m {}\033[00m".format(prt)) + + +def prLightGray(prt): print("\033[97m {}\033[00m".format(prt)) + + +def prBlack(prt): print("\033[98m {}\033[00m".format(prt)) + + +def mat_to_euler_2d(rot_mat): + """ + rot_mat has shape: + [[c -s 0], + [s c 0], + [0 0 1]] + """ + + theta = np.arcsin(rot_mat[1, 0]) + return theta + + +def euler_to_mat_2d(theta_batch): + s = np.sin(theta_batch) + c = np.cos(theta_batch) + Rs = np.zeros((theta_batch.shape[0], 2, 2)) + Rs[:, 0, 0] = c + Rs[:, 0, 1] = -s + Rs[:, 1, 0] = s + Rs[:, 1, 1] = c + return Rs + +def to_numpy(x): + # convert torch tensor to numpy array + return x.cpu().detach().double().numpy() + +def to_tensor(x, dtype, device, requires_grad=False): + # convert numpy array to torch tensor + if type(x).__module__ != 'numpy': + return x + return 
torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) + +def scale_action(action, action_lb, action_ub, device=None): + + act_k = (action_ub - action_lb) / 2. + act_b = (action_ub + action_lb) / 2. + return act_k * action + act_b + + +def soft_update(target, source, tau): + for target_param, param in zip(target.parameters(), source.parameters()): + target_param.data.copy_( + target_param.data * (1.0 - tau) + param.data * tau + ) + + +def hard_update(target, source): + for target_param, param in zip(target.parameters(), source.parameters()): + target_param.data.copy_(param.data) + + +def create_log_gaussian(mean, log_std, t): + quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2)) + l = mean.shape + log_z = log_std + z = l[-1] * math.log(2 * math.pi) + log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z + return log_p + + +def logsumexp(inputs, dim=None, keepdim=False): + if dim is None: + inputs = inputs.view(-1) + dim = 0 + s, _ = torch.max(inputs, dim=dim, keepdim=True) + outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log() + if not keepdim: + outputs = outputs.squeeze(dim) + return outputs + + +def get_output_folder(parent_dir, env_name): + """Return save folder. + + Assumes folders in the parent_dir have suffix -run{run + number}. Finds the highest run number and sets the output folder + to that number + 1. This is just convenient so that if you run the + same script multiple times tensorboard can plot all of the results + on the same plots with different names. + + Parameters + ---------- + parent_dir: str + Path of the directory containing all experiment runs. + + Returns + ------- + parent_dir/run_dir + Path to this run's save directory. 
+ """ + os.makedirs(parent_dir, exist_ok=True) + experiment_id = 0 + for folder_name in os.listdir(parent_dir): + if not os.path.isdir(os.path.join(parent_dir, folder_name)): + continue + try: + folder_name = int(folder_name.split('-run')[-1]) + if folder_name > experiment_id: + experiment_id = folder_name + except: + pass + experiment_id += 1 + + parent_dir = os.path.join(parent_dir, env_name) + parent_dir = parent_dir + '-run{}'.format(experiment_id) + os.makedirs(parent_dir, exist_ok=True) + return parent_dir + + +def get_wrapped_policy(agent, cbf_wrapper, dynamics_model, compensator=None, warmup=False, action_space=None, + policy_eval=False): + + def wrapped_policy(observation): + + if warmup and action_space: + action = action_space.sample() # Sample random action + else: + action, _ = agent.select_action(observation, evaluate=policy_eval) # Sample action from policy + + if compensator: + action_comp = compensator(observation) + else: + action_comp = 0 + state = dynamics_model.get_state(observation) + disturb_mean, disturb_std = dynamics_model.predict_disturbance(state) + action_safe = cbf_wrapper.get_safe_action(state, action + action_comp, disturb_mean, disturb_std) + # print('state = {}, action = {}, action_comp = {}, u_safe = {}'.format(state, action, action_comp, u_safe)) + return action + action_comp + action_safe + + return wrapped_policy + +def sort_vertices_cclockwise(vertices): + """ Function used to sort vertices of 2D convex polygon in counter clockwise direction. + + Parameters + ---------- + vertices : numpy.ndarray + Array of size (n_v, 2) where n_v is the number of vertices and d is the dimension of the space + + Returns + ------- + sorted_vertices : numpy.ndarray + Array of size (n_v, 2) of the vertices sorted in counter-clockwise direction. 
+ """ + + assert vertices.shape[1] == 2, "Vertices must each have dimension 2, got {}".format(vertices.shape[1]) + + # Sort vertices + polygon_center = vertices.sum(axis=0, keepdims=True) / vertices.shape[0] # (1, d) + rel_vecs = vertices - polygon_center + thetas = np.arctan2(rel_vecs[:, 1], rel_vecs[:, 0]) + idxs = np.argsort(thetas) + return vertices[idxs, :] + +def get_polygon_normals(vertices): + """ + + Parameters + ---------- + vertices : numpy.ndarray + Array of size (n_v, 2) where n_v is the number of 2D vertices. + Returns + ------- + normals : numpy.ndarray + Array of size (n_v, 2) where each row i is the 2D normal vector of the line from vertices_sorted[i] - vertices_sorted[i+1] + + centers : numpy.ndarary + Array of size (n_v, 2) where each row i is the 2D center point of the segment from vertices_sorted[i] to vertices_sorted[i+1] + """ + + sorted_vertices = sort_vertices_cclockwise(vertices) # (n_v, 2) + diffs = np.diff(sorted_vertices, axis=0, append=sorted_vertices[[0]]) # (n_v, 2) at row i contains vector from v_i to v_i+1 + + # Compute Normals (rotate each diff by -90 degrees) + diffs = np.diff(sorted_vertices, axis=0, append=sorted_vertices[[0]]) # (n_v, 2) at row i contains vector from v_i to v_i+1 + normals = np.array([diffs[:, 1], -diffs[:, 0]]).transpose() + normals = normals / np.linalg.norm(normals) + # Compute Centers + centers = (diffs + 2*vertices) / 2.0 + return normals, centers + + diff --git a/omnisafe/configs/off-policy/DDPGCBF.yaml b/omnisafe/configs/off-policy/DDPGCBF.yaml new file mode 100644 index 000000000..1579aa658 --- /dev/null +++ b/omnisafe/configs/off-policy/DDPGCBF.yaml @@ -0,0 +1,171 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # number of evaluate episodes + eval_episodes: 0 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of steps per sample + update_cycle: 1 + # number of iterations to update the policy + update_iters: 1 + # The size of replay buffer + size: 1000000 + # The size of batch + batch_size: 256 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: False + # critic norm coefficient + critic_norm_coeff: 0.001 + # The soft update coefficient + polyak: 0.001 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_learning_steps` steps + start_learning_steps: 0 + # The delay step of policy update + policy_delay: 1 + # Whether to use the exploration noise + use_exploration_noise: True + # The exploration noise + exploration_noise: 0.1 + # use cost + use_cost: False + # logger configurations + logger_cfgs: + # use 
wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 10 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type + actor_type: mlp + # linear learning rate decay + linear_lr_decay: False + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # The learning rate of Actor network + lr: 0.0001 + # Configuration of Critic network + critic: + # The number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + + activation: relu + # The learning rate of Critic network + lr: 0.001 + # barrier function compensator configurations + compensator_cfgs: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # learning rate + lr: 0.01 + # number of iterations to update the compensator + update_iters: 1 + +SafetyCarCircle1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 + +SafetyCarGoal1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 + +SafetyPointCircle1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate 
of Critic network + lr: 0.001 + +SafetyPointGoal1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 diff --git a/omnisafe/configs/off-policy/SACRCBF.yaml b/omnisafe/configs/off-policy/SACRCBF.yaml new file mode 100644 index 000000000..bb133e56c --- /dev/null +++ b/omnisafe/configs/off-policy/SACRCBF.yaml @@ -0,0 +1,148 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 4 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # number of evaluate episodes + eval_episodes: 0 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 200 + # number of steps per sample + update_cycle: 1 + # number of iterations to update the policy + update_iters: 1 + # The size of replay buffer + size: 1000000 + # The size of batch + batch_size: 256 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: False + # critic norm coefficient + critic_norm_coeff: 0.001 + # The soft update coefficient + polyak: 0.005 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_learning_steps` steps + start_learning_steps: 5000 + # The delay step of policy update + policy_delay: 1 + # Whether to use the exploration noise + use_exploration_noise: False + # The exploration noise + exploration_noise: 0.1 + # The policy noise + policy_noise: 0.2 + # policy_noise_clip + policy_noise_clip: 0.5 + # The value of alpha + alpha: 0.2 + # Whether to use auto alpha + auto_alpha: True + # use cost + use_cost: False + # control barrier function configurations + cbf_cfgs: + # gamma of control barrier certificate. 
+ gamma_b: 20 + # confidence parameter desired + k_d: 3.0 + # environment dynamics coefficient + l_p: 0.03 + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 10 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type + actor_type: gaussian_sac + # linear learning rate decay + linear_lr_decay: False + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # The learning rate of Actor network + lr: 0.0003 + # Configuration of Critic network + critic: + # The number of critic networks + num_critics: 2 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # The learning rate of Critic network + lr: 0.0003 + # Dynamics model configurations + dynamics_model_cfgs: + # The max number of episodes updateing GP models + gp_max_episodes: 100 + # The size of gp model + gp_model_size: 2000 + # Whether to use the action compensator + use_compensator: False + +Pendulum-v1: + # algorithm configurations + algo_cfgs: + # Actor perdorm random action before `start_learning_steps` steps + start_learning_steps: 0 + # control barrier function configurations + cbf_cfgs: + # gamma of control barrier certificate. 
+ gamma_b: 0.5 + # confidence parameter desired + k_d: 1.5 + # environment dynamics coefficient + l_p: 0.03 \ No newline at end of file diff --git a/omnisafe/configs/on-policy/IPO.yaml b/omnisafe/configs/on-policy/IPO.yaml index 852b08344..e2a6869c3 100644 --- a/omnisafe/configs/on-policy/IPO.yaml +++ b/omnisafe/configs/on-policy/IPO.yaml @@ -27,25 +27,25 @@ defaults: # number of parallel agent, similar to a3c parallel: 1 # total number of steps to train - total_steps: 10000000 + total_steps: 80_000 # algorithm configurations algo_cfgs: # number of steps to update the policy - steps_per_epoch: 20000 + steps_per_epoch: 2000 # number of iterations to update the policy update_iters: 10 # batch size for each iteration - batch_size: 64 + batch_size: 256 # target kl divergence - target_kl: 0.02 + target_kl: 0.005 # entropy coefficient entropy_coef: 0.0 # normalize reward - reward_normalize: True + reward_normalize: False # normalize cost - cost_normalize: True + cost_normalize: False # normalize observation - obs_normalize: True + obs_normalize: False # early stop when kl divergence is bigger than target kl kl_early_stop: True # use max gradient norm @@ -57,11 +57,11 @@ defaults: # critic norm coefficient critic_norm_coef: 0.001 # reward discount factor - gamma: 0.99 + gamma: 0.995 # cost discount factor cost_gamma: 0.99 # lambda for gae - lam: 0.95 + lam: 0.98 # lambda for cost gae lam_c: 0.95 # clip ratio @@ -127,7 +127,7 @@ defaults: # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation - cost_limit: 25.0 + cost_limit: 1000.0 # Initial value of lagrangian multiplier lagrangian_multiplier_init: 0.001 # Learning rate of lagrangian multiplier diff --git a/omnisafe/configs/on-policy/PPOBetaCBF.yaml b/omnisafe/configs/on-policy/PPOBetaCBF.yaml new file mode 100644 index 000000000..4bd5f0f12 --- /dev/null +++ b/omnisafe/configs/on-policy/PPOBetaCBF.yaml @@ -0,0 +1,120 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: False + # max gradient norm + max_grad_norm: 40.0 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.995 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.98 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, 
options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type, options: gaussian, gaussian_learning + actor_type: beta + # linear learning rate decay + linear_lr_decay: True + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations + actor: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: tanh + # out_activation: tanh + # learning rate + lr: 0.0003 + critic: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: tanh + # learning rate + lr: 0.0003 diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index 455ba163f..a8d60878b 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -124,3 +124,35 @@ defaults: activation: tanh # learning rate lr: 0.001 + +Pendulum-v1: + # training configurations + train_cfgs: + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # batch size for each iteration + batch_size: 256 + # target kl divergence + target_kl: 0.005 + # normalize observation + obs_normalize: False + # reward discount factor + gamma: 0.995 + # lambda for gae + lam: 0.98 + # model 
configurations + model_cfgs: + # actor network configurations + actor: + # activation function + activation: relu + # barrier function compensator configurations + compensator_cfgs: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu \ No newline at end of file diff --git a/omnisafe/configs/on-policy/TRPOCBF.yaml b/omnisafe/configs/on-policy/TRPOCBF.yaml new file mode 100644 index 000000000..74922c9d2 --- /dev/null +++ b/omnisafe/configs/on-policy/TRPOCBF.yaml @@ -0,0 +1,139 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 256 + # target kl divergence + target_kl: 0.005 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # early stop when kl divergence is bigger than target kl + kl_early_stop: False + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40.0 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.995 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.98 + # lambda for cost gae + lam_c: 0.95 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # Damping value for conjugate gradient + cg_damping: 0.1 + # Number of conjugate gradient iterations + cg_iters: 15 + # Subsampled observation + fvp_obs: None + # The sub-sampling rate of the observation + fvp_sample_freq: 1 + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations + model_cfgs: + # 
weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type, options: gaussian, gaussian_learning + actor_type: gaussian_learning + # linear learning rate decay + linear_lr_decay: False + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations + actor: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu + # out_activation: tanh + # learning rate + lr: ~ + # critic network configurations + critic: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: tanh + # learning rate + lr: 0.001 + # barrier function compensator configurations + compensator_cfgs: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # learning rate + lr: 0.01 + # number of iterations to update the compensator + update_iters: 1 \ No newline at end of file diff --git a/omnisafe/envs/__init__.py b/omnisafe/envs/__init__.py index 4d225c61d..ebeb6af4e 100644 --- a/omnisafe/envs/__init__.py +++ b/omnisafe/envs/__init__.py @@ -19,7 +19,9 @@ from omnisafe.envs.crabs_env import CRABSEnv from omnisafe.envs.custom_env import CustomEnv from omnisafe.envs.meta_drive_env import SafetyMetaDriveEnv +from omnisafe.envs.barrier_function_env import BarrierFunctionEnv from omnisafe.envs.mujoco_env import MujocoEnv from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv from omnisafe.envs.safety_gymnasium_modelbased import SafetyGymnasiumModelBased from omnisafe.envs.safety_isaac_gym_env import SafetyIsaacGymEnv +from omnisafe.envs.robust_barrier_function_env import RobustBarrierFunctionEnv diff --git a/omnisafe/envs/barrier_function_env.py b/omnisafe/envs/barrier_function_env.py new file mode 100644 index 000000000..f8d0d964c --- /dev/null +++ b/omnisafe/envs/barrier_function_env.py @@ -0,0 
+1,209 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Interface of control barrier function-based environments.""" + +from __future__ import annotations + +from typing import Any, ClassVar + +import gymnasium +import numpy as np +import torch + +from gymnasium import spaces +from omnisafe.envs.core import CMDP, env_register +from omnisafe.typing import Box + + +# @env_register +class BarrierFunctionEnv(CMDP): + """Interface of control barrier function-based environments. + + .. warning:: + Since environments based on control barrier functions require special judgment and control of environmental dynamics, + they do not support the use of vectorized environments for parallelization. + + Attributes: + need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. + need_time_limit_wrapper (bool): Whether to use time limit wrapper. + """ + need_auto_reset_wrapper = True + need_time_limit_wrapper = False + _support_envs: ClassVar[list[str]] = [ + 'Pendulum-v1', + ] + + def __init__( + self, + env_id: str, + num_envs: int = 1, + device: str = 'cpu', + **kwargs: Any, + ) -> None: + """Initialize the environment. + + Args: + env_id (str): Environment id. + num_envs (int, optional): Number of environments. Defaults to 1. + device (torch.device, optional): Device to store the data. Defaults to 'cpu'. 
+ + Keyword Args: + render_mode (str, optional): The render mode, ranging from ``human``, ``rgb_array``, ``rgb_array_list``. + Defaults to ``rgb_array``. + camera_name (str, optional): The camera name. + camera_id (int, optional): The camera id. + width (int, optional): The width of the rendered image. Defaults to 256. + height (int, optional): The height of the rendered image. Defaults to 256. + """ + super().__init__(env_id) + self._env_id = env_id + if num_envs == 1: + self._env = gymnasium.make(id=env_id, autoreset=False, **kwargs) + self._env_specific_setting() + assert isinstance(self._env.action_space, Box), 'Only support Box action space.' + assert isinstance( + self._env.observation_space, + Box, + ), 'Only support Box observation space.' + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + else: + raise NotImplementedError('Only support num_envs=1 now.') + self._device = torch.device(device) + + self._num_envs = num_envs + self._metadata = self._env.metadata + + def _env_specific_setting(self): + """Execute some specific setting for environments. + + Some algorithms based on control barrier functions have made fine-tuning adjustments to the environment. + We have organized these adjustments and encapsulated them in this function. + """ + if self._env_id == 'Pendulum-v1': + self._env.unwrapped.max_torque = 15. + self._env.unwrapped.max_speed = 60. + self._env.unwrapped.action_space = spaces.Box(low=-self._env.unwrapped.max_torque, high=self._env.unwrapped.max_torque, shape=(1,)) + high = np.array([1., 1., self._env.unwrapped.max_speed]) + self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) + self._env.dt = 0.05 + self._env.dynamics_mode = 'Pendulum' + + def step( + self, + action: torch.Tensor, + ) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict[str, Any], + ]: + """Step the environment. + + .. 
note:: + + OmniSafe use auto reset wrapper to reset the environment when the episode is + terminated. So the ``obs`` will be the first observation of the next episode. + And the true ``final_observation`` in ``info`` will be stored in the ``final_observation`` key of ``info``. + + Args: + action (torch.Tensor): Action to take. + + Returns: + observation: Agent's observation of the current environment. + reward: Amount of reward returned after previous action. + cost: Amount of cost returned after previous action. + terminated: Whether the episode has ended. + truncated: Whether the episode has been truncated due to a time limit. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + obs, reward, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, terminated, truncated) + ) + cost = torch.abs(torch.atan2(obs[1], obs[0])).to(self._device) + + if 'final_observation' in info: + info['final_observation'] = np.array( + [ + array if array is not None else np.zeros(obs.shape[-1]) + for array in info['final_observation'] + ], + ) + info['final_observation'] = torch.as_tensor( + info['final_observation'], + dtype=torch.float32, + device=self._device, + ) + + return obs, reward, cost, terminated, truncated, info + + def reset( + self, + seed: int | None = None, + options: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, dict]: + """Reset the environment. + + Args: + seed (int, optional): The random seed. Defaults to None. + options (dict[str, Any], optional): The options for the environment. Defaults to None. + + Returns: + observation: Agent's observation of the current environment. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). 
+ """ + obs, info = self._env.reset(seed=seed, options=options) + if self._env_id == 'Pendulum-v1': + while (self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0): + obs, info = self._env.reset(options=options) + return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info + + def set_seed(self, seed: int) -> None: + """Set the seed for the environment. + + Args: + seed (int): Seed to set. + """ + self.reset(seed=seed) + + def sample_action(self) -> torch.Tensor: + """Sample a random action. + + Returns: + A random action. + """ + return torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)) + + def render(self) -> Any: + """Render the environment. + + Returns: + Rendered environment. + """ + return self._env.render() + + def close(self) -> None: + """Close the environment.""" + self._env.close() + + @property + def unwrapped(self): + return self._env.unwrapped \ No newline at end of file diff --git a/omnisafe/envs/robust_barrier_function_env.py b/omnisafe/envs/robust_barrier_function_env.py new file mode 100644 index 000000000..12e680b86 --- /dev/null +++ b/omnisafe/envs/robust_barrier_function_env.py @@ -0,0 +1,224 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Interface of control barrier function-based environments.""" + +from __future__ import annotations + +from typing import Any, ClassVar + +import numpy as np +import torch + +import gymnasium +from omnisafe.envs.core import CMDP, env_register +from omnisafe.typing import Box +from gymnasium import spaces +from omnisafe.envs.unicycle_env import UnicycleEnv + + +@env_register +class RobustBarrierFunctionEnv(CMDP): + """Interface of control barrier function-based environments. + + .. warning:: + Since environments based on control barrier functions require special judgment and control of environmental dynamics, + they do not support the use of vectorized environments for parallelization. + + Attributes: + need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. + need_time_limit_wrapper (bool): Whether to use time limit wrapper. + """ + need_auto_reset_wrapper = True + need_time_limit_wrapper = False + _support_envs: ClassVar[list[str]] = [ + 'Unicycle', + 'Pendulum-v1', + ] + + def __init__( + self, + env_id: str, + num_envs: int = 1, + device: str = 'cpu', + **kwargs: Any, + ) -> None: + """Initialize the environment. + + Args: + env_id (str): Environment id. + num_envs (int, optional): Number of environments. Defaults to 1. + device (torch.device, optional): Device to store the data. Defaults to 'cpu'. + + Keyword Args: + render_mode (str, optional): The render mode, ranging from ``human``, ``rgb_array``, ``rgb_array_list``. + Defaults to ``rgb_array``. + camera_name (str, optional): The camera name. + camera_id (int, optional): The camera id. + width (int, optional): The width of the rendered image. Defaults to 256. + height (int, optional): The height of the rendered image. Defaults to 256. 
+ """ + super().__init__(env_id) + self._env_id = env_id + if num_envs == 1: + if self._env_id == 'Unicycle': + self._env = UnicycleEnv() + elif self._env_id == 'Pendulum-v1': + self._env = gymnasium.make(id=env_id, autoreset=False, **kwargs) + self._env_specific_setting() + else: + raise NotImplementedError('Only support Unicycle now.') + assert isinstance(self._env.action_space, Box), 'Only support Box action space.' + assert isinstance( + self._env.observation_space, + Box, + ), 'Only support Box observation space.' + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + else: + raise NotImplementedError('Only support num_envs=1 now.') + self._device = torch.device(device) + + self._num_envs = num_envs + self._metadata = self._env.metadata + + def _env_specific_setting(self): + """Execute some specific setting for environments. + + Some algorithms based on control barrier functions have made fine-tuning adjustments to the environment. + We have organized these adjustments and encapsulated them in this function. + """ + if self._env_id == 'Pendulum-v1': + self._env.unwrapped.max_torque = 15. + self._env.unwrapped.max_speed = 60. + self._env.unwrapped.action_space = spaces.Box(low=-self._env.unwrapped.max_torque, high=self._env.unwrapped.max_torque, shape=(1,)) + high = np.array([1., 1., self._env.unwrapped.max_speed]) + self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) + + def step( + self, + action: torch.Tensor, + ) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict[str, Any], + ]: + """Step the environment. + + .. note:: + + OmniSafe use auto reset wrapper to reset the environment when the episode is + terminated. So the ``obs`` will be the first observation of the next episode. + And the true ``final_observation`` in ``info`` will be stored in the ``final_observation`` key of ``info``. + + Args: + action (torch.Tensor): Action to take. 
+ + Returns: + observation: Agent's observation of the current environment. + reward: Amount of reward returned after previous action. + cost: Amount of cost returned after previous action. + terminated: Whether the episode has ended. + truncated: Whether the episode has been truncated due to a time limit. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + if self._env_id == 'Unicycle': + obs, reward, cost, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, cost, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, cost, terminated, truncated) + ) + elif self._env_id == 'Pendulum-v1': + obs, reward, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, terminated, truncated) + ) + cost = torch.abs(torch.atan2(obs[1], obs[0])).to(self._device) + if 'final_observation' in info: + info['final_observation'] = np.array( + [ + array if array is not None else np.zeros(obs.shape[-1]) + for array in info['final_observation'] + ], + ) + info['final_observation'] = torch.as_tensor( + info['final_observation'], + dtype=torch.float32, + device=self._device, + ) + + return obs, reward, cost, terminated, truncated, info + + def reset( + self, + seed: int | None = None, + options: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, dict]: + """Reset the environment. + + Args: + seed (int, optional): The random seed. Defaults to None. + options (dict[str, Any], optional): The options for the environment. Defaults to None. + + Returns: + observation: Agent's observation of the current environment. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). 
+ """ + obs, info = self._env.reset(seed=seed, options=options) + if self._env_id == 'Pendulum-v1': + while (self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0): + obs, info = self._env.reset(options=options) + return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info + + def set_seed(self, seed: int) -> None: + """Set the seed for the environment. + + Args: + seed (int): Seed to set. + """ + self.reset(seed=seed) + + def sample_action(self) -> torch.Tensor: + """Sample a random action. + + Returns: + A random action. + """ + return torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)) + + def render(self) -> Any: + """Render the environment. + + Returns: + Rendered environment. + """ + return self._env.render() + + def close(self) -> None: + """Close the environment.""" + self._env.close() + + def __getattr__(self, name): + try: + return getattr(self._env, name) + except AttributeError: + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") diff --git a/omnisafe/envs/unicycle_env.py b/omnisafe/envs/unicycle_env.py new file mode 100644 index 000000000..fb16394a5 --- /dev/null +++ b/omnisafe/envs/unicycle_env.py @@ -0,0 +1,366 @@ +import numpy as np +import gymnasium as gym +from gymnasium import spaces +from collections.abc import Iterable + + +def to_pixel(meas_cm, shift=0): + + if isinstance(meas_cm, Iterable): + return 1.5 * 37.795 * meas_cm + np.array(shift) + + return 1.5 * 37.795 * meas_cm + shift + +class UnicycleEnv(gym.Env): + """Custom Environment that follows SafetyGym interface""" + + metadata = {'render.modes': ['human']} + + def __init__(self, obs_config='default'): + + super(UnicycleEnv, self).__init__() + + self.dynamics_mode = 'Unicycle' + # Define action and observation space + # They must be gym.spaces objects + # Example when using discrete actions: + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,)) + self.safe_action_space = 
spaces.Box(low=-2.5, high=2.5, shape=(2,)) + self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,)) + self.bds = np.array([[-3., -3.], [3., 3.]]) + + self.dt = 0.02 + self.max_episode_steps = 1000 + self.reward_goal = 1.0 + self.goal_size = 0.3 + # Initialize Env + self.state = None + self.episode_step = 0 + self.initial_state = np.array([[-2.5, -2.5, 0.0], [-2.5, 2.5, 0.0], [-2.5, 0.0, 0.0], [2.5, -2.5, np.pi/2]]) + self.goal_pos = np.array([2.5, 2.5]) + self.rand_init = False # Random Initial State + + self.reset() + + # Get Dynamics + self.get_f, self.get_g = self._get_dynamics() + # Disturbance + self.disturb_mean = np.zeros((3,)) + self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 + + # Build Hazards + self.obs_config = obs_config + self.hazards = [] + if obs_config == 'default': # default + self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([0., 0.])}) + self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([-1., 1.])}) + self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([-1., -1.])}) + self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([1., -1.])}) + self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([1., 1.])}) + elif obs_config == 'test': + # self.build_hazards(obs_config) + self.hazards.append({'type': 'polygon', 'vertices': 0.6*np.array([[-1., -1.], [1., -1], [1., 1.], [-1., 1.]])}) + self.hazards[-1]['vertices'][:, 0] += 0.5 + self.hazards[-1]['vertices'][:, 1] -= 0.5 + self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([1., 1.])}) + self.hazards.append( + {'type': 'polygon', 'vertices': np.array([[0.9, 0.9], [2.1, 2.1], [2.1, 0.9]])}) + else: + n_hazards = 6 + hazard_radius = 0.6 + self.get_random_hazard_locations(n_hazards, hazard_radius) + + # Viewer + self.viewer = None + + + def step(self, action): + """Organize the observation to understand what's going on + + 
Parameters + ---------- + action : ndarray + Action that the agent takes in the environment + + Returns + ------- + new_obs : ndarray + The new observation with the following structure: + [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal] + + """ + + action = np.clip(action, -1.0, 1.0) + state, reward, cost, terminated, truncated, info = self._step(action) + return self.get_obs(), reward, cost, terminated, truncated, info + + def _step(self, action): + """ + + Parameters + ---------- + action + + Returns + ------- + state : ndarray + New internal state of the agent. + reward : float + Reward collected during this transition. + terminated : bool + Whether the episode terminated. + info : dict + Additional info relevant to the environment. + """ + + # Start with our prior for continuous time system x' = f(x) + g(x)u + self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) + self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) #* np.random.multivariate_normal(self.disturb_mean, self.disturb_covar, 1).squeeze() + + self.episode_step += 1 + + info = dict() + + dist_goal = self._goal_dist() + reward = (self.last_goal_dist - dist_goal) # -1e-3 * dist_goal + self.last_goal_dist = dist_goal + # Check if goal is met + terminated = False + if self.goal_met(): + info['goal_met'] = True + reward += self.reward_goal + terminated = True + truncated = self.episode_step >= self.max_episode_steps + + # Include constraint cost in reward (only during training, i.e. 
obs_config=='default') + if self.obs_config == 'default': + info['cost'] = 0 + for hazard in self.hazards: + if hazard['type'] == 'circle': # They should all be circles if 'default' + info['cost'] += 0.1 * (np.sum((self.state[:2] - hazard['location']) ** 2) < hazard['radius'] ** 2) + return self.state, reward, info['cost'], terminated, truncated, info + + def goal_met(self): + """Return true if the current goal is met this step + + Returns + ------- + goal_met : bool + True if the goal condition is met. + + """ + + return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size + + def reset(self, seed=None, options=None): + """ Reset the state of the environment to an initial state. + + Returns + ------- + observation : ndarray + Next observation. + """ + + self.episode_step = 0 + + # Re-initialize state + if self.rand_init: + self.state = np.copy(self.initial_state[np.random.randint(self.initial_state.shape[0])]) + else: + self.state = np.copy(self.initial_state[0]) + + # Re-initialize last goal dist + self.last_goal_dist = self._goal_dist() + + return self.get_obs(), dict() + + def render(self, mode='human', close=False): + """Render the environment to the screen + + Parameters + ---------- + mode : str + close : bool + + Returns + ------- + + """ + + if mode != 'human' and mode != 'rgb_array': + rel_loc = self.goal_pos - self.state[:2] + theta_error = np.arctan2(rel_loc[1], rel_loc[0]) - self.state[2] + print('Ep_step = {}, \tState = {}, \tDist2Goal = {}, alignment_error = {}'.format(self.episode_step, self.state, self._goal_dist(), theta_error)) + + screen_width = 600 + screen_height = 400 + + if self.viewer is None: + from envs import pyglet_rendering + + self.viewer = pyglet_rendering.Viewer(screen_width, screen_height) + # Draw obstacles + obstacles = [] + for i in range(len(self.hazards)): + if self.hazards[i]['type'] == 'circle': + obstacles.append(pyglet_rendering.make_circle(radius=to_pixel(self.hazards[i]['radius'], shift=0), filled=True)) + 
obs_trans = pyglet_rendering.Transform(translation=(to_pixel(self.hazards[i]['location'][0], shift=screen_width/2), to_pixel(self.hazards[i]['location'][1], shift=screen_height/2))) + obstacles[i].set_color(1.0, 0.0, 0.0) + obstacles[i].add_attr(obs_trans) + elif self.hazards[i]['type'] == 'polygon': + obstacles.append(pyglet_rendering.make_polygon(to_pixel(self.hazards[i]['vertices'], shift=[screen_width/2, screen_height/2]), filled=True)) + self.viewer.add_geom(obstacles[i]) + + # Make Goal + goal = pyglet_rendering.make_circle(radius=to_pixel(0.1, shift=0), filled=True) + goal_trans = pyglet_rendering.Transform(translation=(to_pixel(self.goal_pos[0], shift=screen_width/2), to_pixel(self.goal_pos[1], shift=screen_height/2))) + goal.add_attr(goal_trans) + goal.set_color(0.0, 0.5, 0.0) + self.viewer.add_geom(goal) + + # Make Robot + self.robot = pyglet_rendering.make_circle(radius=to_pixel(0.1), filled=True) + self.robot_trans = pyglet_rendering.Transform(translation=(to_pixel(self.state[0], shift=screen_width/2), to_pixel(self.state[1], shift=screen_height/2))) + self.robot_trans.set_rotation(self.state[2]) + self.robot.add_attr(self.robot_trans) + self.robot.set_color(0.5, 0.5, 0.8) + self.viewer.add_geom(self.robot) + self.robot_orientation = pyglet_rendering.Line(start=(0.0, 0.0), end=(15.0, 0.0)) + self.robot_orientation.linewidth.stroke = 2 + self.robot_orientation.add_attr(self.robot_trans) + self.robot_orientation.set_color(0, 0, 0) + self.viewer.add_geom(self.robot_orientation) + + if self.state is None: + return None + + self.robot_trans.set_translation(to_pixel(self.state[0], shift=screen_width/2), to_pixel(self.state[1], shift=screen_height/2)) + self.robot_trans.set_rotation(self.state[2]) + + return self.viewer.render(return_rgb_array=mode == "rgb_array") + + def get_obs(self): + """Given the state, this function returns it to an observation akin to the one obtained by calling env.step + + Parameters + ---------- + + Returns + ------- + observation : 
ndarray + Observation: [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, exp(-dist2goal)] + """ + + rel_loc = self.goal_pos - self.state[:2] + goal_dist = np.linalg.norm(rel_loc) + goal_compass = self.obs_compass() # compass to the goal + + return np.array([self.state[0], self.state[1], np.cos(self.state[2]), np.sin(self.state[2]), goal_compass[0], goal_compass[1], np.exp(-goal_dist)]) + + def _get_dynamics(self): + """Get affine CBFs for a given environment. + + Parameters + ---------- + + Returns + ------- + get_f : callable + Drift dynamics of the continuous system x' = f(x) + g(x)u + get_g : callable + Control dynamics of the continuous system x' = f(x) + g(x)u + """ + + def get_f(state): + f_x = np.zeros(state.shape) + return f_x + + def get_g(state): + theta = state[2] + g_x = np.array([[np.cos(theta), 0], + [np.sin(theta), 0], + [ 0, 1.0]]) + return g_x + + return get_f, get_g + + def obs_compass(self): + """ + Return a robot-centric compass observation of a list of positions. + Compass is a normalized (unit-lenght) egocentric XY vector, + from the agent to the object. + This is equivalent to observing the egocentric XY angle to the target, + projected into the sin/cos space we use for joints. + (See comment on joint observation for why we do this.) 
+ """ + + # Get ego vector in world frame + vec = self.goal_pos - self.state[:2] + # Rotate into frame + R = np.array([[np.cos(self.state[2]), -np.sin(self.state[2])], [np.sin(self.state[2]), np.cos(self.state[2])]]) + vec = np.matmul(vec, R) + # Normalize + vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 + return vec + + def _goal_dist(self): + return np.linalg.norm(self.goal_pos - self.state[:2]) + + def close(self): + if self.viewer: + self.viewer.close() + self.viewer = None + + def get_random_hazard_locations(self, n_hazards: int, hazard_radius: float): + """ + + Parameters + ---------- + n_hazards : int + Number of hazards to create + hazard_radius : float + Radius of hazards + + Returns + ------- + hazards_locs : ndarray + Numpy array of shape (n_hazards, 2) containing xy locations of hazards. + """ + + # Create buffer with boundaries + buffered_bds = np.copy(self.bds) + buffered_bds[0] = buffered_bds[0] + hazard_radius + buffered_bds[1] -= hazard_radius + + hazards = [] + hazards_centers = np.zeros((n_hazards, 2)) + n = 0 # Number of hazards actually placed + for i in range(n_hazards): + successfully_placed = False + iter = 0 + hazard_type = np.random.randint(3) # 0-> Circle 1->Square 2->Triangle + radius = hazard_radius * (1-0.2*2.0*(np.random.random() - 0.5)) + while not successfully_placed and iter < 100: + hazards_centers[n] = (buffered_bds[1] - buffered_bds[0]) * np.random.random(2) + buffered_bds[0] + successfully_placed = np.all(np.linalg.norm(hazards_centers[:n] - hazards_centers[[n]], axis=1) > 3.5*hazard_radius) + successfully_placed = np.logical_and(successfully_placed, np.linalg.norm(self.goal_pos - hazards_centers[n]) > 2.0*hazard_radius) + successfully_placed = np.logical_and(successfully_placed, np.all(np.linalg.norm(self.initial_state[:, :2] - hazards_centers[[n]], axis=1) > 2.0*hazard_radius)) + iter += 1 + if not successfully_placed: + continue + if hazard_type == 0: # Circle + hazards.append({'type': 'circle', 'location': 
hazards_centers[n], 'radius': radius}) + elif hazard_type == 1: # Square + hazards.append({'type': 'polygon', 'vertices': np.array( + [[-radius, -radius], [-radius, radius], [radius, radius], [radius, -radius]])}) + hazards[-1]['vertices'] += hazards_centers[n] + else: # Triangle + hazards.append({'type': 'polygon', 'vertices': np.array( + [[-radius, -radius], [-radius, radius], [radius, radius], [radius, -radius]])}) + # Pick a vertex and delete it + idx = np.random.randint(4) + hazards[-1]['vertices'] = np.delete(hazards[-1]['vertices'], idx, axis=0) + hazards[-1]['vertices'] += hazards_centers[n] + n += 1 + + self.hazards = hazards diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index cd1a0df15..80c68e1be 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -21,6 +21,7 @@ from omnisafe.models.actor.mlp_actor import MLPActor from omnisafe.models.actor.perturbation_actor import PerturbationActor from omnisafe.models.actor.vae_actor import VAE +from omnisafe.models.actor.beta_learning_actor import BetaLearningActor from omnisafe.models.base import Actor from omnisafe.typing import Activation, ActorType, InitFunction, OmnisafeSpace @@ -114,6 +115,14 @@ def build_actor( activation=self._activation, weight_initialization_mode=self._weight_initialization_mode, ) + if actor_type == 'beta': + return BetaLearningActor( + self._obs_space, + self._act_space, + self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, + ) raise NotImplementedError( f'Actor type {actor_type} is not implemented! 
' f'Available actor types are: gaussian_learning, gaussian_sac, mlp, vae, perturbation.', diff --git a/omnisafe/models/actor/beta_learning_actor.py b/omnisafe/models/actor/beta_learning_actor.py new file mode 100644 index 000000000..8f9675934 --- /dev/null +++ b/omnisafe/models/actor/beta_learning_actor.py @@ -0,0 +1,144 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of BetaLearningActor.""" + +from __future__ import annotations + +import torch +import torch.nn as nn +import numpy as np + +from torch.distributions import Distribution, Beta + +from omnisafe.models.actor.gaussian_actor import GaussianActor +from omnisafe.typing import Activation, InitFunction, OmnisafeSpace +from omnisafe.utils.model import build_mlp_network +from omnisafe.models.base import Actor + + +# pylint: disable-next=too-many-instance-attributes +class BetaLearningActor(Actor): + + + _current_dist: Beta + + def __init__( + self, + obs_space: OmnisafeSpace, + act_space: OmnisafeSpace, + hidden_sizes: list[int], + activation: Activation = 'relu', + weight_initialization_mode: InitFunction = 'kaiming_uniform', + ) -> None: + """Initialize an instance of :class:`GaussianLearningActor`.""" + super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode) + + self.mean: nn.Module = build_mlp_network( + 
sizes=[self._obs_dim, self._hidden_sizes[0], self._hidden_sizes[0]], + activation=activation, + output_activation='tanh', + weight_initialization_mode=weight_initialization_mode, + ) + + self.alpha_net: nn.Module = build_mlp_network( + sizes=[self._hidden_sizes[-1], self._act_dim], + activation='identity', + output_activation='softplus', + weight_initialization_mode=weight_initialization_mode, + ) + + self.beta_net: nn.Module = build_mlp_network( + sizes=[self._hidden_sizes[-1], self._act_dim], + activation='identity', + output_activation='softplus', + weight_initialization_mode=weight_initialization_mode, + ) + + def _distribution(self, obs: torch.Tensor) -> Beta: + """Get the distribution of the actor. + + .. warning:: + This method is not supposed to be called by users. You should call :meth:`forward` + instead. + + Args: + obs (torch.Tensor): Observation from environments. + + Returns: + The normal distribution of the mean and standard deviation from the actor. + """ + mean = self.mean(obs) + alphas = 1.0+self.alpha_net(mean) + betas = 1.0+self.beta_net(mean) + return Beta(alphas, betas) + + def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor: + """Predict the action given observation. + + The predicted action depends on the ``deterministic`` flag. + + - If ``deterministic`` is ``True``, the predicted action is the mean of the distribution. + - If ``deterministic`` is ``False``, the predicted action is sampled from the distribution. + + Args: + obs (torch.Tensor): Observation from environments. + deterministic (bool, optional): Whether to use deterministic policy. Defaults to False. + + Returns: + The mean of the distribution if deterministic is True, otherwise the sampled action. + """ + self._current_dist = self._distribution(obs) + self._after_inference = True + if deterministic: + return self._current_dist.mean + return self._current_dist.rsample() + + def forward(self, obs: torch.Tensor) -> Distribution: + """Forward method. 
+ + Args: + obs (torch.Tensor): Observation from environments. + + Returns: + The current distribution. + """ + self._current_dist = self._distribution(obs) + self._after_inference = True + return self._current_dist + + def log_prob(self, act: torch.Tensor) -> torch.Tensor: + """Compute the log probability of the action given the current distribution. + + .. warning:: + You must call :meth:`forward` or :meth:`predict` before calling this method. + + Args: + act (torch.Tensor): Action from :meth:`predict` or :meth:`forward` . + + Returns: + Log probability of the action. + """ + assert self._after_inference, 'log_prob() should be called after predict() or forward()' + self._after_inference = False + return self._current_dist.log_prob(act).sum(axis=-1) + + @property + def std(self) -> float: + """Standard deviation of the distribution.""" + return 1.0 + + @std.setter + def std(self, std: float) -> None: + pass diff --git a/omnisafe/typing.py b/omnisafe/typing.py index bf73b558f..492067e72 100644 --- a/omnisafe/typing.py +++ b/omnisafe/typing.py @@ -39,7 +39,7 @@ AdvatageEstimator = Literal['gae', 'gae-rtg', 'vtrace', 'plain'] InitFunction = Literal['kaiming_uniform', 'xavier_normal', 'glorot', 'xavier_uniform', 'orthogonal'] CriticType = Literal['v', 'q'] -ActorType = Literal['gaussian_learning', 'gaussian_sac', 'mlp', 'vae', 'perturbation'] +ActorType = Literal['gaussian_learning', 'gaussian_sac', 'mlp', 'vae', 'perturbation', 'beta'] DEVICE_CPU = torch.device('cpu') From 025eea9b2c3df2ef6a5f4da4c62d7f4f7b646aba Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Tue, 30 Apr 2024 16:33:04 +0800 Subject: [PATCH 02/18] fix: fix test --- omnisafe/adapter/__init__.py | 2 +- omnisafe/adapter/barrier_function_adapter.py | 55 +- .../adapter/beta_barrier_function_adapter.py | 105 +-- .../offpolicy_barrier_function_adapter.py | 116 +++- .../robust_barrier_function_adapter.py | 126 ++-- omnisafe/algorithms/__init__.py | 8 +- 
omnisafe/algorithms/off_policy/__init__.py | 4 +- omnisafe/algorithms/off_policy/ddpg.py | 23 +- omnisafe/algorithms/off_policy/ddpg_cbf.py | 76 ++- omnisafe/algorithms/off_policy/sac_rcbf.py | 69 +- omnisafe/algorithms/on_policy/__init__.py | 4 +- .../on_policy/barrier_function/__init__.py | 2 +- .../on_policy/barrier_function/ppo_cbf.py | 10 +- .../on_policy/barrier_function/trpo_cbf.py | 49 +- omnisafe/algorithms/on_policy/base/ppo.py | 56 -- omnisafe/common/barrier_comp.py | 23 +- omnisafe/common/barrier_solver.py | 208 +++--- .../common/buffer/vector_onpolicy_buffer.py | 2 +- omnisafe/common/robust_barrier_solver.py | 513 +++++++-------- omnisafe/common/robust_gp_model.py | 601 ++++++++---------- omnisafe/common/utils.py | 218 +------ omnisafe/configs/off-policy/DDPGCBF.yaml | 6 +- omnisafe/configs/off-policy/SACRCBF.yaml | 24 +- omnisafe/configs/on-policy/TRPO.yaml | 2 +- omnisafe/configs/on-policy/TRPOCBF.yaml | 2 +- omnisafe/envs/__init__.py | 2 + omnisafe/envs/barrier_function_env.py | 64 +- omnisafe/envs/robust_barrier_function_env.py | 71 +-- omnisafe/envs/unicycle_env.py | 401 +++++++----- omnisafe/evaluator.py | 71 +++ omnisafe/models/actor/actor_builder.py | 2 +- omnisafe/models/actor/beta_learning_actor.py | 25 +- pyproject.toml | 5 + requirements.txt | 5 + 34 files changed, 1482 insertions(+), 1468 deletions(-) diff --git a/omnisafe/adapter/__init__.py b/omnisafe/adapter/__init__.py index 75d4539ba..02dab6709 100644 --- a/omnisafe/adapter/__init__.py +++ b/omnisafe/adapter/__init__.py @@ -14,6 +14,7 @@ # ============================================================================== """Adapter for the environment and the algorithm.""" +from omnisafe.adapter.beta_barrier_function_adapter import BetaBarrierFunctionAdapter from omnisafe.adapter.early_terminated_adapter import EarlyTerminatedAdapter from omnisafe.adapter.modelbased_adapter import ModelBasedAdapter from omnisafe.adapter.offline_adapter import OfflineAdapter @@ -22,4 +23,3 @@ from 
omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter from omnisafe.adapter.saute_adapter import SauteAdapter from omnisafe.adapter.simmer_adapter import SimmerAdapter -from omnisafe.adapter.beta_barrier_function_adapter import BetaBarrierFunctionAdapter diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py index 47fa9b871..735ff690e 100644 --- a/omnisafe/adapter/barrier_function_adapter.py +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -20,26 +20,20 @@ from rich.progress import track from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.buffer import VectorOnPolicyBuffer from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import AutoReset, CostNormalize, RewardNormalize, TimeLimit, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic from omnisafe.utils.config import Config -from omnisafe.common.barrier_solver import PendulumSolver -from omnisafe.common.barrier_comp import BarrierCompensator -from omnisafe.envs.wrapper import ( - AutoReset, - CostNormalize, - RewardNormalize, - TimeLimit, - Unsqueeze, -) class BarrierFunctionAdapter(OnPolicyAdapter): """BarrierFunction Adapter for OmniSafe. - The BarrierFunction Adapter is used to establish the logic of interaction between agents and the - environment based on control barrier functions. Its key feature is the introduction of action + The BarrierFunction Adapter is used to establish the logic of interaction between agents and the + environment based on control barrier functions. Its key feature is the introduction of action compensators and barrier function solvers. Args: @@ -63,10 +57,10 @@ def _wrapper( cost_normalize: bool = True, ) -> None: """Wrapper the environment. - + .. 
warning:: - Since solving the optimization problem requires obtaining physical quantities with practical - significance from state observations, the Barrier Function Adapter does not support + Since solving the optimization problem requires obtaining physical quantities with practical + significance from state observations, the Barrier Function Adapter does not support normalization of observations. Args: @@ -89,15 +83,15 @@ def _wrapper( self._env = Unsqueeze(self._env, device=self._device) self._eval_env = Unsqueeze(self._eval_env, device=self._device) - def set_solver(self, solver: PendulumSolver): + def set_solver(self, solver: PendulumSolver) -> None: """Set the barrier function solver for Pendulum environment.""" self.solver: PendulumSolver = solver - - def set_compensator(self, compensator: BarrierCompensator): + + def set_compensator(self, compensator: BarrierCompensator) -> None: """Set the action compensator.""" self.compensator: BarrierCompensator = compensator - def reset_gp_model(self): + def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" self.solver.GP_model_prev = self.solver.GP_model.copy() self.solver.build_GP_model() @@ -111,10 +105,6 @@ def rollout( # pylint: disable=too-many-locals ) -> None: """Rollout the environment and store the data in the buffer. - .. warning:: - As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically, - so the final observation will be stored in ``info['final_observation']``. - Args: steps_per_epoch (int): Number of steps per epoch. 
agent (ConstraintActorCritic): Constraint actor-critic, including actor , reward critic @@ -143,17 +133,23 @@ def rollout( # pylint: disable=too-many-locals approx_compensating_act = self.compensator(obs=obs) compensated_act_mean_raw = act_mean + approx_compensating_act - + if self.first_iter: - [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = False) + [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model=False) else: - [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = True) - - compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) + [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model=True) + + compensating_act = self.solver.control_barrier( + compensated_act_mean_raw, + f, + g, + x, + std, + ) compensated_act_mean = compensated_act_mean_raw + compensating_act final_act = torch.normal(compensated_act_mean, act_std) - + logp = agent.actor.log_prob(final_act).detach() path_obs.append(obs.detach().cpu().squeeze().numpy()) path_act.append(final_act.detach().cpu().squeeze().numpy()) @@ -207,7 +203,7 @@ def rollout( # pylint: disable=too-many-locals self._ep_len[idx] = 0.0 if step < 650: - self.solver.update_GP_dynamics(obs = path_obs, act = path_act) + self.solver.update_GP_dynamics(obs=path_obs, act=path_act) path_obs = [] path_act = [] @@ -216,4 +212,3 @@ def rollout( # pylint: disable=too-many-locals obs, _ = self._env.reset() buffer.finish_path(last_value_r, last_value_c, idx) self.first_iter = 0 - diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index f785c3062..ee8ccc298 100644 --- a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -16,84 +16,86 @@ from __future__ import annotations -import torch import numpy as np +import torch from rich.progress import track from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter from omnisafe.common.buffer 
import VectorOnPolicyBuffer from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import CostNormalize, RewardNormalize, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic from omnisafe.utils.config import Config -from omnisafe.common.barrier_solver import PendulumSolver -from omnisafe.common.barrier_comp import BarrierCompensator - -from omnisafe.envs.wrapper import ( - AutoReset, - CostNormalize, - RewardNormalize, - TimeLimit, - Unsqueeze, -) -def cbf(state=None, eta: float = 0.99): +def cbf(state: np.ndarray | None = None, eta: float = 0.99) -> tuple[np.ndarray, np.ndarray]: """ Calculates CBF constraint set at a given state. Default is the current state. """ - - state = state g = 9.8 m = 1 - l = 1 + length = 1 tau = 5e-2 theta_safety_bounds = [-1.0, 1.0] thetadot_safety_bounds = [-np.inf, np.inf] torque_bounds = [-15.0, 15.0] - if (eta>1-1e-3) or (eta<1e-5): - raise ValueError("eta should be inside (0, 1)") - c1 = ((3 * g)/(2 * l)) - c2 = (3 /(m * (l ** 2))) + if (eta > 1 - 1e-3) or (eta < 1e-5): + raise ValueError('eta should be inside (0, 1)') + c1 = (3 * g) / (2 * length) + c2 = 3 / (m * (length**2)) theta, thetadot = state[0], state[1] theta_min, theta_max = theta_safety_bounds[0], theta_safety_bounds[1] thetadot_min, thetadot_max = thetadot_safety_bounds[0], thetadot_safety_bounds[1] - u_min1 = (1/c2) * (((1 / (tau **2)) * (-eta * (theta - theta_min) - tau * thetadot)) - c1 * np.sin(theta) ) - u_max1 = (1/c2) * (((1 / (tau **2)) * ( eta * (theta_max - theta) - tau * thetadot)) - c1 * np.sin(theta) ) + u_min1 = (1 / c2) * ( + ((1 / (tau**2)) * (-eta * (theta - theta_min) - tau * thetadot)) - c1 * np.sin(theta) + ) + u_max1 = (1 / c2) * ( + ((1 / (tau**2)) * (eta * (theta_max - theta) - tau * thetadot)) - c1 * np.sin(theta) + ) - - u_min2 = (1/c2) * (((1 / (tau)) * (-eta * (thetadot - thetadot_min))) - c1 * np.sin(theta) ) - u_max2 = (1/c2) * (((1 / (tau)) * ( eta * (thetadot_max - thetadot))) - 
c1 * np.sin(theta) ) + u_min2 = (1 / c2) * (((1 / (tau)) * (-eta * (thetadot - thetadot_min))) - c1 * np.sin(theta)) + u_max2 = (1 / c2) * (((1 / (tau)) * (eta * (thetadot_max - thetadot))) - c1 * np.sin(theta)) u_min = max(u_min1, u_min2, torque_bounds[0]) u_max = min(u_max1, u_max2, torque_bounds[1]) - - u_min=torque_bounds[0] - u_max=torque_bounds[1] - if u_min>u_max: - raise ValueError("Infeasible") - else: - return [u_min, u_max] - -def vectorize_f(f): #--vipul :added action_dim - """ - Converts a function f defined on 1D numpy arrays and outputting pairs of - scalars into a vectorized function accepting batches of - torch tensorized arrays and output pairs of torch tensors. + + u_min = torque_bounds[0] + u_max = torque_bounds[1] + + return [u_min, u_max] + + +def vectorize_f(f: callable) -> callable: + """Converts a function `f` that operates on 1D numpy arrays and outputs pairs of scalars, + into a vectorized function that accepts batches of torch tensorized arrays and outputs + pairs of torch tensors. + + Args: + f (callable): A function that accepts 1D numpy arrays and returns a tuple (lower_bound, upper_bound), where both are scalars. + + Returns: + callable: A vectorized function that can process batches of torch tensors and return pairs of torch tensors. """ - def vectorized_f_(obs): #--vipul :added action_dim + def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """ + Inner function to process the torch tensor batch. + + Args: + obs (torch.Tensor): A batch of observations as torch tensors. + Returns: + tuple: Two torch tensors representing the lower and upper bounds for each observation in the batch. 
+ """ obs = obs.cpu().detach().numpy() - if len(obs.shape) == 1: # check to see if obs is a batch or single obs + if len(obs.shape) == 1: batch_size = 1 lbs, ubs = f(obs) - lbs=np.array(lbs) - ubs=np.array(ubs) - #lbs = -5 - #ubs = 5 + lbs = np.array(lbs) + ubs = np.array(ubs) else: batch_size = obs.shape[0] @@ -104,7 +106,7 @@ def vectorized_f_(obs): #--vipul :added action_dim lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) - + return lbs, ubs return vectorized_f_ @@ -113,8 +115,8 @@ def vectorized_f_(obs): #--vipul :added action_dim class BetaBarrierFunctionAdapter(OnPolicyAdapter): """BarrierFunction Adapter for OmniSafe. - The BarrierFunction Adapter is used to establish the logic of interaction between agents and the - environment based on control barrier functions. Its key feature is the introduction of action + The BarrierFunction Adapter is used to establish the logic of interaction between agents and the + environment based on control barrier functions. Its key feature is the introduction of action compensators and barrier function solvers. Args: @@ -139,10 +141,10 @@ def _wrapper( cost_normalize: bool = True, ) -> None: """Wrapper the environment. - + .. warning:: - Since solving the optimization problem requires obtaining physical quantities with practical - significance from state observations, the Barrier Function Adapter does not support + Since solving the optimization problem requires obtaining physical quantities with practical + significance from state observations, the Barrier Function Adapter does not support normalization of observations. 
Args: @@ -190,10 +192,10 @@ def rollout( # pylint: disable=too-many-locals with torch.no_grad(): act, value_r, value_c, logp = agent.step(obs) lb, ub = self.constraint_fn(obs) - final_act = lb + (ub-lb)*act + final_act = lb + (ub - lb) * act next_obs, reward, cost, terminated, truncated, info = self.step(final_act) - + self._log_value(reward=reward, cost=cost, info=info) if self._cfgs.algo_cfgs.use_cost: @@ -242,4 +244,3 @@ def rollout( # pylint: disable=too-many-locals obs, _ = self._env.reset() buffer.finish_path(last_value_r, last_value_c, idx) self.first_iter = 0 - diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py index b05e950cb..e1353884b 100644 --- a/omnisafe/adapter/offpolicy_barrier_function_adapter.py +++ b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -17,23 +17,17 @@ from __future__ import annotations import torch -import numpy as np +from sklearn.gaussian_process import GaussianProcessRegressor from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.buffer import VectorOffPolicyBuffer from omnisafe.common.logger import Logger -from omnisafe.utils.config import Config -from omnisafe.common.barrier_solver import PendulumSolver -from omnisafe.common.robust_barrier_solver import CBFQPLayer -from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.envs.wrapper import CostNormalize, RewardNormalize, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic -from omnisafe.common.robust_gp_model import DynamicsModel +from omnisafe.utils.config import Config -from omnisafe.envs.wrapper import ( - CostNormalize, - RewardNormalize, - Unsqueeze, -) class OffPolicyBarrierFunctionAdapter(OffPolicyAdapter): @@ -64,18 +58,58 @@ def _wrapper( self._env = Unsqueeze(self._env, 
device=self._device) self._eval_env = Unsqueeze(self._eval_env, device=self._device) - def set_solver(self, solver: PendulumSolver): + def eval_policy( # pylint: disable=too-many-locals + self, + episode: int, + agent: ConstraintActorQCritic, + logger: Logger, + ) -> None: + """Rollout the environment in an evaluation environment. + + Args: + episode (int): Number of episodes. + agent (ConstraintActorCritic): Agent. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. + """ + for _ in range(episode): + ep_ret, ep_cost, ep_len = 0.0, 0.0, 0 + obs, _ = self._eval_env.reset() + obs = obs.to(self._device) + + done = False + while not done: + act = agent.step(obs, deterministic=True) + final_act = self.get_safe_action(obs=obs, act=act, is_eval=True) + obs, reward, cost, terminated, truncated, info = self._eval_env.step(final_act) + obs, reward, cost, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, cost, terminated, truncated) + ) + ep_ret += info.get('original_reward', reward).cpu() + ep_cost += info.get('original_cost', cost).cpu() + ep_len += 1 + done = bool(terminated[0].item()) or bool(truncated[0].item()) + + logger.store( + { + 'Metrics/TestEpRet': ep_ret, + 'Metrics/TestEpCost': ep_cost, + 'Metrics/TestEpLen': ep_len, + }, + ) + + def set_solver(self, solver: PendulumSolver) -> None: """Set the barrier function solver for Pendulum environment.""" self.solver: PendulumSolver = solver - - def set_compensator(self, compensator: BarrierCompensator): + + def set_compensator(self, compensator: BarrierCompensator) -> None: """Set the action compensator.""" self.compensator: BarrierCompensator = compensator - def reset_gp_model(self): + def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" - self.solver.GP_model_prev = self.solver.GP_model.copy() - self.solver.build_GP_model() + self.solver.gp_model_prev = self.solver.gp_model.copy() + 
self.solver.build_gp_model() def rollout( # pylint: disable=too-many-locals self, @@ -87,18 +121,16 @@ def rollout( # pylint: disable=too-many-locals ) -> None: for _ in range(rollout_step): if use_rand_action: - act = torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)).unsqueeze(0) + act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device) else: act = agent.actor.predict(self._current_obs, deterministic=False) - + final_act = self.get_safe_action(obs=self._current_obs, act=act) self.episode_rollout['obs'].append(self._current_obs) self.episode_rollout['final_act'].append(final_act) next_obs, reward, cost, terminated, truncated, info = self.step(final_act) - logger.store({'Metrics/angle': cost}) - self._log_value(reward=reward, cost=cost, info=info) buffer.store( @@ -115,18 +147,21 @@ def rollout( # pylint: disable=too-many-locals if done: self._log_metrics(logger, idx) compensator_loss = self.compensator.train( - torch.cat(self.episode_rollout['obs']), - torch.cat(self.episode_rollout['approx_compensating_act']), + torch.cat(self.episode_rollout['obs']), + torch.cat(self.episode_rollout['approx_compensating_act']), torch.cat(self.episode_rollout['compensating_act']), - ) + ) logger.store({'Value/Loss_compensator': compensator_loss.item()}) - self.solver.update_GP_dynamics(obs=torch.cat(self.episode_rollout['obs']), act=torch.cat(self.episode_rollout['final_act'])) - + self.solver.update_gp_dynamics( + obs=torch.cat(self.episode_rollout['obs']), + act=torch.cat(self.episode_rollout['final_act']), + ) + self.episode_rollout['obs'] = [] self.episode_rollout['final_act'] = [] self.episode_rollout['approx_compensating_act'] = [] self.episode_rollout['compensating_act'] = [] - + self._reset_log(idx) self._current_obs, _ = self._env.reset() self.first_iter = 0 @@ -134,18 +169,29 @@ def rollout( # pylint: disable=too-many-locals self.reset_gp_model() @torch.no_grad - def get_safe_action(self, obs, act): + def 
get_safe_action( + self, + obs: torch.Tensor, + act: torch.Tensor, + is_eval: bool = False, + ) -> torch.Tensor: approx_compensating_act = self.compensator(obs=self._current_obs) compensated_act_mean_raw = act + approx_compensating_act - + if self.first_iter: - [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = False) + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) else: - [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model = True) - + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=True) + compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) safe_act = compensated_act_mean_raw + compensating_act - self.episode_rollout['compensating_act'].append(compensating_act) - self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) - return safe_act \ No newline at end of file + if not is_eval: + self.episode_rollout['compensating_act'].append(compensating_act) + self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) + + return safe_act + + @property + def gp_models(self) -> list[GaussianProcessRegressor]: + return self.solver.gp_models diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py index f58f1e176..843676c7f 100644 --- a/omnisafe/adapter/robust_barrier_function_adapter.py +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -16,25 +16,20 @@ from __future__ import annotations +from typing import Any + import torch -import numpy as np from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter from omnisafe.common.buffer import VectorOffPolicyBuffer from omnisafe.common.logger import Logger -from omnisafe.utils.config import Config from omnisafe.common.robust_barrier_solver import CBFQPLayer -from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.robust_gp_model import DynamicsModel +from omnisafe.envs.wrapper import 
CostNormalize, RewardNormalize, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic from omnisafe.typing import OmnisafeSpace -from omnisafe.common.robust_gp_model import DynamicsModel - +from omnisafe.utils.config import Config -from omnisafe.envs.wrapper import ( - CostNormalize, - RewardNormalize, - Unsqueeze, -) class RobustBarrierFunctionAdapter(OffPolicyAdapter): @@ -53,10 +48,10 @@ def _wrapper( cost_normalize: bool = True, ) -> None: """Wrapper the environment. - + .. warning:: - Since solving the optimization problem requires obtaining physical quantities with practical - significance from state observations, the Barrier Function Adapter does not support + Since solving the optimization problem requires obtaining physical quantities with practical + significance from state observations, the Barrier Function Adapter does not support normalization of observations. Args: @@ -72,19 +67,56 @@ def _wrapper( if self._env.num_envs == 1: self._env = Unsqueeze(self._env, device=self._device) self._eval_env = Unsqueeze(self._eval_env, device=self._device) - # self._env = ActionScale(self._env, low=-1.0, high=1.0, device=self._device) - # self._eval_env = ActionScale(self._eval_env, low=-1.0, high=1.0, device=self._device) - - def set_solver(self, solver: CBFQPLayer): + + def set_solver(self, solver: CBFQPLayer) -> None: """Set the barrier function solver for Pendulum environment.""" self.solver: CBFQPLayer = solver self.solver.env = self._env - def set_dynamics_model(self, dynamics_model: DynamicsModel): + def set_dynamics_model(self, dynamics_model: DynamicsModel) -> None: """Set the dynamics model.""" self.dynamics_model = dynamics_model self.dynamics_model.env = self._env + def eval_policy( # pylint: disable=too-many-locals + self, + episode: int, + agent: ConstraintActorQCritic, + logger: Logger, + ) -> None: + """Rollout the environment with deterministic agent action. + + Args: + episode (int): Number of episodes. 
+ agent (ConstraintActorCritic): Agent. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. + """ + for _ in range(episode): + ep_ret, ep_cost, ep_len = 0.0, 0.0, 0 + obs, _ = self._eval_env.reset() + obs = obs.to(self._device) + + done = False + while not done: + act = agent.step(obs, deterministic=True) + obs, reward, cost, terminated, truncated, info = self._eval_env.step(act) + obs, reward, cost, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, cost, terminated, truncated) + ) + ep_ret += info.get('original_reward', reward).cpu() + ep_cost += info.get('original_cost', cost).cpu() + ep_len += 1 + done = bool(terminated[0].item()) or bool(truncated[0].item()) + + logger.store( + { + 'Metrics/TestEpRet': ep_ret, + 'Metrics/TestEpCost': ep_cost, + 'Metrics/TestEpLen': ep_len, + }, + ) + def rollout( # pylint: disable=too-many-locals self, rollout_step: int, @@ -108,14 +140,15 @@ def rollout( # pylint: disable=too-many-locals use_rand_action (bool): Whether to use random action. 
""" for _ in range(rollout_step): - state = self.dynamics_model.get_state(self._current_obs) # 动态模型将观测转换为状态,状态和观测之间有一个互逆的转换 + state = self.dynamics_model.get_state(self._current_obs) self._current_steps += 1 if use_rand_action: - act = torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)).unsqueeze(0).to(self._device) + act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device) else: act = agent.step(self._current_obs, deterministic=False) final_act = self.get_safe_action(obs=self._current_obs, act=act) + next_obs, reward, cost, terminated, truncated, info = self.step(final_act) self._log_value(reward=reward, cost=cost, info=info) @@ -127,11 +160,18 @@ def rollout( # pylint: disable=too-many-locals done=torch.logical_and(terminated, torch.logical_xor(terminated, truncated)), next_obs=next_obs, ) - - if self._ep_len[0] % 2 == 0 and self._num_episodes < self._cfgs.dynamics_model_cfgs.gp_max_episodes: + + if ( + self._ep_len[0] % 2 == 0 + and self._num_episodes < self._cfgs.dynamics_model_cfgs.gp_max_episodes + ): next_state = self.dynamics_model.get_state(next_obs) - self.dynamics_model.append_transition(state.cpu().detach().numpy(), final_act.cpu().detach().numpy(), next_state.cpu().detach().numpy(), t_batch=np.array([self._ep_len[0]*self._env.dt])) - + self.dynamics_model.append_transition( + state.cpu().detach().numpy(), + final_act.cpu().detach().numpy(), + next_state.cpu().detach().numpy(), + ) + self._current_obs = next_obs for idx, done in enumerate(torch.logical_or(terminated, truncated)): if done: @@ -139,36 +179,24 @@ def rollout( # pylint: disable=too-many-locals self._reset_log(idx) self._num_episodes += 1 self._current_obs, _ = self._env.reset() - + @property def safe_action_space(self) -> OmnisafeSpace: if hasattr(self._env, 'safe_action_space'): return self._env.safe_action_space - else: - return self._env.action_space - - def get_safe_action(self, obs, act, modular=False, cbf_info_batch=None): - 
"""Given a nominal action, returns a minimally-altered safe action to take. - - Parameters - ---------- - obs : torch.tensor - act : torch.tensor - dynamics_model : DynamicsModel - - Returns - ------- - safe_act : torch.tensor - Safe actions to be taken (cbf_action + action). - """ + return self._env.action_space + + def get_safe_action(self, obs: torch.Tensor, act: torch.Tensor) -> torch.Tensor: + state_batch = self.dynamics_model.get_state(obs) mean_pred_batch, sigma_pred_batch = self.dynamics_model.predict_disturbance(state_batch) - safe_act = self.solver.get_safe_action(state_batch, act, mean_pred_batch, sigma_pred_batch, modular=modular, cbf_info_batch=cbf_info_batch) - return safe_act + return self.solver.get_safe_action( + state_batch, + act, + mean_pred_batch, + sigma_pred_batch, + ) - def __getattr__(self, name): - try: - return getattr(self._env, name) - except AttributeError: - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") \ No newline at end of file + def __getattr__(self, name: str) -> Any: + return getattr(self._env, name) diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py index f25928ad2..da82ecbea 100644 --- a/omnisafe/algorithms/__init__.py +++ b/omnisafe/algorithms/__init__.py @@ -27,16 +27,16 @@ from omnisafe.algorithms.off_policy import ( CRABS, DDPG, + DDPGCBF, DDPGPID, SAC, SACPID, + SACRCBF, TD3, TD3PID, DDPGLag, SACLag, TD3Lag, - SACRCBF, - DDPGCBF, ) # Offline Safe @@ -53,10 +53,12 @@ PPO, RCPO, TRPO, + TRPOCBF, TRPOPID, NaturalPG, OnCRPO, PolicyGradient, + PPOBetaCBF, PPOEarlyTerminated, PPOLag, PPOSaute, @@ -65,8 +67,6 @@ TRPOLag, TRPOSaute, TRPOSimmerPID, - TRPOCBF, - PPOBetaCBF, ) diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py index e87bd82f2..5a297c49f 100644 --- a/omnisafe/algorithms/off_policy/__init__.py +++ b/omnisafe/algorithms/off_policy/__init__.py @@ -16,16 +16,16 @@ from omnisafe.algorithms.off_policy.crabs 
import CRABS from omnisafe.algorithms.off_policy.ddpg import DDPG +from omnisafe.algorithms.off_policy.ddpg_cbf import DDPGCBF from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag from omnisafe.algorithms.off_policy.ddpg_pid import DDPGPID from omnisafe.algorithms.off_policy.sac import SAC from omnisafe.algorithms.off_policy.sac_lag import SACLag from omnisafe.algorithms.off_policy.sac_pid import SACPID +from omnisafe.algorithms.off_policy.sac_rcbf import SACRCBF from omnisafe.algorithms.off_policy.td3 import TD3 from omnisafe.algorithms.off_policy.td3_lag import TD3Lag from omnisafe.algorithms.off_policy.td3_pid import TD3PID -from omnisafe.algorithms.off_policy.sac_rcbf import SACRCBF -from omnisafe.algorithms.off_policy.ddpg_cbf import DDPGCBF __all__ = ['DDPG', 'TD3', 'SAC', 'DDPGLag', 'TD3Lag', 'SACLag', 'DDPGPID', 'TD3PID', 'SACPID', 'SACRCBF', 'DDPGCBF', 'CRABS'] diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 517d8c0be..f0c633220 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -188,13 +188,7 @@ def _init_log(self) -> None: config=self._cfgs, ) - what_to_save: dict[str, Any] = {} - what_to_save['pi'] = self._actor_critic.actor - if self._cfgs.algo_cfgs.obs_normalize: - obs_normalizer = self._env.save()['obs_normalizer'] - what_to_save['obs_normalizer'] = obs_normalizer - - self._logger.setup_torch_saver(what_to_save) + self._log_what_to_save() self._logger.torch_save() self._logger.register_key( @@ -338,6 +332,7 @@ def learn(self) -> tuple[float, float, float]: # save model to disk if (epoch + 1) % self._cfgs.logger_cfgs.save_model_freq == 0: self._logger.torch_save() + self._specific_save() ep_ret = self._logger.get_stats('Metrics/EpRet')[0] ep_cost = self._logger.get_stats('Metrics/EpCost')[0] @@ -562,3 +557,17 @@ def _log_when_not_update(self) -> None: 'Value/cost_critic': 0.0, }, ) + + def _log_what_to_save(self) -> dict[str, Any]: + """Define 
what need to be saved below.""" + what_to_save: dict[str, Any] = {} + + what_to_save['pi'] = self._actor_critic.actor + if self._cfgs.algo_cfgs.obs_normalize: + obs_normalizer = self._env.save()['obs_normalizer'] + what_to_save['obs_normalizer'] = obs_normalizer + + self._logger.setup_torch_saver(what_to_save) + + def _specific_save(self) -> None: + """Save some algorithms specific models per epoch.""" diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py index 12692db67..ad1306d5b 100644 --- a/omnisafe/algorithms/off_policy/ddpg_cbf.py +++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py @@ -14,13 +14,21 @@ # ============================================================================== """Implementation of the DDPG algorithm with Control Barrier Function.""" + +from __future__ import annotations + +import os + +import joblib import torch +from omnisafe.adapter.offpolicy_barrier_function_adapter import OffPolicyBarrierFunctionAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.common.barrier_solver import PendulumSolver -from omnisafe.adapter.offpolicy_barrier_function_adapter import OffPolicyBarrierFunctionAdapter from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.typing import Any +from omnisafe.utils.distributed import get_rank @registry.register @@ -35,7 +43,7 @@ class DDPGCBF(DDPG): """ def _init_env(self) -> None: - self._env: OffPolicyBarrierFunctionAdapter=OffPolicyBarrierFunctionAdapter( + self._env: OffPolicyBarrierFunctionAdapter = OffPolicyBarrierFunctionAdapter( self._env_id, self._cfgs.train_cfgs.vector_env_nums, self._seed, @@ -46,11 +54,11 @@ def _init_env(self) -> None: obs_dim=self._env.observation_space.shape[0], act_dim=self._env.action_space.shape[0], cfgs=self._cfgs.compensator_cfgs, - ) - + ).to(self._device) + 
self._env.set_compensator(compensator=compensator) self._env.set_solver(solver=solver) - + assert ( self._cfgs.algo_cfgs.steps_per_epoch % self._cfgs.train_cfgs.vector_env_nums == 0 ), 'The number of steps per epoch is not divisible by the number of environments.' @@ -58,26 +66,34 @@ def _init_env(self) -> None: assert ( int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0 ), 'The total number of steps is not divisible by the number of steps per epoch.' - self._epochs: int=int( + self._epochs: int = int( self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch, ) - self._epoch: int=0 - self._steps_per_epoch: int=( + self._epoch: int = 0 + self._steps_per_epoch: int = ( self._cfgs.algo_cfgs.steps_per_epoch // self._cfgs.train_cfgs.vector_env_nums ) - self._update_cycle: int=self._cfgs.algo_cfgs.update_cycle + self._update_cycle: int = self._cfgs.algo_cfgs.update_cycle assert ( self._steps_per_epoch % self._update_cycle == 0 ), 'The number of steps per epoch is not divisible by the number of steps per sample.' - self._samples_per_epoch: int=self._steps_per_epoch // self._update_cycle - self._update_count: int=0 - + self._samples_per_epoch: int = self._steps_per_epoch // self._update_cycle + self._update_count: int = 0 + def _init(self) -> None: super()._init() - self._buf.add_field(name='approx_compensating_act', shape=self._env.action_space.shape, dtype=torch.float32) - self._buf.add_field(name='compensating_act', shape=self._env.action_space.shape, dtype=torch.float32) - + self._buf.add_field( + name='approx_compensating_act', + shape=self._env.action_space.shape, + dtype=torch.float32, + ) + self._buf.add_field( + name='compensating_act', + shape=self._env.action_space.shape, + dtype=torch.float32, + ) + def _init_log(self) -> None: # """Log the DDPGRCBF specific information. 
@@ -88,6 +104,28 @@ def _init_log(self) -> None: # +----------------------------+--------------------------+ # """ super()._init_log() - if self._cfgs.env_id == 'Pendulum-v1': - self._logger.register_key('Metrics/angle', min_and_max=True) - self._logger.register_key('Value/Loss_compensator') \ No newline at end of file + self._logger.register_key('Value/Loss_compensator') + + def _specific_save(self) -> None: + """Save some algorithms specific models per epoch.""" + super()._specific_save() + if get_rank() == 0: + path = os.path.join( + self._logger.log_dir, + 'gp_model_save', + f'gaussian_process_regressor_{self._logger.current_epoch}.pkl', + ) + os.makedirs(os.path.dirname(path), exist_ok=True) + joblib.dump(self._env.gp_models, path) + + def _log_what_to_save(self) -> dict[str, Any]: + """Define what need to be saved below.""" + what_to_save: dict[str, Any] = {} + + what_to_save['pi'] = self._actor_critic.actor + what_to_save['compensator'] = self._env.compensator + if self._cfgs.algo_cfgs.obs_normalize: + obs_normalizer = self._env.save()['obs_normalizer'] + what_to_save['obs_normalizer'] = obs_normalizer + + self._logger.setup_torch_saver(what_to_save) diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py index e980025c3..9fbd20a39 100644 --- a/omnisafe/algorithms/off_policy/sac_rcbf.py +++ b/omnisafe/algorithms/off_policy/sac_rcbf.py @@ -15,16 +15,20 @@ """Implementation of the Soft Actor-Critic algorithm with Robust Control Barrier Function.""" +from __future__ import annotations + +import os + import torch from torch import nn from torch.nn.utils.clip_grad import clip_grad_norm_ +from omnisafe.adapter.robust_barrier_function_adapter import RobustBarrierFunctionAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.sac import SAC from omnisafe.common.robust_barrier_solver import CBFQPLayer -from omnisafe.adapter.robust_barrier_function_adapter import RobustBarrierFunctionAdapter 
-from omnisafe.common.barrier_comp import BarrierCompensator from omnisafe.common.robust_gp_model import DynamicsModel +from omnisafe.utils.distributed import get_rank @registry.register @@ -39,24 +43,24 @@ class SACRCBF(SAC): """ def _init_env(self) -> None: - self._env: RobustBarrierFunctionAdapter=RobustBarrierFunctionAdapter( + self._env: RobustBarrierFunctionAdapter = RobustBarrierFunctionAdapter( self._env_id, self._cfgs.train_cfgs.vector_env_nums, self._seed, self._cfgs, ) - solver=CBFQPLayer( + solver = CBFQPLayer( env=self._env, device=self._cfgs.train_cfgs.device, gamma_b=self._cfgs.cbf_cfgs.gamma_b, k_d=self._cfgs.cbf_cfgs.k_d, l_p=self._cfgs.cbf_cfgs.l_p, ) - dynamics_model=DynamicsModel(env=self._env) - + dynamics_model = DynamicsModel(env=self._env) + self._env.set_dynamics_model(dynamics_model=dynamics_model) self._env.set_solver(solver=solver) - + assert ( self._cfgs.algo_cfgs.steps_per_epoch % self._cfgs.train_cfgs.vector_env_nums == 0 ), 'The number of steps per epoch is not divisible by the number of environments.' @@ -64,33 +68,20 @@ def _init_env(self) -> None: assert ( int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0 ), 'The total number of steps is not divisible by the number of steps per epoch.' - self._epochs: int=int( + self._epochs: int = int( self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch, ) - self._epoch: int=0 - self._steps_per_epoch: int=( + self._epoch: int = 0 + self._steps_per_epoch: int = ( self._cfgs.algo_cfgs.steps_per_epoch // self._cfgs.train_cfgs.vector_env_nums ) - self._update_cycle: int=self._cfgs.algo_cfgs.update_cycle + self._update_cycle: int = self._cfgs.algo_cfgs.update_cycle assert ( self._steps_per_epoch % self._update_cycle == 0 ), 'The number of steps per epoch is not divisible by the number of steps per sample.' 
- self._samples_per_epoch: int=self._steps_per_epoch // self._update_cycle - self._update_count: int=0 - - def _init_log(self) -> None: - # """Log the SACRCBF specific information. - - # +----------------------------+--------------------------+ - # | Things to log | Description | - # +============================+==========================+ - # | Metrics/LagrangeMultiplier | The Lagrange multiplier. | - # +----------------------------+--------------------------+ - # """ - super()._init_log() - if self._cfgs.env_id == 'Pendulum-v1': - self._logger.register_key('Metrics/angle', min_and_max=True) + self._samples_per_epoch: int = self._steps_per_epoch // self._update_cycle + self._update_count: int = 0 def _update_actor( self, @@ -163,7 +154,7 @@ def _update_reward_critic( 'Value/reward_critic': q1_value_r.mean().item(), }, ) - + def _loss_pi( self, obs: torch.Tensor, @@ -172,4 +163,26 @@ def _loss_pi( action = self._env.get_safe_action(obs, action) log_prob = self._actor_critic.actor.log_prob(action) q1_value_r, q2_value_r = self._actor_critic.reward_critic(obs, action) - return (self._alpha * log_prob - torch.min(q1_value_r, q2_value_r)).mean() \ No newline at end of file + return (self._alpha * log_prob - torch.min(q1_value_r, q2_value_r)).mean() + + def _specific_save(self) -> None: + """Save some algorithms specific models per epoch.""" + super()._specific_save() + if get_rank() == 0: + path = os.path.join(self._logger.log_dir, 'gp_model_save') + os.makedirs(path, exist_ok=True) + train_x = self._env.dynamics_model.train_x + train_y = self._env.dynamics_model.train_y + disturb_estimators = self._env.dynamics_model.disturb_estimators + weights = [] + for i in range(len(disturb_estimators)): + weights.append(disturb_estimators[i].model.state_dict()) + torch.save(weights, os.path.join(path, f'gp_models_{self._logger.current_epoch}.pkl')) + torch.save( + train_x, + os.path.join(path, f'gp_models_train_x_{self._logger.current_epoch}.pkl'), + ) + torch.save( + train_y, 
+ os.path.join(path, f'gp_models_train_y_{self._logger.current_epoch}.pkl'), + ) diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py index 06932a307..8351ecf2d 100644 --- a/omnisafe/algorithms/on_policy/__init__.py +++ b/omnisafe/algorithms/on_policy/__init__.py @@ -15,6 +15,7 @@ """On-policy algorithms.""" from omnisafe.algorithms.on_policy import ( + barrier_function, base, early_terminated, first_order, @@ -25,8 +26,8 @@ saute, second_order, simmer, - barrier_function, ) +from omnisafe.algorithms.on_policy.barrier_function import TRPOCBF, PPOBetaCBF from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, TRPOEarlyTerminated from omnisafe.algorithms.on_policy.first_order import CUP, FOCOPS @@ -37,7 +38,6 @@ from omnisafe.algorithms.on_policy.saute import PPOSaute, TRPOSaute from omnisafe.algorithms.on_policy.second_order import CPO, PCPO from omnisafe.algorithms.on_policy.simmer import PPOSimmerPID, TRPOSimmerPID -from omnisafe.algorithms.on_policy.barrier_function import TRPOCBF, PPOBetaCBF __all__ = [ diff --git a/omnisafe/algorithms/on_policy/barrier_function/__init__.py b/omnisafe/algorithms/on_policy/barrier_function/__init__.py index 273ca2831..dacdc3c4d 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/__init__.py +++ b/omnisafe/algorithms/on_policy/barrier_function/__init__.py @@ -14,8 +14,8 @@ # ============================================================================== """Control Barrier Function Safe Reinforcement Learning algorithms.""" -from omnisafe.algorithms.on_policy.barrier_function.trpo_cbf import TRPOCBF from omnisafe.algorithms.on_policy.barrier_function.ppo_cbf import PPOBetaCBF +from omnisafe.algorithms.on_policy.barrier_function.trpo_cbf import TRPOCBF __all__ = [ diff --git a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py 
b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py index e7711ed3c..24b27d939 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py @@ -26,10 +26,9 @@ @registry.register class PPOBetaCBF(PPO): - + def _init_log(self) -> None: super()._init_log() - self._logger.register_key('Metrics/angle', min_and_max=True) self._logger.register_key('Value/Loss_compensator') def _init_env(self) -> None: @@ -48,10 +47,6 @@ def _init_env(self) -> None: // self._cfgs.train_cfgs.vector_env_nums ) - def _init_log(self) -> None: - super()._init_log() - self._logger.register_key('Metrics/angle', min_and_max=True) - def _loss_pi( self, obs: torch.Tensor, @@ -85,7 +80,6 @@ def _loss_pi( """ distribution = self._actor_critic.actor(obs) logp_ = self._actor_critic.actor.log_prob(act) - std = self._actor_critic.actor.std ratio = torch.exp(logp_ - logp) ratio_cliped = torch.clamp( ratio, @@ -103,4 +97,4 @@ def _loss_pi( 'Loss/Loss_pi': loss.mean().item(), }, ) - return loss \ No newline at end of file + return loss diff --git a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py index 404776d72..3fceec4f7 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py @@ -22,13 +22,14 @@ from omnisafe.adapter.barrier_function_adapter import BarrierFunctionAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.utils import distributed -from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.utils import distributed + @registry.register class TRPOCBF(TRPO): - + def _init_log(self) -> None: super()._init_log() self._logger.register_key('Metrics/angle', min_and_max=True) @@ 
-51,18 +52,26 @@ def _init_env(self) -> None: ) self.solver = PendulumSolver(device=self._cfgs.train_cfgs.device) self.compensator = BarrierCompensator( - obs_dim = self._env.observation_space.shape[0], - act_dim = self._env.action_space.shape[0], - cfgs = self._cfgs.compensator_cfgs, + obs_dim=self._env.observation_space.shape[0], + act_dim=self._env.action_space.shape[0], + cfgs=self._cfgs.compensator_cfgs, ) self._env.set_solver(solver=self.solver) self._env.set_compensator(compensator=self.compensator) - + def _init(self) -> None: super()._init() - self._buf.add_field(name='approx_compensating_act', shape=self._env.action_space.shape, dtype=torch.float32) - self._buf.add_field(name='compensating_act', shape=self._env.action_space.shape, dtype=torch.float32) - + self._buf.add_field( + name='approx_compensating_act', + shape=self._env.action_space.shape, + dtype=torch.float32, + ) + self._buf.add_field( + name='compensating_act', + shape=self._env.action_space.shape, + dtype=torch.float32, + ) + def _update(self) -> None: """Update actor, critic. @@ -77,8 +86,18 @@ def _update(self) -> None: accepted. 
""" data = self._buf.get() - - obs, act, logp, target_value_r, target_value_c, adv_r, adv_c, approx_compensating_act, compensating_act = ( + + ( + obs, + act, + logp, + target_value_r, + target_value_c, + adv_r, + adv_c, + approx_compensating_act, + compensating_act, + ) = ( data['obs'], data['act'], data['logp'], @@ -91,7 +110,11 @@ def _update(self) -> None: ) self._update_actor(obs, act, logp, adv_r, adv_c) - compensator_loss = self._env.compensator.train(observation=obs, approx_compensating_act=approx_compensating_act, compensating_act=compensating_act) + compensator_loss = self._env.compensator.train( + observation=obs, + approx_compensating_act=approx_compensating_act, + compensating_act=compensating_act, + ) dataloader = DataLoader( dataset=TensorDataset(obs, target_value_r, target_value_c), batch_size=self._cfgs.algo_cfgs.batch_size, diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 69f0ce4e9..463b286c8 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py +++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -16,8 +16,6 @@ from __future__ import annotations -import torch - from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient @@ -31,57 +29,3 @@ class PPO(PolicyGradient): - Authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. - URL: `PPO `_ """ - - def _loss_pi( - self, - obs: torch.Tensor, - act: torch.Tensor, - logp: torch.Tensor, - adv: torch.Tensor, - ) -> torch.Tensor: - r"""Computing pi/actor loss. - - In Proximal Policy Optimization, the loss is defined as: - - .. 
math:: - - L^{CLIP} = \underset{s_t \sim \rho_{\theta}}{\mathbb{E}} \left[ - \min ( r_t A^{R}_{\pi_{\theta}} (s_t, a_t) , \text{clip} (r_t, 1 - \epsilon, 1 + \epsilon) - A^{R}_{\pi_{\theta}} (s_t, a_t) - \right] - - where :math:`r_t = \frac{\pi_{\theta}^{'} (a_t|s_t)}{\pi_{\theta} (a_t|s_t)}`, - :math:`\epsilon` is the clip parameter, and :math:`A^{R}_{\pi_{\theta}} (s_t, a_t)` is the - advantage. - - Args: - obs (torch.Tensor): The ``observation`` sampled from buffer. - act (torch.Tensor): The ``action`` sampled from buffer. - logp (torch.Tensor): The ``log probability`` of action sampled from buffer. - adv (torch.Tensor): The ``advantage`` processed. ``reward_advantage`` here. - - Returns: - The loss of pi/actor. - """ - distribution = self._actor_critic.actor(obs) - logp_ = self._actor_critic.actor.log_prob(act) - std = self._actor_critic.actor.std - ratio = torch.exp(logp_ - logp) - ratio_cliped = torch.clamp( - ratio, - 1 - self._cfgs.algo_cfgs.clip, - 1 + self._cfgs.algo_cfgs.clip, - ) - loss = -torch.min(ratio * adv, ratio_cliped * adv).mean() - loss -= self._cfgs.algo_cfgs.entropy_coef * distribution.entropy().mean() - # useful extra info - entropy = distribution.entropy().mean().item() - self._logger.store( - { - 'Train/Entropy': entropy, - 'Train/PolicyRatio': ratio, - 'Train/PolicyStd': std, - 'Loss/Loss_pi': loss.mean().item(), - }, - ) - return loss diff --git a/omnisafe/common/barrier_comp.py b/omnisafe/common/barrier_comp.py index 57d39a8d6..1a27d5863 100644 --- a/omnisafe/common/barrier_comp.py +++ b/omnisafe/common/barrier_comp.py @@ -17,8 +17,10 @@ import torch from torch import optim -from omnisafe.utils.model import build_mlp_network + from omnisafe.utils.config import Config +from omnisafe.utils.model import build_mlp_network + class BarrierCompensator(torch.nn.Module): """A module that represents a barrier compensator using a multi-layer perceptron (MLP) network. 
@@ -39,9 +41,9 @@ class BarrierCompensator(torch.nn.Module): act_dim (int): Dimension of the action space. cfgs (Config): Configuration parameters for the network and training. """ - - def __init__(self, obs_dim: int, act_dim: int, cfgs: Config): - super(BarrierCompensator, self).__init__() + + def __init__(self, obs_dim: int, act_dim: int, cfgs: Config) -> None: + super().__init__() self._cfgs: Config = cfgs self.model: torch.nn.Module = build_mlp_network( sizes=[obs_dim, *self._cfgs.hidden_sizes, act_dim], @@ -49,7 +51,7 @@ def __init__(self, obs_dim: int, act_dim: int, cfgs: Config): weight_initialization_mode=self._cfgs.weight_initialization_mode, ) self.optimizer: optim.Adam = optim.Adam(self.parameters(), lr=self._cfgs.lr) - + def forward(self, obs: torch.Tensor) -> torch.Tensor: """Estimate the sum of previous compensating actions. @@ -61,7 +63,12 @@ def forward(self, obs: torch.Tensor) -> torch.Tensor: """ return self.model(obs) - def train(self, observation: torch.Tensor, approx_compensating_act: torch.Tensor, compensating_act: torch.Tensor) -> torch.Tensor: + def train( + self, + observation: torch.Tensor, + approx_compensating_act: torch.Tensor, + compensating_act: torch.Tensor, + ) -> torch.Tensor: """Train the barrier compensator model. 
This method updates the model parameters to minimize the difference between the model's output and the @@ -79,8 +86,8 @@ def train(self, observation: torch.Tensor, approx_compensating_act: torch.Tensor for _ in range(self._cfgs.update_iters): target = approx_compensating_act + compensating_act self.optimizer.zero_grad() - loss = torch.pow((self(observation)-target), 2).mean() + loss = torch.pow((self(observation) - target), 2).mean() loss.backward() self.optimizer.step() - + return loss diff --git a/omnisafe/common/barrier_solver.py b/omnisafe/common/barrier_solver.py index 1c11ffffb..b00af906e 100644 --- a/omnisafe/common/barrier_solver.py +++ b/omnisafe/common/barrier_solver.py @@ -15,14 +15,17 @@ """Implementation of the Control Barrier Function Solver.""" from __future__ import annotations + import warnings -warnings.filterwarnings("ignore") + +import joblib import numpy as np import torch -from cvxopt import matrix -from cvxopt import solvers +from cvxopt import matrix, solvers from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C +from sklearn.gaussian_process.kernels import RBF +from sklearn.gaussian_process.kernels import ConstantKernel as C + class PendulumSolver: """Solver for the pendulum problem using Gaussian Process models. @@ -35,9 +38,14 @@ class PendulumSolver: device (str): Device to run the computations on. """ - def __init__(self, action_size: int = 1, observation_size: int = 3, - torque_bound: float = 15., max_speed: float = 60., - device: str = 'cpu') -> None: + def __init__( + self, + action_size: int = 1, + observation_size: int = 3, + torque_bound: float = 15.0, + max_speed: float = 60.0, + device: str = 'cpu', + ) -> None: """Initializes the PendulumSolver with specified parameters. 
Args: @@ -56,29 +64,45 @@ def __init__(self, action_size: int = 1, observation_size: int = 3, self._gamma_b = 0.5 self._kd = 1.5 self._build_barrier() - self.build_GP_model() - self.GP_model_prev = None + self.build_gp_model() + self.gp_model_prev = None + warnings.filterwarnings('ignore') - def build_GP_model(self) -> None: + def build_gp_model(self, save_dir: str | None = None) -> None: """Builds the Gaussian Process model.""" gp_list = [] noise = 0.01 for _ in range(self.observation_size - 1): - kern = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) - gp = GaussianProcessRegressor(kernel=kern, alpha=noise, n_restarts_optimizer=10) - gp_list.append(gp) - self.GP_model = gp_list + if not save_dir: + kern = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) + gp = GaussianProcessRegressor(kernel=kern, alpha=noise, n_restarts_optimizer=10) + gp_list.append(gp) + else: + gp_list = joblib.load(save_dir) + self.gp_model = gp_list + + @property + def gp_models(self) -> list[GaussianProcessRegressor]: + """Return all gaussian process regressor for saving.""" + return self.gp_model def _build_barrier(self) -> None: """Builds the barrier for the pendulum solver.""" - self.P = matrix(np.diag([1., 1e16]), tc='d') + self.P = matrix(np.diag([1.0, 1e16]), tc='d') self.q = matrix(np.zeros(self.action_size + 1)) self.h1 = np.array([1, 0.01]) self.h2 = np.array([1, -0.01]) self.h3 = np.array([-1, 0.01]) self.h4 = np.array([-1, -0.01]) - def control_barrier(self, original_action: torch.Tensor, f: np.ndarray, g: np.ndarray, x: np.ndarray, std: np.ndarray) -> torch.Tensor: + def control_barrier( + self, + original_action: torch.Tensor, + f: np.ndarray, + g: np.ndarray, + x: np.ndarray, + std: np.ndarray, + ) -> torch.Tensor: """ Adjusts the original action using a control barrier function to ensure that the action complies with the system's physical constraints. 
@@ -97,49 +121,64 @@ def control_barrier(self, original_action: torch.Tensor, f: np.ndarray, g: np.nd # Define gamma for the barrier function gamma_b = 0.5 kd = 1.5 - u_rl = original_action.detach().numpy() - # u_rl*=self.torque_bound + u_rl = original_action.cpu().detach().numpy() # Set up Quadratic Program to satisfy Control Barrier Function G = np.array( [ [ - -np.dot(self.h1, g), - -np.dot(self.h2, g), - -np.dot(self.h3, g), - -np.dot(self.h4, g), + -np.dot(self.h1, g), + -np.dot(self.h2, g), + -np.dot(self.h3, g), + -np.dot(self.h4, g), 1, - -1, - g[1], - -g[1] - ], + -1, + g[1], + -g[1], + ], [ - -1, - -1, - -1, - -1, - 0, - 0, - 0, - 0 - ] - ] + -1, + -1, + -1, + -1, + 0, + 0, + 0, + 0, + ], + ], ) G = np.transpose(G) h = np.array( [ - gamma_b * self.F + np.dot(self.h1, f) + np.dot(self.h1, g) * u_rl - (1 - gamma_b) * np.dot(self.h1, x) - kd * np.abs(np.dot(self.h1, std)), - gamma_b * self.F + np.dot(self.h2, f) + np.dot(self.h2, g) * u_rl - (1 - gamma_b) * np.dot(self.h2, x) - kd * np.abs(np.dot(self.h2, std)), - gamma_b * self.F + np.dot(self.h3, f) + np.dot(self.h3, g) * u_rl - (1 - gamma_b) * np.dot(self.h3, x) - kd * np.abs(np.dot(self.h3, std)), - gamma_b * self.F + np.dot(self.h4, f) + np.dot(self.h4, g) * u_rl - (1 - gamma_b) * np.dot(self.h4, x) - kd * np.abs(np.dot(self.h4, std)), - -u_rl + self.torque_bound, - u_rl + self.torque_bound, - -f[1] - g[1] * u_rl + self.max_speed, - f[1] + g[1] * u_rl + self.max_speed - ] + gamma_b * self.F + + np.dot(self.h1, f) + + np.dot(self.h1, g) * u_rl + - (1 - gamma_b) * np.dot(self.h1, x) + - kd * np.abs(np.dot(self.h1, std)), + gamma_b * self.F + + np.dot(self.h2, f) + + np.dot(self.h2, g) * u_rl + - (1 - gamma_b) * np.dot(self.h2, x) + - kd * np.abs(np.dot(self.h2, std)), + gamma_b * self.F + + np.dot(self.h3, f) + + np.dot(self.h3, g) * u_rl + - (1 - gamma_b) * np.dot(self.h3, x) + - kd * np.abs(np.dot(self.h3, std)), + gamma_b * self.F + + np.dot(self.h4, f) + + np.dot(self.h4, g) * u_rl + - (1 - gamma_b) * 
np.dot(self.h4, x) + - kd * np.abs(np.dot(self.h4, std)), + -u_rl + self.torque_bound, + u_rl + self.torque_bound, + -f[1] - g[1] * u_rl + self.max_speed, + f[1] + g[1] * u_rl + self.max_speed, + ], ) h = np.squeeze(h).astype(np.double) - + # Convert numpy arrays to cvx matrices to set up QP G = matrix(G, tc='d') h = matrix(h, tc='d') @@ -150,10 +189,10 @@ def control_barrier(self, original_action: torch.Tensor, f: np.ndarray, g: np.nd # Check if the adjusted action is within bounds if np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) - 0.001 >= self.torque_bound: u_bar[0] = self.torque_bound - u_rl - print("Error in QP") + print('Error in QP') elif np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) + 0.001 <= -self.torque_bound: u_bar[0] = -self.torque_bound - u_rl - print("Error in QP") + print('Error in QP') return torch.as_tensor(u_bar[0], dtype=torch.float32, device=self._device).unsqueeze(dim=0) @@ -173,18 +212,27 @@ def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: dt = 0.05 # Time step G = 10 # Gravitational constant m = 2 # Mass - l = 2 # Length + length = 2 # Length theta = np.arctan2(obs[1], obs[0]) # Calculate the angle theta_dot = obs[2] # Angular velocity # Dynamics equations - f = np.array([-3 * G / (2 * l) * np.sin(theta + np.pi) * dt**2 + theta_dot * dt + theta + 3 / (m * l**2) * original_action * dt**2, - theta_dot - 3 * G / (2 * l) * np.sin(theta + np.pi) * dt + 3 / (m * l**2) * original_action * dt]) + f = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * original_action * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * original_action * dt, + ], + ) return np.squeeze(f) - def update_GP_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: + def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: """ Updates the Gaussian Process (GP) dynamics model based on observed states and 
actions. @@ -192,60 +240,70 @@ def update_GP_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: obs (np.ndarray): Observed states. act (np.ndarray): Actions taken. """ - obs=obs.detach().cpu().squeeze().numpy() - act=act.detach().cpu().squeeze().numpy() + obs = obs.detach().cpu().squeeze().numpy() + act = act.detach().cpu().squeeze().numpy() N = self.observation_size X = obs U = act L = len(X) - err = np.zeros((L-1, N-1)) - S = np.zeros((L-1, 2)) - for i in range(L-1): + err = np.zeros((L - 1, N - 1)) + S = np.zeros((L - 1, 2)) + for i in range(L - 1): f = self.get_dynamics(X[i], U[i]) theta_p = np.arctan2(X[i][1], X[i][0]) theta_dot_p = X[i][2] - theta = np.arctan2(X[i+1][1], X[i+1][0]) - theta_dot = X[i+1][2] + theta = np.arctan2(X[i + 1][1], X[i + 1][0]) + theta_dot = X[i + 1][2] S[i, :] = np.array([theta_p, theta_dot_p]) err[i, :] = np.array([theta, theta_dot]) - f - self.GP_model[0].fit(S, err[:, 0]) - self.GP_model[1].fit(S, err[:, 1]) + self.gp_model[0].fit(S, err[:, 0]) + self.gp_model[1].fit(S, err[:, 1]) - def get_GP_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: + def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: """ - Retrieves the GP dynamics based on the current observation. + Retrieves the gp dynamics based on the current observation. Args: obs (torch.Tensor): Current state observation. Returns: - list[np.ndarray]: list containing the GP dynamics [f, g, x, std]. + list[np.ndarray]: list containing the gp dynamics [f, g, x, std]. 
""" obs = obs.cpu().detach().numpy() u_rl = 0 dt = 0.05 G = 10 m = 1 - l = 1 + length = 1 obs = np.squeeze(obs) theta = np.arctan2(obs[1], obs[0]) theta_dot = obs[2] - x = np.array([theta, theta_dot]) # 这个x估计就对应state + x = np.array([theta, theta_dot]) # 这个x估计就对应state f_nom = np.array( [ - -3*G/(2*l)*np.sin(theta + np.pi)*dt**2 + theta_dot*dt + theta + 3/(m*l**2)*u_rl*dt**2, - theta_dot - 3*G/(2*l)*np.sin(theta + np.pi)*dt + 3/(m*l**2)*u_rl*dt - ] + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * u_rl * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * u_rl * dt, + ], ) - g = np.array([3/(m*l**2)*dt**2, 3/(m*l**2)*dt]) + g = np.array([3 / (m * length**2) * dt**2, 3 / (m * length**2) * dt]) f_nom = np.squeeze(f_nom) f = np.zeros(2) if use_prev_model: - [m1, std1] = self.GP_model_prev[0].predict(x.reshape(1,-1), return_std=True) - [m2, std2] = self.GP_model_prev[1].predict(x.reshape(1,-1), return_std=True) + [m1, std1] = self.gp_model_prev[0].predict(x.reshape(1, -1), return_std=True) + [m2, std2] = self.gp_model_prev[1].predict(x.reshape(1, -1), return_std=True) else: - [m1, std1] = self.GP_model[0].predict(x.reshape(1, -1), return_std=True) - [m2, std2] = self.GP_model[1].predict(x.reshape(1, -1), return_std=True) + [m1, std1] = self.gp_model[0].predict(x.reshape(1, -1), return_std=True) + [m2, std2] = self.gp_model[1].predict(x.reshape(1, -1), return_std=True) f[0] = f_nom[0] + m1 f[1] = f_nom[1] + m2 - return [np.squeeze(f), np.squeeze(g), np.squeeze(x), np.array([np.squeeze(std1), np.squeeze(std2)])] + return [ + np.squeeze(f), + np.squeeze(g), + np.squeeze(x), + np.array([np.squeeze(std1), np.squeeze(std2)]), + ] diff --git a/omnisafe/common/buffer/vector_onpolicy_buffer.py b/omnisafe/common/buffer/vector_onpolicy_buffer.py index a8e2c25a8..3ebd61c87 100644 --- a/omnisafe/common/buffer/vector_onpolicy_buffer.py +++ 
b/omnisafe/common/buffer/vector_onpolicy_buffer.py @@ -87,7 +87,7 @@ def __init__( # pylint: disable=super-init-not-called,too-many-arguments ) for _ in range(num_envs) ] - + def add_field(self, name: str, shape: tuple[int, ...], dtype: torch.dtype) -> None: """Add a field to the buffer. diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py index 80d8d33b6..639ae8d3a 100644 --- a/omnisafe/common/robust_barrier_solver.py +++ b/omnisafe/common/robust_barrier_solver.py @@ -1,145 +1,152 @@ +from __future__ import annotations + +from typing import Any + +import gymnasium as gym import numpy as np import torch -from cvxopt import matrix -from cvxopt import solvers -from omnisafe.common.utils import to_tensor, prRed, sort_vertices_cclockwise from qpth.qp import QPFunction -DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}, # state = [x y θ] - 'SimulatedCars': {'n_s': 10, 'n_u': 1}, # state = [x y θ v ω] - 'Pvtol': {'n_s': 6, 'n_u': 2}, # state = [x y θ v_x v_y thrust] - 'Pendulum-v1': {'n_s': 3, 'n_u': 1} - } +from omnisafe.common.utils import sort_vertices_cclockwise, to_tensor + + +DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}} class CBFQPLayer: - def __init__(self, env, device='cpu', gamma_b=20, k_d=3.0, l_p=0.03): - """Constructor of CBFLayer. - - Parameters - ---------- - env : gym.env - Gym environment. - gamma_b : float, optional - gamma of control barrier certificate. - k_d : float, optional - confidence parameter desired (2.0 corresponds to ~95% for example). + def __init__( + self, + env: gym.Env, + device: str = 'cpu', + gamma_b: float = 20, + k_d: float = 3.0, + l_p: float = 0.03, + ) -> None: + """Initializes a CBFLayer instance with specified parameters and environment. + + Args: + env (gym.Env): The Gym environment to interact with. + device (str, optional): The device type, such as 'cpu' or 'gpu'. Defaults to 'cpu'. + gamma_b (float, optional): The gamma parameter of the control barrier certificate. 
Defaults to 20. + k_d (float, optional): The confidence parameter desired (e.g., 2.0 corresponds to ~95% confidence). Defaults to 3.0. + l_p (float, optional): Some additional layer parameter, purpose unspecified. Defaults to 0.03. """ - self.device = torch.device(device) - self.env = env self.u_min, self.u_max = self.get_control_bounds() self.gamma_b = gamma_b - self.k_d = k_d self.l_p = l_p - self.action_dim = env.action_space.shape[0] - def get_safe_action(self, state_batch, action_batch, mean_pred_batch, sigma_batch, modular=False, cbf_info_batch=None): # TODO: 迁移的核心在于此,把它用CBF的方法来改写就好 + def get_safe_action( + self, + state_batch: torch.Tensor, + action_batch: torch.Tensor, + mean_pred_batch: torch.Tensor, + sigma_batch: torch.Tensor, + ) -> torch.Tensor: + """Computes safe actions based on current state and action predictions, adjusting for uncertainties. + + Args: + state_batch (torch.Tensor): Current state batch, tensor or ndarray. + action_batch (torch.Tensor): Nominal action batch, tensor or ndarray. + mean_pred_batch (torch.Tensor): Mean disturbance predictions, tensor or ndarray. + sigma_batch (torch.Tensor): Standard deviations of disturbances, tensor or ndarray. + cbf_info_batch (torch.Tensor, optional): Additional control barrier function information batch, tensor or ndarray. + + Returns: + torch.Tensor: Safe actions adjusted for given constraints and uncertainties. """ - - Parameters - ---------- - state_batch : torch.tensor or ndarray - action_batch : torch.tensor or ndarray - State batch - mean_pred_batch : torch.tensor or ndarray - Mean of disturbance - sigma_batch : torch.tensor or ndarray - Standard deviation of disturbance - - Returns - ------- - final_action_batch : torch.tensor - Safe actions to take in the environment. 
- """ - - # batch form if only a single data point is passed + # Batch form adjustment if only a single data point is passed expand_dims = len(state_batch.shape) == 1 if expand_dims: - action_batch = action_batch.unsqueeze(0) state_batch = state_batch.unsqueeze(0) + action_batch = action_batch.unsqueeze(0) mean_pred_batch = mean_pred_batch.unsqueeze(0) sigma_batch = sigma_batch.unsqueeze(0) - if cbf_info_batch is not None: - cbf_info_batch = cbf_info_batch.unsqueeze(0) - - if modular: - final_action = torch.clamp(action_batch, self.u_min.repeat(action_batch.shape[0], 1), self.u_max.repeat(action_batch.shape[0], 1)) - else: - Ps, qs, Gs, hs = self.get_cbf_qp_constraints(state_batch, action_batch, mean_pred_batch, sigma_batch, modular=modular, cbf_info_batch=cbf_info_batch) - - Ps, qs, Gs, hs = Ps.detach().cpu().numpy(), qs.detach().cpu().numpy(), Gs.detach().cpu().numpy(), hs.detach().cpu().numpy() - batch_size = Ps.shape[0] - safe_actions = [] - for i in range(batch_size): - Ps_m = matrix(np.diag([1., 1e16]), tc='d') - qs_m = matrix(np.zeros(2)) - Gs_m = matrix(np.float64(Gs[i]), tc='d') - hs_m = matrix(np.float64(hs[i]), tc='d') - solvers.options['show_progress'] = False - sol = solvers.qp(Ps_m, qs_m, Gs_m, hs_m) - safe_action=torch.as_tensor(sol['x'][0], dtype=torch.float32) - safe_actions.append(safe_action) - safe_action_batch = torch.as_tensor(safe_actions, dtype=torch.float32, device=self.device).unsqueeze(-1) - - # print(action_batch.shape, safe_action_batch.shape) - # safe_action_batch = self.solve_qp(Ps, qs, Gs, hs) - final_action = torch.clamp(action_batch + safe_action_batch, self.u_min.repeat(action_batch.shape[0], 1), self.u_max.repeat(action_batch.shape[0], 1)) - - return final_action if not expand_dims else final_action.squeeze(0) - - def solve_qp(self, Ps: torch.Tensor, qs: torch.Tensor, Gs: torch.Tensor, hs: torch.Tensor): - """Solves: + + Ps, qs, Gs, hs = self.get_cbf_qp_constraints( + state_batch, + action_batch, + mean_pred_batch, + sigma_batch, 
+ ) + safe_action_batch = self.solve_qp(Ps, qs, Gs, hs) + final_action_batch = torch.clamp( + action_batch + safe_action_batch, + self.u_min.repeat(action_batch.shape[0], 1), + self.u_max.repeat(action_batch.shape[0], 1), + ) + + return final_action_batch if not expand_dims else final_action_batch.squeeze(0) + + def solve_qp( + self, + Ps: torch.Tensor, + qs: torch.Tensor, + Gs: torch.Tensor, + hs: torch.Tensor, + ) -> torch.Tensor: + """Solves a batch of quadratic programming (QP) problems. + + Each QP problem is defined as: minimize_{u,eps} 0.5 * u^T P u + q^T u - subject to G[u,eps]^T <= h - - Parameters - ---------- - Ps : torch.Tensor - (batch_size, n_u+1, n_u+1) - qs : torch.Tensor - (batch_size, n_u+1) - Gs : torch.Tensor - (batch_size, num_ineq_constraints, n_u+1) - hs : torch.Tensor - (batch_size, num_ineq_constraints) - Returns - ------- - safe_action_batch : torch.tensor - The solution of the qp without the last dimension (the slack). + subject to G[u,eps]^T <= h + + Args: + Ps (torch.Tensor): Quadratic cost matrix for each problem, with shape (batch_size, n_u+1, n_u+1). + qs (torch.Tensor): Linear cost vector for each problem, with shape (batch_size, n_u+1). + Gs (torch.Tensor): Inequality constraint matrix for each problem, with shape (batch_size, num_ineq_constraints, n_u+1). + hs (torch.Tensor): Inequality constraint vector for each problem, with shape (batch_size, num_ineq_constraints). + + Returns: + The safe action for each problem, omitting the slack variable, with dimension (batch_size, n_u). 
""" Ghs = torch.cat((Gs, hs.unsqueeze(2)), -1) Ghs_norm = torch.max(torch.abs(Ghs), dim=2, keepdim=True)[0] Gs /= Ghs_norm hs = hs / Ghs_norm.squeeze(-1) - sol = self.cbf_layer(Ps, qs, Gs, hs, solver_args={"check_Q_spd": False, "maxIter": 100000, "notImprovedLim": 10, "eps": 1e-4}) - safe_action_batch = sol[:, :self.env.action_space.shape[0]] - return safe_action_batch - - def cbf_layer(self, Qs, ps, Gs, hs, As=None, bs=None, solver_args=None): - """ - - Parameters - ---------- - Qs : torch.Tensor - ps : torch.Tensor - Gs : torch.Tensor - shape (batch_size, num_ineq_constraints, num_vars) - hs : torch.Tensor - shape (batch_size, num_ineq_constraints) - As : torch.Tensor, optional - bs : torch.Tensor, optional - solver_args : dict, optional - - Returns - ------- - result : torch.Tensor - Result of QP + sol = self.cbf_layer( + Ps, + qs, + Gs, + hs, + solver_args={ + 'check_Q_spd': False, + 'maxIter': 100000, + 'notImprovedLim': 10, + 'eps': 1e-4, + }, + ) + + return sol[:, : self.env.action_space.shape[0]] + + def cbf_layer( + self, + Qs: torch.Tensor, + ps: torch.Tensor, + Gs: torch.Tensor, + hs: torch.Tensor, + As: torch.Tensor | None = None, + bs: torch.Tensor | None = None, + solver_args: dict[str, Any] | None = None, + ) -> torch.Tensor: + """Applies a custom layer to solve QP problems using given constraints. + + Args: + Qs (torch.Tensor): Quadratic cost matrix for each problem. + ps (torch.Tensor): Linear cost vector for each problem. + Gs (torch.Tensor): Inequality constraint matrix for each problem, shape (batch_size, num_ineq_constraints, num_vars). + hs (torch.Tensor): Inequality constraint vector for each problem, shape (batch_size, num_ineq_constraints). + As (torch.Tensor, optional): Equality constraint matrix. Defaults to None. + bs (torch.Tensor, optional): Equality constraint vector. Defaults to None. + solver_args (dict, optional): Dictionary of solver arguments. Defaults to None. + + Returns: + Result of the QP solver for each problem. 
""" if solver_args is None: @@ -149,57 +156,54 @@ def cbf_layer(self, Qs, ps, Gs, hs, As=None, bs=None, solver_args=None): As = torch.Tensor().to(self.device).double() bs = torch.Tensor().to(self.device).double() - result = QPFunction(verbose=-1, **solver_args)(Qs.double(), ps.double(), Gs.double(), hs.double(), As, bs).float() - if torch.any(torch.isnan(result)): - prRed('QP Failed to solve - result is nan == {}!'.format(torch.any(torch.isnan(result)))) - raise Exception('QP Failed to solve') - return result - - def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sigma_pred_batch, modular=False, cbf_info_batch=None): # TODO: 解耦合的核心在这里 - """Build up matrices required to solve qp - - Program specifically solves: + return QPFunction(verbose=-1, **solver_args)( + Qs.double(), + ps.double(), + Gs.double(), + hs.double(), + As, + bs, + ).float() + + def get_cbf_qp_constraints( + self, + state_batch: torch.Tensor, + action_batch: torch.Tensor, + mean_pred_batch: torch.Tensor, + sigma_pred_batch: torch.Tensor, + gamma_b: float = 1.0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Builds up matrices required to solve a quadratic program (QP). + + The QP is defined to solve: minimize_{u,eps} 0.5 * u^T P u + q^T u - subject to G[u,eps]^T <= h - - Each control barrier certificate is of the form: - dh/dx^T (f_out + g_out u) >= -gamma^b h_out^3 where out here is an output of the state. - - In the case of SafetyGym_point dynamics: - state = [x y θ v ω] - state_d = [v*cos(θ) v*sin(θ) omega ω u^v u^ω] - - Quick Note on batch matrix multiplication for matrices A and B: - - Batch size should be first dim - - Everything needs to be 3-dimensional - - E.g. if B is a vec, i.e. shape (batch_size, vec_length) --> .view(batch_size, vec_length, 1) - - Parameters - ---------- - state_batch : torch.tensor - current state (check dynamics.py for details on each dynamics' specifics) - action_batch : torch.tensor - Nominal control input. 
- mean_pred_batch : torch.tensor - mean disturbance prediction state, dimensions (n_s, n_u) - sigma_pred_batch : torch.tensor - standard deviation in additive disturbance after undergoing the output dynamics. - gamma_b : float, optional - CBF parameter for the class-Kappa function - - Returns - ------- - P : torch.tensor - Quadratic cost matrix in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) - q : torch.tensor - Linear cost vector in qp (minimize_{u,eps} 0.5 * u^T P u + q^T u) - G : torch.tensor - Inequality constraint matrix (G[u,eps] <= h) of size (num_constraints, n_u + 1) - h : torch.tensor - Inequality constraint vector (G[u,eps] <= h) of size (num_constraints,) + subject to G[u,eps]^T <= h + + Args: + state_batch (torch.Tensor): Current state batch. Refer to `dynamics.py` for specifics on each dynamic. + action_batch (torch.Tensor): Nominal control input batch. + mean_pred_batch (torch.Tensor): Mean disturbance prediction state batch, dimensions (n_s, n_u). + sigma_pred_batch (torch.Tensor): Standard deviation of the additive disturbance after undergoing the output dynamics. + gamma_b (float, optional): CBF parameter for the class-Kappa function. Defaults to 1.0. + + Returns: + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing: + P (torch.Tensor): Quadratic cost matrix in the QP. + q (torch.Tensor): Linear cost vector in the QP. + G (torch.Tensor): Inequality constraint matrix for QP constraints. + h (torch.Tensor): Inequality constraint vector for QP constraints. 
""" - - assert len(state_batch.shape) == 2 and len(action_batch.shape) == 2 and len(mean_pred_batch.shape) == 2 and len(sigma_pred_batch.shape) == 2, print(state_batch.shape, action_batch.shape, mean_pred_batch.shape, sigma_pred_batch.shape) + assert ( + len(state_batch.shape) == 2 + and len(action_batch.shape) == 2 + and len(mean_pred_batch.shape) == 2 + and len(sigma_pred_batch.shape) == 2 + ), print( + state_batch.shape, + action_batch.shape, + mean_pred_batch.shape, + sigma_pred_batch.shape, + ) batch_size = state_batch.shape[0] gamma_b = self.gamma_b @@ -209,76 +213,7 @@ def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sig action_batch = torch.unsqueeze(action_batch, -1).to(self.device) mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1).to(self.device) sigma_pred_batch = torch.unsqueeze(sigma_pred_batch, -1).to(self.device) - - if self.env.dynamics_mode == 'Pendulum': - num_constraints = 8 - n_u = action_batch.shape[1] # dimension of control inputs - # Inequality constraints (G[u, eps] <= h) - G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make sure qp is always feasible) - h = torch.zeros((batch_size, num_constraints)).to(self.device) - - h1 = torch.FloatTensor([1, 0.01]).unsqueeze(-1).to(self.device) - h2 = torch.FloatTensor([1, -0.01]).unsqueeze(-1).to(self.device) - h3 = torch.FloatTensor([-1, 0.01]).unsqueeze(-1).to(self.device) - h4 = torch.FloatTensor([-1, -0.01]).unsqueeze(-1).to(self.device) - action_batch_scaled=(action_batch*15.0).squeeze(-1).to(self.device) # TODO: 写的好看点 - - theta = state_batch[:,0,:].squeeze(-1) - theta_dot = state_batch[:,1,:].squeeze(-1) - f_norm = torch.zeros(batch_size, 2).to(self.device) - # theta [batch_size, 1] - f_norm[:, 0] = -3*10/2*torch.sin(theta+torch.pi)*self.env.dt + theta - f_norm[: ,1] = theta_dot - 3*10/2*torch.sin(theta+torch.pi) - - g = torch.tensor([3*self.env.dt**2, 3*self.env.dt]).unsqueeze(0).to(self.device) - - 
f = torch.zeros_like(f_norm).to(self.device) - f[:, 0] = f_norm[:, 0] + mean_pred_batch[:,0,:].squeeze(-1) - f[:, 1] = f_norm[:, 1] + mean_pred_batch[:,1,:].squeeze(-1) - G = torch.tensor( - [ - [ - -torch.matmul(g, h1), - -torch.matmul(g, h2), - -torch.matmul(g, h3), - -torch.matmul(g, h4), - 1, - -1, - g[:, 1], - -g[:, 1] - ], - [ - -1, - -1, - -1, - -1, - 0, - 0, - 0, - 0 - ] - ] - ).transpose(0, 1).repeat(batch_size, 1, 1).to(self.device) - state_batch_squeeze = state_batch.squeeze(-1) - sigma_pred_batch_squeeze = sigma_pred_batch.squeeze(-1) - - h = torch.cat( - [ - self.gamma_b + torch.matmul(f, h1) + torch.matmul(g, h1) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h1) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h1)), - self.gamma_b + torch.matmul(f, h2) + torch.matmul(g, h2) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h2) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h2)), - self.gamma_b + torch.matmul(f, h3) + torch.matmul(g, h3) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h3) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h3)), - self.gamma_b + torch.matmul(f, h4) + torch.matmul(g, h4) * action_batch_scaled - (1 - self.gamma_b) * torch.matmul(state_batch_squeeze, h4) - self.k_d * torch.abs(torch.matmul(sigma_pred_batch_squeeze, h4)), - -action_batch_scaled + 15.0, - action_batch_scaled + 15.0, - -f[:, 1].unsqueeze(-1) - g[:, 1] * action_batch_scaled + 60.0, - f[:, 1].unsqueeze(-1) + g[:, 1] * action_batch_scaled + 60.0 - ], - dim=1 - ).to(self.device) - P = torch.diag(torch.tensor([1.e0, 1e16])).repeat(batch_size, 1, 1).to(self.device) - q = torch.zeros((batch_size, self.action_dim + 1)).to(self.device) - - elif self.env.dynamics_mode == 'Unicycle': + if self.env.dynamics_mode == 'Unicycle': num_cbfs = len(self.env.hazards) l_p = self.l_p @@ -287,18 +222,10 @@ def get_cbf_qp_constraints(self, state_batch, 
action_batch, mean_pred_batch, sig thetas = state_batch[:, 2, :].squeeze(-1) c_thetas = torch.cos(thetas) s_thetas = torch.sin(thetas) - - # p(x): lookahead output (batch_size, 2) ps = torch.zeros((batch_size, 2)).to(self.device) ps[:, 0] = state_batch[:, 0, :].squeeze(-1) + l_p * c_thetas ps[:, 1] = state_batch[:, 1, :].squeeze(-1) + l_p * s_thetas - - # p_dot(x) = f_p(x) + g_p(x)u + D_p where f_p(x) = 0, g_p(x) = RL and D_p is the disturbance - - # f_p(x) = [0,...,0]^T f_ps = torch.zeros((batch_size, 2, 1)).to(self.device) - - # g_p(x) = RL where L = diag([1, l_p]) Rs = torch.zeros((batch_size, 2, 2)).to(self.device) Rs[:, 0, 0] = c_thetas Rs[:, 0, 1] = -s_thetas @@ -307,9 +234,7 @@ def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sig Ls = torch.zeros((batch_size, 2, 2)).to(self.device) Ls[:, 0, 0] = 1 Ls[:, 1, 1] = l_p - g_ps = torch.bmm(Rs, Ls) # (batch_size, 2, 2) - - # D_p(x) = g_p [0 D_θ]^T + [D_x1 D_x2]^T + g_ps = torch.bmm(Rs, Ls) mu_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device) mu_theta_aug[:, 1, :] = mean_pred_batch[:, 2, :] mu_ps = torch.bmm(g_ps, mu_theta_aug) + mean_pred_batch[:, :2, :] @@ -318,42 +243,45 @@ def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sig sigma_ps = torch.bmm(torch.abs(g_ps), sigma_theta_aug) + sigma_pred_batch[:, :2, :] # Build RCBFs - hs = 1e3 * torch.ones((batch_size, num_cbfs), device=self.device) # the RCBF itself + hs = 1e3 * torch.ones((batch_size, num_cbfs), device=self.device) dhdps = torch.zeros((batch_size, num_cbfs, 2), device=self.device) hazards = self.env.hazards for i in range(len(hazards)): - if hazards[i]['type'] == 'circle': # 1/2 * (||ps - x_obs||^2 - r^2) + if hazards[i]['type'] == 'circle': obs_loc = to_tensor(hazards[i]['location'], torch.FloatTensor, self.device) - hs[:, i] = 0.5 * (torch.sum((ps - obs_loc)**2, dim=1) - (hazards[i]['radius'] + buffer)**2) - dhdps[:, i, :] = (ps - obs_loc) - elif hazards[i]['type'] == 'polygon': # 
max_j(h_j) where h_j = 1/2 * (dist2seg_j)^2 - vertices = sort_vertices_cclockwise(hazards[i]['vertices']) # (n_v, 2) - segments = np.diff(vertices, axis=0, - append=vertices[[0]]) # (n_v, 2) at row i contains vector from v_i to v_i+1 + hs[:, i] = 0.5 * ( + torch.sum((ps - obs_loc) ** 2, dim=1) - (hazards[i]['radius'] + buffer) ** 2 + ) + dhdps[:, i, :] = ps - obs_loc + elif hazards[i]['type'] == 'polygon': + vertices = sort_vertices_cclockwise(hazards[i]['vertices']) + segments = np.diff(vertices, axis=0, append=vertices[[0]]) segments = to_tensor(segments, torch.FloatTensor, self.device) vertices = to_tensor(vertices, torch.FloatTensor, self.device) - # Get max RBCF TODO: Can be optimized for j in range(segments.shape[0]): - # Compute Distances to segment - dot_products = torch.matmul(ps - vertices[j:j + 1], segments[j]) / torch.sum( - segments[j] ** 2) # (batch_size,) - mask0_ = dot_products < 0 # if <0 closest point on segment is vertex j - mask1_ = dot_products > 1 # if >0 closest point on segment is vertex j+1 - mask_ = torch.logical_and(dot_products >= 0, - dot_products <= 1) # Else find distance to line l_{v_j, v_j+1} - # Compute Distances - dists2seg = torch.zeros((batch_size)) + dot_products = torch.matmul( + ps - vertices[j : j + 1], + segments[j], + ) / torch.sum(segments[j] ** 2) + mask0_ = dot_products < 0 + mask1_ = dot_products > 1 + mask_ = torch.logical_and(dot_products >= 0, dot_products <= 1) + dists2seg = torch.zeros(batch_size) if mask0_.sum() > 0: dists2seg[mask0_] = torch.linalg.norm(ps[mask0_] - vertices[[j]], dim=1) if mask1_.sum() > 0: - dists2seg[mask1_] = torch.linalg.norm(ps[mask1_] - vertices[[(j + 1) % segments.shape[0]]], dim=1) + dists2seg[mask1_] = torch.linalg.norm( + ps[mask1_] - vertices[[(j + 1) % segments.shape[0]]], + dim=1, + ) if mask_.sum() > 0: dists2seg[mask_] = torch.linalg.norm( - dot_products[mask_, None] * segments[j].tile((torch.sum(mask_), 1)) + vertices[[j]] - - ps[mask_], dim=1) - # Compute hs_ for this segment - 
hs_ = 0.5 * ((dists2seg ** 2) + 0.5*buffer) # (batch_size,) - # Compute dhdps TODO: Can be optimized to only compute for indices that need updating + dot_products[mask_, None] * segments[j].tile((torch.sum(mask_), 1)) + + vertices[[j]] + - ps[mask_], + dim=1, + ) + hs_ = 0.5 * ((dists2seg**2) + 0.5 * buffer) dhdps_ = torch.zeros((batch_size, 2)) if mask0_.sum() > 0: dhdps_[mask0_] = ps[mask0_] - vertices[[j]] @@ -362,8 +290,9 @@ def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sig if mask_.sum() > 0: normal_vec = torch.tensor([segments[j][1], -segments[j][0]]) normal_vec /= torch.linalg.norm(normal_vec) - dhdps_[mask_] = (ps[mask_]-vertices[j]).matmul(normal_vec) * normal_vec.view((1,2)).repeat(torch.sum(mask_), 1) # dot products (batch_size, 1) - # Find indices to update (closest segment basically, worst case -> CBF boolean and is a min) + dhdps_[mask_] = (ps[mask_] - vertices[j]).matmul( + normal_vec, + ) * normal_vec.view((1, 2)).repeat(torch.sum(mask_), 1) idxs_to_update = torch.nonzero(hs[:, i] - hs_ > 0) # Update the actual hs to be used in the constraints if idxs_to_update.shape[0] > 0: @@ -371,38 +300,43 @@ def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sig # Compute dhdhps for those indices dhdps[idxs_to_update, i, :] = dhdps_[idxs_to_update, :] else: - raise Exception('Only obstacles of type `circle` or `polygon` are supported, got: {}'.format(hazards[i]['type'])) + raise Exception( + 'Only obstacles of type `circle` or `polygon` are supported, got: {}'.format( + hazards[i]['type'], + ), + ) - n_u = action_batch.shape[1] # dimension of control inputs - num_constraints = num_cbfs + 2 * n_u # each cbf is a constraint, and we need to add actuator constraints (n_u of them) + n_u = action_batch.shape[1] + num_constraints = num_cbfs + 2 * n_u - # Inequality constraints (G[u, eps] <= h) - G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) # the extra variable is for epsilon (to make 
sure qp is always feasible) + G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) h = torch.zeros((batch_size, num_constraints)).to(self.device) ineq_constraint_counter = 0 - # Add inequality constraints - G[:, :num_cbfs, :n_u] = -torch.bmm(dhdps, g_ps) # h1^Tg(x) - G[:, :num_cbfs, n_u] = -1 # for slack - h[:, :num_cbfs] = gamma_b * (hs ** 3) + (torch.bmm(dhdps, f_ps + mu_ps) - torch.bmm(torch.abs(dhdps), sigma_ps) + torch.bmm(torch.bmm(dhdps, g_ps), action_batch)).squeeze(-1) + G[:, :num_cbfs, :n_u] = -torch.bmm(dhdps, g_ps) + G[:, :num_cbfs, n_u] = -1 + h[:, :num_cbfs] = gamma_b * (hs**3) + ( + torch.bmm(dhdps, f_ps + mu_ps) + - torch.bmm(torch.abs(dhdps), sigma_ps) + + torch.bmm(torch.bmm(dhdps, g_ps), action_batch) + ).squeeze(-1) ineq_constraint_counter += num_cbfs - - # Let's also build the cost matrices, vectors to minimize control effort and penalize slack - P = torch.diag(torch.tensor([1.e0, 1.e-2, 1e5])).repeat(batch_size, 1, 1).to(self.device) + P = ( + torch.diag(torch.tensor([1.0e0, 1.0e-2, 1e5])) + .repeat(batch_size, 1, 1) + .to(self.device) + ) q = torch.zeros((batch_size, n_u + 1)).to(self.device) - # Add Actuator Constraints - n_u = action_batch.shape[1] # dimension of control inputs + n_u = action_batch.shape[1] for c in range(n_u): - # u_max >= u_nom + u ---> u <= u_max - u_nom if self.u_max is not None: G[:, ineq_constraint_counter, c] = 1 h[:, ineq_constraint_counter] = self.u_max[c] - action_batch[:, c].squeeze(-1) ineq_constraint_counter += 1 - # u_min <= u_nom + u ---> -u <= u_min - u_nom if self.u_min is not None: G[:, ineq_constraint_counter, c] = -1 h[:, ineq_constraint_counter] = -self.u_min[c] + action_batch[:, c].squeeze(-1) @@ -410,19 +344,14 @@ def get_cbf_qp_constraints(self, state_batch, action_batch, mean_pred_batch, sig return P, q, G, h - def get_control_bounds(self): + def get_control_bounds(self) -> tuple[torch.Tensor, torch.Tensor]: """ - Returns - ------- - u_min : torch.tensor - min control input. 
- u_max : torch.tensor - max control input. + Returns: + Action bounds, i.e., min control input and max control input. """ u_min = torch.tensor(self.env.safe_action_space.low).to(self.device) u_max = torch.tensor(self.env.safe_action_space.high).to(self.device) return u_min, u_max - \ No newline at end of file diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py index 2824faf12..3380d1f2d 100644 --- a/omnisafe/common/robust_gp_model.py +++ b/omnisafe/common/robust_gp_model.py @@ -1,59 +1,110 @@ -""" Adapted almost directly from: -https://docs.gpytorch.ai/en/stable/examples/02_Scalable_Exact_GPs/Simple_GP_Regression_CUDA.html +from __future__ import annotations -Training is performed rapidly (and exactly) using GPUs and prediction is done very rapidly using LOVE. -""" +import os +import warnings +from typing import Callable -import torch -import numpy as np import gpytorch -import warnings -warnings.filterwarnings('ignore') -from omnisafe.common.utils import to_tensor, to_numpy +import gymnasium as gym +import numpy as np +import torch +from gpytorch.distributions import MultivariateNormal +from gpytorch.kernels import RBFKernel, ScaleKernel +from gpytorch.likelihoods import Likelihood +from gpytorch.means import ZeroMean +from gpytorch.priors import NormalPrior + +from omnisafe.common.utils import to_numpy, to_tensor +from omnisafe.typing import DEVICE_CPU + -DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}, # state = [x y θ] - 'SimulatedCars': {'n_s': 10, 'n_u': 1}, # state = [x y θ v ω] - 'Pvtol': {'n_s': 6, 'n_u': 2}, # state = [x y θ v_x v_y thrust] - 'Pendulum': {'n_s': 2, 'n_u': 1} - } -MAX_STD = {'Unicycle': [2e-1, 2e-1, 2e-1], 'SimulatedCars': [0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2, 0, 0.2], 'Pvtol': [0, 0, 0, 0, 0, 0], 'Pendulum': [0.1, 0.1, 0.1]} +DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}} +MAX_STD = {'Unicycle': [2e-1, 2e-1, 2e-1]} class BaseGPy(gpytorch.models.ExactGP): + """ + A Gaussian Process (GP) model using a zero 
mean function and a scaled RBF kernel with priors. + + This class extends gpytorch.models.ExactGP, specifically designed for use in + disturbance estimation tasks. - def __init__(self, train_x, train_y, prior_std, likelihood): + Attributes: + mean_module (ZeroMean): The mean module which is set to zero mean. + covar_module (ScaleKernel): The covariance kernel, a scaled RBF kernel with specified priors. + + Args: + train_x (Tensor): Training input features, which should be a tensor. + train_y (Tensor): Training target values, which should be a tensor. + prior_std (float): The prior standard deviation used to adjust the output scale of the kernel. + likelihood (Likelihood): The likelihood function associated with the GP model. + """ + + def __init__( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + prior_std: float, + likelihood: Likelihood, + ) -> None: + """Initialize the BaseGPy model.""" super().__init__(train_x, train_y, likelihood) - self.mean_module = gpytorch.means.ZeroMean() - self.covar_module = gpytorch.kernels.ScaleKernel( - gpytorch.kernels.RBFKernel(lengthscale_prior=gpytorch.priors.NormalPrior(1e5, 1e-5)), - outputscale_prior=gpytorch.priors.NormalPrior(prior_std + 1e-6, 1e-5)) - # Initialize lengthscale and outputscale to mean of priors + self.mean_module = ZeroMean() + self.covar_module = ScaleKernel( + RBFKernel(lengthscale_prior=NormalPrior(1e5, 1e-5)), + outputscale_prior=NormalPrior(prior_std + 1e-6, 1e-5), + ) self.covar_module.base_kernel.lengthscale = 1e5 self.covar_module.outputscale = prior_std + 1e-6 - def forward(self, x): + def forward(self, x: torch.Tensor) -> MultivariateNormal: + """Forward pass through the GP model to produce a multivariate normal distribution. + + Args: + x (Tensor): Input features for which predictions are to be made. + + Returns: + MultivariateNormal: A multivariate normal distribution reflecting the GP predictions. 
+ """ mean = self.mean_module(x) covar = self.covar_module(x) - return gpytorch.distributions.MultivariateNormal(mean, covar) + return MultivariateNormal(mean, covar) + class GPyDisturbanceEstimator: - """ - A wrapper around teh BaseGPy model above. + """A class for estimating disturbances using Gaussian Processes with GPyTorch. + + Attributes: + device (torch.device): The device (CPU or CUDA) on which the tensors will be processed. + _train_x (torch.Tensor): Training data features. + _train_y (torch.Tensor): Training data targets. + likelihood (gpytorch.likelihoods.Likelihood): The likelihood model for GP inference. + model (BaseGPy): The GPyTorch model. + + Args: + train_x (torch.Tensor): Training data features. If not a tensor, it will be converted. + train_y (torch.Tensor): Training data targets. If not a tensor, it will be converted. + prior_std (float): Standard deviation of the prior distribution. + likelihood (Optional[gpytorch.likelihoods.Likelihood]): A GPyTorch likelihood. If None, a default GaussianLikelihood is used. + device (Optional[torch.device]): The torch device. Defaults to CPU if None. 
""" - def __init__(self, train_x, train_y, prior_std, likelihood=None, device=None): - - if device: - self.device = device - else: - self.device = torch.device("cpu") + def __init__( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + prior_std: float, + likelihood: gpytorch.likelihoods.Likelihood | None = None, + device: torch.device = DEVICE_CPU, + ) -> None: + self.device = device if device else torch.device('cpu') if not torch.is_tensor(train_x): - train_x = to_tensor(train_x, torch.FloatTensor, self.device) + train_x = torch.tensor(train_x, dtype=torch.float32, device=self.device) if not torch.is_tensor(train_y): - train_y = to_tensor(train_y, torch.FloatTensor, self.device) - self.train_x = train_x - self.train_y = train_y + train_y = torch.tensor(train_y, dtype=torch.float32, device=self.device) + self._train_x = train_x + self._train_y = train_y if not likelihood: likelihood = gpytorch.likelihoods.GaussianLikelihood() @@ -61,182 +112,143 @@ def __init__(self, train_x, train_y, prior_std, likelihood=None, device=None): self.model = BaseGPy(train_x, train_y, prior_std, likelihood) self.model = self.model.to(self.device) + warnings.filterwarnings('ignore') - def train(self, training_iter, verbose=False): + def train(self, training_iter: int, verbose: bool = False) -> None: + """Trains the Gaussian Process model. - # Find optimal model hyperparameters + Args: + training_iter (int): Number of training iterations. + verbose (bool): If True, prints detailed logging information. 
+ """ self.model.train() self.likelihood.train() - - # Use the adam optimizer - optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) # Includes GaussianLikelihood parameters - - # "Loss" for GPs - the marginal log likelihood + optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model) for i in range(training_iter): - # Zero gradients from previous iteration optimizer.zero_grad() - # Output from model - output = self.model(self.train_x) - # Calc loss and backprop gradients - loss = -mll(output, self.train_y) + output = self.model(self._train_x) + loss = -mll(output, self._train_y) loss.backward() if verbose: - print('\tIter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % ( - i + 1, training_iter, loss.item(), - self.model.covar_module.base_kernel.lengthscale.item(), - self.model.likelihood.noise.item() - )) + print( + f'\tIter {i + 1}/{training_iter} - Loss: {loss.item():.3f} lengthscale: ' + f'{self.model.covar_module.base_kernel.lengthscale.item():.3f} noise: ' + f'{self.likelihood.noise.item():.3f}', + ) optimizer.step() - def predict(self, test_x): + def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: + """ + Makes predictions on new data. + + Args: + test_x (torch.Tensor): Test data features. If not a tensor, it will be converted. - # Convert to torch tensor + Returns: + A dictionary containing prediction mean, variance, covariance matrix, and confidence + intervals. If the input was not a tensor, values will be converted to numpy arrays. 
+ """ is_tensor = torch.is_tensor(test_x) if not is_tensor: - test_x = to_tensor(test_x, torch.FloatTensor, self.device) + test_x = torch.tensor(test_x, dtype=torch.float32, device=self.device) - # Get into evaluation (predictive posterior) mode self.model.eval() self.likelihood.eval() - # Test points are regularly spaced along [0,1] - # Make predictions by feeding model through likelihood with torch.no_grad(), gpytorch.settings.fast_pred_var(): observed_pred = self.likelihood(self.model(test_x)) - pred_dict = dict() - pred_dict['mean'] = observed_pred.mean.cpu() - pred_dict['f_var'] = observed_pred.variance.cpu() - pred_dict['f_covar'] = observed_pred.covariance_matrix.cpu() - lower_ci, upper_ci = observed_pred.confidence_region() - pred_dict['lower_ci'] = lower_ci.cpu() - pred_dict['upper_ci'] = upper_ci.cpu() - - # If they gave us ndarray, we give back ndarray + pred_dict = { + 'mean': observed_pred.mean.cpu(), + 'f_var': observed_pred.variance.cpu(), + 'f_covar': observed_pred.covariance_matrix.cpu(), + 'lower_ci': observed_pred.confidence_region()[0].cpu(), + 'upper_ci': observed_pred.confidence_region()[1].cpu(), + } + if not is_tensor: for key, val in pred_dict.items(): - pred_dict[key] = to_numpy(val) + pred_dict[key] = val.numpy() return pred_dict -class DynamicsModel: - def __init__(self, env, gp_model_size=2000, l_p=0.03, device='cpu'): - """Constructor of DynamicsModel. +class DynamicsModel: + """Initializes the DynamicsModel with a gym environment. - Parameters - ---------- - env : gym.env - Gym environment. - """ + Args: + env (gym.Env): The gym environment to model dynamics for. + gp_model_size (int, optional): Maximum history count for disturbances. Defaults to 2000. + l_p (float, optional): Learning parameter. Defaults to 0.03. + device (str, optional): The device to perform computations on. Defaults to 'cpu'. 
+ """ + def __init__( + self, + env: gym.Env, + gp_model_size: int = 2000, + l_p: float = 0.03, + device: str = 'cpu', + ) -> None: self.env = env - # Get Dynamics self.get_f, self.get_g = self.get_dynamics() self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] self.n_u = DYNAMICS_MODE[self.env.dynamics_mode]['n_u'] - # Keep Disturbance History to estimate it using GPs - self.disturb_estimators = None - self.disturbance_history = dict() - self.history_counter = 0 # keeping only max_history_count points in the buffer - self.max_history_count = gp_model_size # How many points we want to have in the GP + self._disturb_estimators = None + self.disturbance_history = {} + self.history_counter = 0 + self.max_history_count = gp_model_size self.disturbance_history['state'] = np.zeros((self.max_history_count, self.n_s)) self.disturbance_history['disturbance'] = np.zeros((self.max_history_count, self.n_s)) - self.train_x = None # x-data used to fit the last GP models - self.train_y = None # y-data used to fit the last GP models + self._train_x = None + self._train_y = None self.l_p = l_p - self.device = torch.device(device) - def predict_next_state(self, state_batch, u_batch, t_batch=None, use_gps=True): - """Given the current state and action, this function predicts the next state. - - Parameters - ---------- - state_batch : ndarray - State - u_batch : ndarray - Action - t_batch: ndarray, optional - Time batch for state dependant dynamics - use_gps : bool, optional - Use GPs to return mean and var - - Returns - ------- - next_state : ndarray - Next state + def predict_next_state(self, state_batch: np.ndarray, u_batch: np.ndarray) -> np.ndarray: """ + Predicts the next state given the current state and action batch. + Args: + state_batch (np.ndarray): The batch of current states. + u_batch (np.ndarray): The batch of actions applied. + + Returns: + np.ndarray: The batch of predicted next states. 
+ """ expand_dims = len(state_batch.shape) == 1 if expand_dims: state_batch = np.expand_dims(state_batch, axis=0) - # Start with our prior for continuous time system x' = f(x) + g(x)u - if t_batch is not None: - next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1)) - else: - next_state_batch = state_batch + self.env.dt * (self.get_f(state_batch) + (self.get_g(state_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1)) - - if use_gps: # if we want estimate the disturbance, let's do it! - pred_mean, pred_std = self.predict_disturbance(state_batch) - next_state_batch += self.env.dt * pred_mean - else: - pred_std = np.zeros(state_batch.shape) + next_state_batch = state_batch + self.env.dt * ( + self.get_f(state_batch) + + (self.get_g(state_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1) + ) + pred_mean, pred_std = self.predict_disturbance(state_batch) + next_state_batch += self.env.dt * pred_mean if expand_dims: next_state_batch = next_state_batch.squeeze(0) if pred_std is not None: pred_std = pred_std.squeeze(0) - if t_batch is not None: - next_t_batch = t_batch + self.env.dt - return next_state_batch, self.env.dt * pred_std, next_t_batch - - return next_state_batch, self.env.dt * pred_std, t_batch - - def predict_next_obs(self, state, u): - """Predicts the next observation given the state and u. Note that this only predicts the mean next observation. + return next_state_batch - Parameters - ---------- - state : ndarray - u : ndarray + def get_dynamics(self) -> tuple[Callable, Callable]: + """Retrieves the dynamics functions for drift and control based on the environment's dynamics mode. - Returns - ------- - next_obs : ndarray - Next observation + Returns: + tuple: A tuple containing two callables, `get_f` and `get_g`, which compute the drift and control dynamics respectively. 
""" - - next_state, _, _ = self.predict_next_state(state, u) - next_obs = self.get_obs(next_state) - return next_obs - - def get_dynamics(self): - """Get affine CBFs for a given environment. - - Parameters - ---------- - - Returns - ------- - get_f : callable - Drift dynamics of the continuous system x' = f(x) + g(x)u - get_g : callable - Control dynamics of the continuous system x' = f(x) + g(x)u - """ - if self.env.dynamics_mode == 'Unicycle': - def get_f(state_batch, t_batch=None): - f_x = np.zeros(state_batch.shape) - return f_x + def get_f(state_batch: np.ndarray) -> np.ndarray: + return np.zeros(state_batch.shape) - def get_g(state_batch, t_batch=None): + def get_g(state_batch: np.ndarray) -> np.ndarray: theta = state_batch[:, 2] g_x = np.zeros((state_batch.shape[0], 3, 2)) g_x[:, 0, 0] = np.cos(theta) @@ -244,53 +256,28 @@ def get_g(state_batch, t_batch=None): g_x[:, 2, 1] = 1.0 return g_x - elif self.env.dynamics_mode == 'Pendulum': - - def get_f(state_batch, t_batch=None): - f_x = np.zeros(state_batch.shape) - theta = state_batch[:, 0] - theta_dot = state_batch[:, 1] - f_x = np.array( - [ - -3*10/2*np.sin(theta+np.pi)*self.env.dt + theta, - theta_dot - 3*10/2*np.sin(theta+np.pi) - ] - ) - return f_x - - def get_g(state_batch, t_batch=None): - g_x = np.zeros((state_batch.shape[0], 2, 1)) - g_x[:, 0, 0] = 3*self.env.dt**2 - g_x[:, 1, 0] = 3*self.env.dt - return g_x - else: raise Exception('Unknown Dynamics mode.') return get_f, get_g - def get_state(self, obs): - """Given the observation, this function does the pre-processing necessary and returns the state. - - Parameters - ---------- - obs_batch : ndarray or torch.tensor - Environment observation. + def get_state(self, obs: np.ndarray) -> np.ndarray: + """ + Processes the raw observations from the environment and returns the corresponding state representation. - Returns - ------- - state_batch : ndarray or torch.tensor - State of the system. + Args: + obs (np.ndarray): The environment observations. 
+ Returns: + np.ndarray: The processed state of the system. """ - expand_dims = len(obs.shape) == 1 is_tensor = torch.is_tensor(obs) if is_tensor: dtype = obs.dtype device = obs.device - obs = to_numpy(obs) + obs = obs.cpu().numpy() if obs.is_cuda else obs.numpy() if expand_dims: obs = np.expand_dims(obs, 0) @@ -301,64 +288,29 @@ def get_state(self, obs): state_batch[:, 0] = obs[:, 0] state_batch[:, 1] = obs[:, 1] state_batch[:, 2] = theta - elif self.env.dynamics_mode == 'Pendulum': - theta = np.arctan2(obs[:, 1], obs[:, 0]) - theta_dot = obs[:, 2] - state_batch = np.zeros((obs.shape[0], 2)) - state_batch[:, 0] = theta - state_batch[:, 1] = theta_dot else: raise Exception('Unknown dynamics') if expand_dims: state_batch = state_batch.squeeze(0) - return to_tensor(state_batch, dtype, device) if is_tensor else state_batch - - def get_obs(self, state_batch): - """Given the state, this function returns it to an observation akin to the one obtained by calling env.step - - Parameters - ---------- - state : ndarray - Environment state batch of shape (batch_size, n_s) - - Returns - ------- - obs : ndarray - Observation batch of shape (batch_size, n_o) - - """ - - if self.env.dynamics_mode == 'Unicycle': - obs = np.zeros((state_batch.shape[0], 4)) - obs[:, 0] = state_batch[:, 0] - obs[:, 1] = state_batch[:, 1] - obs[:, 2] = np.cos(state_batch[:, 2]) - obs[:, 3] = np.sin(state_batch[:, 2]) - else: - raise Exception('Unknown dynamics') - return obs - - def append_transition(self, state_batch, u_batch, next_state_batch, t_batch=None): - """Estimates the disturbance from the current dynamics transition and adds it to buffer. 
- - Parameters - ---------- - state_batch : ndarray - shape (n_s,) or (batch_size, n_s) - u_batch : ndarray - shape (n_u,) or (batch_size, n_u) - next_state_batch : ndarray - shape (n_s,) or (batch_size, n_s) - t_batch : ndarray, optional - shape (1,) or (batch_size, 1) - - Returns - ------- - + if is_tensor: + return torch.tensor(state_batch, dtype=dtype, device=device) + return state_batch + + def append_transition( + self, + state_batch: np.ndarray, + u_batch: np.ndarray, + next_state_batch: np.ndarray, + ) -> None: + """Estimates the disturbance from the current dynamics transition and adds it to the buffer. + + Args: + state_batch (np.ndarray): The batch of current states, shape (n_s,) or (batch_size, n_s). + u_batch (np.ndarray): The batch of actions applied, shape (n_u,) or (batch_size, n_u). + next_state_batch (np.ndarray): The batch of next states, shape (n_s,) or (batch_size, n_s). """ - expand_dims = len(state_batch.shape) == 1 if expand_dims: @@ -366,71 +318,68 @@ def append_transition(self, state_batch, u_batch, next_state_batch, t_batch=None next_state_batch = np.expand_dims(next_state_batch, 0) u_batch = np.expand_dims(u_batch, 0) - u_batch = np.expand_dims(u_batch, -1) # for broadcasting batch matrix multiplication - disturbance_batch = (next_state_batch - state_batch - self.env.dt * (self.get_f(state_batch, t_batch) + (self.get_g(state_batch, t_batch) @ u_batch).squeeze(-1))) / self.env.dt + u_batch = np.expand_dims(u_batch, -1) + disturbance_batch = ( + next_state_batch + - state_batch + - self.env.dt + * (self.get_f(state_batch) + (self.get_g(state_batch) @ u_batch).squeeze(-1)) + ) / self.env.dt - # Append new data point (state, disturbance) to our dataset for i in range(state_batch.shape[0]): - - self.disturbance_history['state'][self.history_counter % self.max_history_count] = state_batch[i] - self.disturbance_history['disturbance'][self.history_counter % self.max_history_count] = disturbance_batch[i] - - # Increment how many data points we 
have + self.disturbance_history['state'][self.history_counter % self.max_history_count] = ( + state_batch[i] + ) + self.disturbance_history['disturbance'][ + self.history_counter % self.max_history_count + ] = disturbance_batch[i] self.history_counter += 1 - # Update GP models every max_history_count data points - if self.history_counter % (self.max_history_count/10) == 0: + if self.history_counter % (self.max_history_count // 10) == 0: self.fit_gp_model() - def fit_gp_model(self, training_iter=70): - """ - - Parameters - ---------- - training_iter : int - Number of training iterations for GP model. - - Returns - ------- + def fit_gp_model(self, training_iter: int = 70) -> None: + """Fits a Gaussian Process model to the disturbance data. + Args: + training_iter (int, optional): Number of training iterations for the GP model. Defaults to 70. """ - - if self.history_counter < self.max_history_count: # didn't fill the buffer yet - train_x = self.disturbance_history['state'][:self.history_counter] - train_y = self.disturbance_history['disturbance'][:self.history_counter] - else: # buffer filled, use all the data points + if self.history_counter < self.max_history_count: + train_x = self.disturbance_history['state'][: self.history_counter] + train_y = self.disturbance_history['disturbance'][: self.history_counter] + else: train_x = self.disturbance_history['state'] train_y = self.disturbance_history['disturbance'] - # Normalize Data train_x_std = np.std(train_x, axis=0) train_x_normalized = train_x / (train_x_std + 1e-8) train_y_std = np.std(train_y, axis=0) train_y_normalized = train_y / (train_y_std + 1e-8) - self.disturb_estimators = [] + self._disturb_estimators = [] for i in range(self.n_s): - # self.disturb_estimators.append(GPyDisturbanceEstimator(train_x, train_y[:, i])) - self.disturb_estimators.append(GPyDisturbanceEstimator(train_x_normalized, train_y_normalized[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device)) - 
self.disturb_estimators[i].train(training_iter) - - # track the data I last used to fit the GPs for saving purposes (need it to initialize before loading weights) - self.train_x = train_x - self.train_y = train_y - - def predict_disturbance(self, test_x): - """Predict the disturbance at the queried states using the GP models. - - Parameters - ---------- - test_x : ndarray or torch.tensor - shape(n_test, n_s) - Returns - ------- - means: ndarray or torch.tensor - Prediction means -- shape(n_test, n_s) - vars: ndarray or torch.tensor - Prediction variances -- shape(n_test, n_s) + self._disturb_estimators.append( + GPyDisturbanceEstimator( + train_x_normalized, + train_y_normalized[:, i], + MAX_STD[self.env.dynamics_mode][i], + device=self.device, + ), + ) + self._disturb_estimators[i].train(training_iter) + + self._train_x = train_x + self._train_y = train_y + + def predict_disturbance(self, test_x: np.ndarray) -> tuple: + """Predicts the disturbance at the queried states using the trained Gaussian Process models. + + Args: + test_x (np.ndarray): The state for which to predict disturbances, shape (n_test, n_s). + + Returns: + tuple: A tuple of arrays (means, variances) where means is the predicted mean disturbance + and variances is the corresponding variance, shape (n_test, n_s). 
""" is_tensor = torch.is_tensor(test_x) @@ -445,19 +394,18 @@ def predict_disturbance(self, test_x): test_x = np.expand_dims(test_x, axis=0) means = np.zeros(test_x.shape) - f_std = np.zeros(test_x.shape) # standard deviation + f_std = np.zeros(test_x.shape) - if self.disturb_estimators: - # Normalize - train_x_std = np.std(self.train_x, axis=0) - train_y_std = np.std(self.train_y, axis=0) + if self._disturb_estimators: + train_x_std = np.std(self._train_x, axis=0) + train_y_std = np.std(self._train_y, axis=0) test_x = test_x / train_x_std for i in range(self.n_s): - prediction_ = self.disturb_estimators[i].predict(test_x) + prediction_ = self._disturb_estimators[i].predict(test_x) means[:, i] = prediction_['mean'] * (train_y_std[i] + 1e-8) f_std[:, i] = np.sqrt(prediction_['f_var']) * (train_y_std[i] + 1e-8) - else: # zero-mean, max_sigma prior + else: f_std = np.ones(test_x.shape) for i in range(self.n_s): f_std[:, i] *= MAX_STD[self.env.dynamics_mode][i] @@ -466,33 +414,48 @@ def predict_disturbance(self, test_x): means = means.squeeze(0) f_std = f_std.squeeze(0) - return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) if is_tensor else (means, f_std) - - def load_disturbance_models(self, output): + return ( + (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) + if is_tensor + else (means, f_std) + ) - if output is None: - return + def load_disturbance_models(self, save_dir: str, epoch: str) -> None: + """Loads the disturbance models and their training data. - self.disturb_estimators = [] - - weights = torch.load('{}/gp_models.pkl'.format(output), map_location=self.device) - self.train_x = torch.load('{}/gp_models_train_x.pkl'.format(output)) - self.train_y = torch.load('{}/gp_models_train_y.pkl'.format(output)) + Args: + save_dir (str): The directory where the model files are saved. + epoch (str): The epoch identifier used in the filenames to load the specific model checkpoint. 
+ """ + self._disturb_estimators = [] + weights = torch.load( + os.path.join(save_dir, f'gp_models_{epoch}.pkl'), + map_location=self.device, + ) + self._train_x = torch.load(os.path.join(save_dir, f'gp_models_train_x_{epoch}.pkl')) + self._train_y = torch.load(os.path.join(save_dir, f'gp_models_train_y_{epoch}.pkl')) for i in range(self.n_s): - self.disturb_estimators.append(GPyDisturbanceEstimator(self.train_x, self.train_y[:, i], MAX_STD[self.env.dynamics_mode][i], device=self.device)) - self.disturb_estimators[i].model.load_state_dict(weights[i]) - - def save_disturbance_models(self, output): - - if not self.disturb_estimators or self.train_x is None or self.train_y is None: - return - weights = [] - for i in range(len(self.disturb_estimators)): - weights.append(self.disturb_estimators[i].model.state_dict()) - torch.save(weights, '{}/gp_models.pkl'.format(output)) - # Also save data used to fit model (needed for initializing the model before loading weights) - torch.save(self.train_x, '{}/gp_models_train_x.pkl'.format(output)) - torch.save(self.train_y, '{}/gp_models_train_y.pkl'.format(output)) - - def seed(self, seed): - torch.manual_seed(seed) \ No newline at end of file + self._disturb_estimators.append( + GPyDisturbanceEstimator( + self._train_x, + self._train_y[:, i], + MAX_STD[self.env.dynamics_mode][i], + device=self.device, + ), + ) + self._disturb_estimators[i].model.load_state_dict(weights[i]) + + @property + def train_x(self) -> np.ndarray: + """Returns the training data input features used for the disturbance estimators.""" + return self._train_x + + @property + def train_y(self) -> np.ndarray: + """Returns the training data labels used for the disturbance estimators.""" + return self._train_y + + @property + def disturb_estimators(self) -> list[GPyDisturbanceEstimator]: + """Provides access to the list of trained disturbance estimator models.""" + return self._disturb_estimators diff --git a/omnisafe/common/utils.py b/omnisafe/common/utils.py 
index beee622e5..ec36fe157 100644 --- a/omnisafe/common/utils.py +++ b/omnisafe/common/utils.py @@ -1,182 +1,51 @@ -import math import numpy as np -import os import torch -from torch.autograd import Variable -USE_CUDA = torch.cuda.is_available() +def to_numpy(x: torch.Tensor) -> np.ndarray: + """Convert a torch tensor to a numpy array. -def prRed(prt): print("\033[91m {}\033[00m".format(prt)) + Args: + x (torch.Tensor): A torch tensor to be converted. - -def prGreen(prt): print("\033[92m {}\033[00m".format(prt)) - - -def prYellow(prt): print("\033[93m {}\033[00m".format(prt)) - - -def prLightPurple(prt): print("\033[94m {}\033[00m".format(prt)) - - -def prPurple(prt): print("\033[95m {}\033[00m".format(prt)) - - -def prCyan(prt): print("\033[96m {}\033[00m".format(prt)) - - -def prLightGray(prt): print("\033[97m {}\033[00m".format(prt)) - - -def prBlack(prt): print("\033[98m {}\033[00m".format(prt)) - - -def mat_to_euler_2d(rot_mat): + Returns: + np.ndarray: A numpy array representation of the input tensor. """ - rot_mat has shape: - [[c -s 0], - [s c 0], - [0 0 1]] - """ - - theta = np.arcsin(rot_mat[1, 0]) - return theta + return x.cpu().detach().double().numpy() -def euler_to_mat_2d(theta_batch): - s = np.sin(theta_batch) - c = np.cos(theta_batch) - Rs = np.zeros((theta_batch.shape[0], 2, 2)) - Rs[:, 0, 0] = c - Rs[:, 0, 1] = -s - Rs[:, 1, 0] = s - Rs[:, 1, 1] = c - return Rs +def to_tensor( + x: np.ndarray, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool = False, +) -> torch.Tensor: + """Convert a numpy array to a torch tensor of specified type and device. -def to_numpy(x): - # convert torch tensor to numpy array - return x.cpu().detach().double().numpy() + Args: + x (np.ndarray): A numpy array to be converted. + dtype (torch.dtype): The desired data type for the tensor. + device (torch.device): The device to store the tensor on. + requires_grad (bool): If True, gradients will be computed for operations involving this tensor. 
-def to_tensor(x, dtype, device, requires_grad=False): - # convert numpy array to torch tensor + Returns: + torch.Tensor: A torch tensor representation of the input array. + """ if type(x).__module__ != 'numpy': return x return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) -def scale_action(action, action_lb, action_ub, device=None): - - act_k = (action_ub - action_lb) / 2. - act_b = (action_ub + action_lb) / 2. - return act_k * action + act_b - - -def soft_update(target, source, tau): - for target_param, param in zip(target.parameters(), source.parameters()): - target_param.data.copy_( - target_param.data * (1.0 - tau) + param.data * tau - ) - - -def hard_update(target, source): - for target_param, param in zip(target.parameters(), source.parameters()): - target_param.data.copy_(param.data) - -def create_log_gaussian(mean, log_std, t): - quadratic = -((0.5 * (t - mean) / (log_std.exp())).pow(2)) - l = mean.shape - log_z = log_std - z = l[-1] * math.log(2 * math.pi) - log_p = quadratic.sum(dim=-1) - log_z.sum(dim=-1) - 0.5 * z - return log_p +def sort_vertices_cclockwise(vertices: np.ndarray) -> np.ndarray: + """Sort vertices of a 2D convex polygon in counter-clockwise direction. + Args: + vertices (np.ndarray): An array of shape (n_v, 2) where n_v is the number of vertices. -def logsumexp(inputs, dim=None, keepdim=False): - if dim is None: - inputs = inputs.view(-1) - dim = 0 - s, _ = torch.max(inputs, dim=dim, keepdim=True) - outputs = s + (inputs - s).exp().sum(dim=dim, keepdim=True).log() - if not keepdim: - outputs = outputs.squeeze(dim) - return outputs - - -def get_output_folder(parent_dir, env_name): - """Return save folder. - - Assumes folders in the parent_dir have suffix -run{run - number}. Finds the highest run number and sets the output folder - to that number + 1. This is just convenient so that if you run the - same script multiple times tensorboard can plot all of the results - on the same plots with different names. 
- - Parameters - ---------- - parent_dir: str - Path of the directory containing all experiment runs. - - Returns - ------- - parent_dir/run_dir - Path to this run's save directory. - """ - os.makedirs(parent_dir, exist_ok=True) - experiment_id = 0 - for folder_name in os.listdir(parent_dir): - if not os.path.isdir(os.path.join(parent_dir, folder_name)): - continue - try: - folder_name = int(folder_name.split('-run')[-1]) - if folder_name > experiment_id: - experiment_id = folder_name - except: - pass - experiment_id += 1 - - parent_dir = os.path.join(parent_dir, env_name) - parent_dir = parent_dir + '-run{}'.format(experiment_id) - os.makedirs(parent_dir, exist_ok=True) - return parent_dir - - -def get_wrapped_policy(agent, cbf_wrapper, dynamics_model, compensator=None, warmup=False, action_space=None, - policy_eval=False): - - def wrapped_policy(observation): - - if warmup and action_space: - action = action_space.sample() # Sample random action - else: - action, _ = agent.select_action(observation, evaluate=policy_eval) # Sample action from policy - - if compensator: - action_comp = compensator(observation) - else: - action_comp = 0 - state = dynamics_model.get_state(observation) - disturb_mean, disturb_std = dynamics_model.predict_disturbance(state) - action_safe = cbf_wrapper.get_safe_action(state, action + action_comp, disturb_mean, disturb_std) - # print('state = {}, action = {}, action_comp = {}, u_safe = {}'.format(state, action, action_comp, u_safe)) - return action + action_comp + action_safe - - return wrapped_policy - -def sort_vertices_cclockwise(vertices): - """ Function used to sort vertices of 2D convex polygon in counter clockwise direction. - - Parameters - ---------- - vertices : numpy.ndarray - Array of size (n_v, 2) where n_v is the number of vertices and d is the dimension of the space - - Returns - ------- - sorted_vertices : numpy.ndarray - Array of size (n_v, 2) of the vertices sorted in counter-clockwise direction. 
+ Returns: + np.ndarray: An array of vertices sorted in counter-clockwise direction. """ - - assert vertices.shape[1] == 2, "Vertices must each have dimension 2, got {}".format(vertices.shape[1]) + assert vertices.shape[1] == 2, f'Vertices must each have dimension 2, got {vertices.shape[1]}' # Sort vertices polygon_center = vertices.sum(axis=0, keepdims=True) / vertices.shape[0] # (1, d) @@ -184,32 +53,3 @@ def sort_vertices_cclockwise(vertices): thetas = np.arctan2(rel_vecs[:, 1], rel_vecs[:, 0]) idxs = np.argsort(thetas) return vertices[idxs, :] - -def get_polygon_normals(vertices): - """ - - Parameters - ---------- - vertices : numpy.ndarray - Array of size (n_v, 2) where n_v is the number of 2D vertices. - Returns - ------- - normals : numpy.ndarray - Array of size (n_v, 2) where each row i is the 2D normal vector of the line from vertices_sorted[i] - vertices_sorted[i+1] - - centers : numpy.ndarary - Array of size (n_v, 2) where each row i is the 2D center point of the segment from vertices_sorted[i] to vertices_sorted[i+1] - """ - - sorted_vertices = sort_vertices_cclockwise(vertices) # (n_v, 2) - diffs = np.diff(sorted_vertices, axis=0, append=sorted_vertices[[0]]) # (n_v, 2) at row i contains vector from v_i to v_i+1 - - # Compute Normals (rotate each diff by -90 degrees) - diffs = np.diff(sorted_vertices, axis=0, append=sorted_vertices[[0]]) # (n_v, 2) at row i contains vector from v_i to v_i+1 - normals = np.array([diffs[:, 1], -diffs[:, 0]]).transpose() - normals = normals / np.linalg.norm(normals) - # Compute Centers - centers = (diffs + 2*vertices) / 2.0 - return normals, centers - - diff --git a/omnisafe/configs/off-policy/DDPGCBF.yaml b/omnisafe/configs/off-policy/DDPGCBF.yaml index 1579aa658..3eec4dced 100644 --- a/omnisafe/configs/off-policy/DDPGCBF.yaml +++ b/omnisafe/configs/off-policy/DDPGCBF.yaml @@ -29,7 +29,7 @@ defaults: # total number of steps to train total_steps: 80_000 # number of evaluate episodes - eval_episodes: 0 + eval_episodes: 1 # 
algorithm configurations algo_cfgs: # number of steps to update the policy @@ -77,7 +77,7 @@ defaults: # use tensorboard for logging use_tensorboard: True # save model frequency - save_model_freq: 100 + save_model_freq: 20 # save logger path log_dir: "./runs" # save model path @@ -105,7 +105,7 @@ defaults: # Size of hidden layers hidden_sizes: [400, 300] # Activation function - + activation: relu # The learning rate of Critic network lr: 0.001 diff --git a/omnisafe/configs/off-policy/SACRCBF.yaml b/omnisafe/configs/off-policy/SACRCBF.yaml index bb133e56c..53c5e5a17 100644 --- a/omnisafe/configs/off-policy/SACRCBF.yaml +++ b/omnisafe/configs/off-policy/SACRCBF.yaml @@ -27,13 +27,13 @@ defaults: # number of parallel agent, similar to a3c parallel: 1 # total number of steps to train - total_steps: 80_000 + total_steps: 200000 # number of evaluate episodes - eval_episodes: 0 + eval_episodes: 1 # algorithm configurations algo_cfgs: # number of steps to update the policy - steps_per_epoch: 200 + steps_per_epoch: 1000 # number of steps per sample update_cycle: 1 # number of iterations to update the policy @@ -93,7 +93,7 @@ defaults: # use tensorboard for logging use_tensorboard: True # save model frequency - save_model_freq: 100 + save_model_freq: 40 # save logger path log_dir: "./runs" # save model path @@ -126,23 +126,9 @@ defaults: lr: 0.0003 # Dynamics model configurations dynamics_model_cfgs: - # The max number of episodes updateing GP models + # The max number of episodes updating GP models gp_max_episodes: 100 # The size of gp model gp_model_size: 2000 # Whether to use the action compensator use_compensator: False - -Pendulum-v1: - # algorithm configurations - algo_cfgs: - # Actor perdorm random action before `start_learning_steps` steps - start_learning_steps: 0 - # control barrier function configurations - cbf_cfgs: - # gamma of control barrier certificate. 
- gamma_b: 0.5 - # confidence parameter desired - k_d: 1.5 - # environment dynamics coefficient - l_p: 0.03 \ No newline at end of file diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index a8d60878b..ab025a391 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -155,4 +155,4 @@ Pendulum-v1: # hidden layer sizes hidden_sizes: [64, 64] # activation function - activation: relu \ No newline at end of file + activation: relu diff --git a/omnisafe/configs/on-policy/TRPOCBF.yaml b/omnisafe/configs/on-policy/TRPOCBF.yaml index 74922c9d2..8fecee0d4 100644 --- a/omnisafe/configs/on-policy/TRPOCBF.yaml +++ b/omnisafe/configs/on-policy/TRPOCBF.yaml @@ -136,4 +136,4 @@ defaults: # learning rate lr: 0.01 # number of iterations to update the compensator - update_iters: 1 \ No newline at end of file + update_iters: 1 diff --git a/omnisafe/envs/__init__.py b/omnisafe/envs/__init__.py index ebeb6af4e..c21b1973c 100644 --- a/omnisafe/envs/__init__.py +++ b/omnisafe/envs/__init__.py @@ -15,12 +15,14 @@ """Environment API for OmniSafe.""" from omnisafe.envs import classic_control +from omnisafe.envs.barrier_function_env import BarrierFunctionEnv from omnisafe.envs.core import CMDP, env_register, make, support_envs from omnisafe.envs.crabs_env import CRABSEnv from omnisafe.envs.custom_env import CustomEnv from omnisafe.envs.meta_drive_env import SafetyMetaDriveEnv from omnisafe.envs.barrier_function_env import BarrierFunctionEnv from omnisafe.envs.mujoco_env import MujocoEnv +from omnisafe.envs.robust_barrier_function_env import RobustBarrierFunctionEnv from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv from omnisafe.envs.safety_gymnasium_modelbased import SafetyGymnasiumModelBased from omnisafe.envs.safety_isaac_gym_env import SafetyIsaacGymEnv diff --git a/omnisafe/envs/barrier_function_env.py b/omnisafe/envs/barrier_function_env.py index f8d0d964c..d664e749b 100644 --- 
a/omnisafe/envs/barrier_function_env.py +++ b/omnisafe/envs/barrier_function_env.py @@ -21,24 +21,26 @@ import gymnasium import numpy as np import torch - from gymnasium import spaces + +from omnisafe.common.logger import Logger from omnisafe.envs.core import CMDP, env_register from omnisafe.typing import Box -# @env_register +@env_register class BarrierFunctionEnv(CMDP): """Interface of control barrier function-based environments. - - .. warning:: - Since environments based on control barrier functions require special judgment and control of environmental dynamics, + + .. warning:: + Since environments based on control barrier functions require special judgment and control of environmental dynamics, they do not support the use of vectorized environments for parallelization. Attributes: need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. need_time_limit_wrapper (bool): Whether to use time limit wrapper. """ + need_auto_reset_wrapper = True need_time_limit_wrapper = False _support_envs: ClassVar[list[str]] = [ @@ -70,7 +72,7 @@ def __init__( super().__init__(env_id) self._env_id = env_id if num_envs == 1: - self._env = gymnasium.make(id=env_id, autoreset=False, **kwargs) + self._env = gymnasium.make(id=env_id, autoreset=False) self._env_specific_setting() assert isinstance(self._env.action_space, Box), 'Only support Box action space.' assert isinstance( @@ -82,21 +84,26 @@ def __init__( else: raise NotImplementedError('Only support num_envs=1 now.') self._device = torch.device(device) - + self._episodic_violation = [] self._num_envs = num_envs self._metadata = self._env.metadata + self.env_spec_log = {'Metrics/Max_angle_violation': 0.0} - def _env_specific_setting(self): + def _env_specific_setting(self) -> None: """Execute some specific setting for environments. - - Some algorithms based on control barrier functions have made fine-tuning adjustments to the environment. 
+ + Some algorithms based on control barrier functions have made fine-tuning adjustments to the environment. We have organized these adjustments and encapsulated them in this function. """ if self._env_id == 'Pendulum-v1': - self._env.unwrapped.max_torque = 15. - self._env.unwrapped.max_speed = 60. - self._env.unwrapped.action_space = spaces.Box(low=-self._env.unwrapped.max_torque, high=self._env.unwrapped.max_torque, shape=(1,)) - high = np.array([1., 1., self._env.unwrapped.max_speed]) + self._env.unwrapped.max_torque = 15.0 + self._env.unwrapped.max_speed = 60.0 + self._env.unwrapped.action_space = spaces.Box( + low=-self._env.unwrapped.max_torque, + high=self._env.unwrapped.max_torque, + shape=(1,), + ) + high = np.array([1.0, 1.0, self._env.unwrapped.max_speed]) self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) self._env.dt = 0.05 self._env.dynamics_mode = 'Pendulum' @@ -139,6 +146,7 @@ def step( for x in (obs, reward, terminated, truncated) ) cost = torch.abs(torch.atan2(obs[1], obs[0])).to(self._device) + self._episodic_violation.append(cost) if 'final_observation' in info: info['final_observation'] = np.array( @@ -155,6 +163,20 @@ def step( return obs, reward, cost, terminated, truncated, info + def spec_log(self, logger: Logger) -> None: + """Log specific environment into logger. + + Max angle violation in one episode. + + .. note:: + This function will be called after each episode. + + Args: + logger (Logger): The logger to use for logging. 
+ """ + logger.store({'Metrics/Max_angle_violation': max(self._episodic_violation)}) + self._episodic_violation = [] + def reset( self, seed: int | None = None, @@ -172,7 +194,7 @@ def reset( """ obs, info = self._env.reset(seed=seed, options=options) if self._env_id == 'Pendulum-v1': - while (self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0): + while self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0: obs, info = self._env.reset(options=options) return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info @@ -184,14 +206,6 @@ def set_seed(self, seed: int) -> None: """ self.reset(seed=seed) - def sample_action(self) -> torch.Tensor: - """Sample a random action. - - Returns: - A random action. - """ - return torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)) - def render(self) -> Any: """Render the environment. @@ -205,5 +219,5 @@ def close(self) -> None: self._env.close() @property - def unwrapped(self): - return self._env.unwrapped \ No newline at end of file + def unwrapped(self) -> gymnasium.Env: + return self._env.unwrapped diff --git a/omnisafe/envs/robust_barrier_function_env.py b/omnisafe/envs/robust_barrier_function_env.py index 12e680b86..1f1c10418 100644 --- a/omnisafe/envs/robust_barrier_function_env.py +++ b/omnisafe/envs/robust_barrier_function_env.py @@ -18,33 +18,33 @@ from typing import Any, ClassVar +import gymnasium import numpy as np import torch +from gymnasium import spaces -import gymnasium from omnisafe.envs.core import CMDP, env_register -from omnisafe.typing import Box -from gymnasium import spaces from omnisafe.envs.unicycle_env import UnicycleEnv +from omnisafe.typing import Box @env_register class RobustBarrierFunctionEnv(CMDP): """Interface of control barrier function-based environments. - - .. warning:: - Since environments based on control barrier functions require special judgment and control of environmental dynamics, + + .. 
warning:: + Since environments based on control barrier functions require special judgment and control of environmental dynamics, they do not support the use of vectorized environments for parallelization. Attributes: need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. need_time_limit_wrapper (bool): Whether to use time limit wrapper. """ + need_auto_reset_wrapper = True need_time_limit_wrapper = False _support_envs: ClassVar[list[str]] = [ 'Unicycle', - 'Pendulum-v1', ] def __init__( @@ -74,9 +74,6 @@ def __init__( if num_envs == 1: if self._env_id == 'Unicycle': self._env = UnicycleEnv() - elif self._env_id == 'Pendulum-v1': - self._env = gymnasium.make(id=env_id, autoreset=False, **kwargs) - self._env_specific_setting() else: raise NotImplementedError('Only support Unicycle now.') assert isinstance(self._env.action_space, Box), 'Only support Box action space.' @@ -93,19 +90,6 @@ def __init__( self._num_envs = num_envs self._metadata = self._env.metadata - def _env_specific_setting(self): - """Execute some specific setting for environments. - - Some algorithms based on control barrier functions have made fine-tuning adjustments to the environment. - We have organized these adjustments and encapsulated them in this function. - """ - if self._env_id == 'Pendulum-v1': - self._env.unwrapped.max_torque = 15. - self._env.unwrapped.max_speed = 60. - self._env.unwrapped.action_space = spaces.Box(low=-self._env.unwrapped.max_torque, high=self._env.unwrapped.max_torque, shape=(1,)) - high = np.array([1., 1., self._env.unwrapped.max_speed]) - self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) - def step( self, action: torch.Tensor, @@ -136,23 +120,13 @@ def step( truncated: Whether the episode has been truncated due to a time limit. info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). 
""" - if self._env_id == 'Unicycle': - obs, reward, cost, terminated, truncated, info = self._env.step( - action.detach().cpu().numpy(), - ) - obs, reward, cost, terminated, truncated = ( - torch.as_tensor(x, dtype=torch.float32, device=self._device) - for x in (obs, reward, cost, terminated, truncated) - ) - elif self._env_id == 'Pendulum-v1': - obs, reward, terminated, truncated, info = self._env.step( - action.detach().cpu().numpy(), - ) - obs, reward, terminated, truncated = ( - torch.as_tensor(x, dtype=torch.float32, device=self._device) - for x in (obs, reward, terminated, truncated) - ) - cost = torch.abs(torch.atan2(obs[1], obs[0])).to(self._device) + obs, reward, cost, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, cost, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, cost, terminated, truncated) + ) if 'final_observation' in info: info['final_observation'] = np.array( [ @@ -184,9 +158,6 @@ def reset( info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). """ obs, info = self._env.reset(seed=seed, options=options) - if self._env_id == 'Pendulum-v1': - while (self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0): - obs, info = self._env.reset(options=options) return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info def set_seed(self, seed: int) -> None: @@ -203,7 +174,10 @@ def sample_action(self) -> torch.Tensor: Returns: A random action. """ - return torch.normal(torch.zeros(self.action_space.shape), torch.ones(self.action_space.shape)) + return torch.normal( + torch.zeros(self.action_space.shape), + torch.ones(self.action_space.shape), + ) def render(self) -> Any: """Render the environment. 
@@ -216,9 +190,6 @@ def render(self) -> Any: def close(self) -> None: """Close the environment.""" self._env.close() - - def __getattr__(self, name): - try: - return getattr(self._env, name) - except AttributeError: - raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") + + def __getattr__(self, name: str) -> Any: + return getattr(self._env, name) diff --git a/omnisafe/envs/unicycle_env.py b/omnisafe/envs/unicycle_env.py index fb16394a5..4fca58eed 100644 --- a/omnisafe/envs/unicycle_env.py +++ b/omnisafe/envs/unicycle_env.py @@ -1,33 +1,41 @@ -import numpy as np +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any, Callable + import gymnasium as gym +import numpy as np from gymnasium import spaces -from collections.abc import Iterable -def to_pixel(meas_cm, shift=0): +def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: + """Convert measurements from centimeters to pixels. + Args: + meas_cm (list[float] | float): A single measurement or a list of measurements in centimeters. + shift (int, optional): An integer value that is added to the converted measurement(s). Default is 0. + + Returns: + float | np.ndarray: The measurement converted to pixels. 
+ """ if isinstance(meas_cm, Iterable): return 1.5 * 37.795 * meas_cm + np.array(shift) return 1.5 * 37.795 * meas_cm + shift + class UnicycleEnv(gym.Env): """Custom Environment that follows SafetyGym interface""" - metadata = {'render.modes': ['human']} + def __init__(self) -> None: - def __init__(self, obs_config='default'): - - super(UnicycleEnv, self).__init__() + super().__init__() self.dynamics_mode = 'Unicycle' - # Define action and observation space - # They must be gym.spaces objects - # Example when using discrete actions: self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,)) self.safe_action_space = spaces.Box(low=-2.5, high=2.5, shape=(2,)) self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,)) - self.bds = np.array([[-3., -3.], [3., 3.]]) + self.bds = np.array([[-3.0, -3.0], [3.0, 3.0]]) self.dt = 0.02 self.max_episode_steps = 1000 @@ -36,9 +44,11 @@ def __init__(self, obs_config='default'): # Initialize Env self.state = None self.episode_step = 0 - self.initial_state = np.array([[-2.5, -2.5, 0.0], [-2.5, 2.5, 0.0], [-2.5, 0.0, 0.0], [2.5, -2.5, np.pi/2]]) + self.initial_state = np.array( + [[-2.5, -2.5, 0.0], [-2.5, 2.5, 0.0], [-2.5, 0.0, 0.0], [2.5, -2.5, np.pi / 2]], + ) self.goal_pos = np.array([2.5, 2.5]) - self.rand_init = False # Random Initial State + self.rand_init = False self.reset() @@ -49,148 +59,141 @@ def __init__(self, obs_config='default'): self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 # Build Hazards - self.obs_config = obs_config self.hazards = [] - if obs_config == 'default': # default - self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([0., 0.])}) - self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([-1., 1.])}) - self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([-1., -1.])}) - self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([1., -1.])}) - self.hazards.append({'type': 'circle', 
'radius': 0.6, 'location': 1.5*np.array([1., 1.])}) - elif obs_config == 'test': - # self.build_hazards(obs_config) - self.hazards.append({'type': 'polygon', 'vertices': 0.6*np.array([[-1., -1.], [1., -1], [1., 1.], [-1., 1.]])}) - self.hazards[-1]['vertices'][:, 0] += 0.5 - self.hazards[-1]['vertices'][:, 1] -= 0.5 - self.hazards.append({'type': 'circle', 'radius': 0.6, 'location': 1.5*np.array([1., 1.])}) - self.hazards.append( - {'type': 'polygon', 'vertices': np.array([[0.9, 0.9], [2.1, 2.1], [2.1, 0.9]])}) - else: - n_hazards = 6 - hazard_radius = 0.6 - self.get_random_hazard_locations(n_hazards, hazard_radius) + + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([0.0, 0.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, 1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, -1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, -1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, 1.0])}, + ) # Viewer self.viewer = None - - def step(self, action): - """Organize the observation to understand what's going on - - Parameters - ---------- - action : ndarray - Action that the agent takes in the environment - - Returns - ------- - new_obs : ndarray - The new observation with the following structure: - [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal] - + def step( + self, + action: np.ndarray, + ) -> tuple[np.ndarray, float, float, bool, bool, dict[str, Any]]: + """ + Advance the environment state based on the action taken by the agent. + + Parameters: + action(np.ndarray): Control action taken by the agent. + + Returns: + A tuple containing: + - new_obs : np.ndarray, the new observation structured as [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal]. 
+ - reward : float, reward received after taking the action. + - cost : float, cost incurred after taking the action. + - terminated : bool, whether the episode has terminated. + - truncated : bool, whether the episode was truncated. + - info : dict, additional information about the environment's state. """ - action = np.clip(action, -1.0, 1.0) state, reward, cost, terminated, truncated, info = self._step(action) return self.get_obs(), reward, cost, terminated, truncated, info - def _step(self, action): + def _step(self, action: np.ndarray) -> tuple: """ - - Parameters - ---------- - action - - Returns - ------- - state : ndarray - New internal state of the agent. - reward : float - Reward collected during this transition. - terminated : bool - Whether the episode terminated. - info : dict - Additional info relevant to the environment. + Update the internal state based on the action, considering dynamics and disturbances. + + Parameters: + action(np.ndarray): Control action taken by the agent. + + Returns: + A tuple containing: + - state : np.ndarray, new internal state of the agent. + - reward : float, reward collected during this transition. + - cost : float, cost incurred during this transition. + - terminated : bool, whether the episode has terminated. + - truncated : bool, whether the episode was truncated due to reaching a step limit. + - info : dict, additional information relevant to the environment. 
""" - - # Start with our prior for continuous time system x' = f(x) + g(x)u self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) - self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) #* np.random.multivariate_normal(self.disturb_mean, self.disturb_covar, 1).squeeze() + self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) self.episode_step += 1 - info = dict() - dist_goal = self._goal_dist() - reward = (self.last_goal_dist - dist_goal) # -1e-3 * dist_goal + reward = self.last_goal_dist - dist_goal self.last_goal_dist = dist_goal - # Check if goal is met terminated = False if self.goal_met(): - info['goal_met'] = True reward += self.reward_goal terminated = True truncated = self.episode_step >= self.max_episode_steps - # Include constraint cost in reward (only during training, i.e. obs_config=='default') - if self.obs_config == 'default': - info['cost'] = 0 - for hazard in self.hazards: - if hazard['type'] == 'circle': # They should all be circles if 'default' - info['cost'] += 0.1 * (np.sum((self.state[:2] - hazard['location']) ** 2) < hazard['radius'] ** 2) - return self.state, reward, info['cost'], terminated, truncated, info - - def goal_met(self): - """Return true if the current goal is met this step + cost = 0.0 + for hazard in self.hazards: + if hazard['type'] == 'circle': + cost += 0.1 * ( + np.sum((self.state[:2] - hazard['location']) ** 2) < hazard['radius'] ** 2 + ) - Returns - ------- - goal_met : bool - True if the goal condition is met. + return self.state, reward, cost, terminated, truncated, {} + def goal_met(self) -> bool: """ + Check if the current goal has been met in this step. + Returns: + True if the agent has reached the goal, False otherwise. + """ return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size - def reset(self, seed=None, options=None): - """ Reset the state of the environment to an initial state. 
- - Returns - ------- - observation : ndarray - Next observation. + def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: + """ + Reset the environment to an initial state. + + Parameters: + seed : int, optional + Seed for random number generator. + options : dict, optional + Additional options to customize the environment reset. + + Returns: + A tuple containing: + - observation : np.ndarray, the first observation after reset. + - info : dict, additional information about the reset state. """ - self.episode_step = 0 - # Re-initialize state if self.rand_init: self.state = np.copy(self.initial_state[np.random.randint(self.initial_state.shape[0])]) else: self.state = np.copy(self.initial_state[0]) - # Re-initialize last goal dist self.last_goal_dist = self._goal_dist() - return self.get_obs(), dict() + return self.get_obs(), {} - def render(self, mode='human', close=False): + def render(self, mode: str = 'human') -> np.ndarray: """Render the environment to the screen - Parameters - ---------- + Parameters:--- mode : str close : bool - Returns - ------- + Returns: """ if mode != 'human' and mode != 'rgb_array': rel_loc = self.goal_pos - self.state[:2] theta_error = np.arctan2(rel_loc[1], rel_loc[0]) - self.state[2] - print('Ep_step = {}, \tState = {}, \tDist2Goal = {}, alignment_error = {}'.format(self.episode_step, self.state, self._goal_dist(), theta_error)) + print( + f'Ep_step = {self.episode_step}, \tState = {self.state}, \tDist2Goal = {self._goal_dist()}, alignment_error = {theta_error}', + ) screen_width = 600 screen_height = 400 @@ -203,24 +206,52 @@ def render(self, mode='human', close=False): obstacles = [] for i in range(len(self.hazards)): if self.hazards[i]['type'] == 'circle': - obstacles.append(pyglet_rendering.make_circle(radius=to_pixel(self.hazards[i]['radius'], shift=0), filled=True)) - obs_trans = pyglet_rendering.Transform(translation=(to_pixel(self.hazards[i]['location'][0], shift=screen_width/2), 
to_pixel(self.hazards[i]['location'][1], shift=screen_height/2))) + obstacles.append( + pyglet_rendering.make_circle( + radius=to_pixel(self.hazards[i]['radius'], shift=0), + filled=True, + ), + ) + obs_trans = pyglet_rendering.Transform( + translation=( + to_pixel(self.hazards[i]['location'][0], shift=screen_width / 2), + to_pixel(self.hazards[i]['location'][1], shift=screen_height / 2), + ), + ) obstacles[i].set_color(1.0, 0.0, 0.0) obstacles[i].add_attr(obs_trans) elif self.hazards[i]['type'] == 'polygon': - obstacles.append(pyglet_rendering.make_polygon(to_pixel(self.hazards[i]['vertices'], shift=[screen_width/2, screen_height/2]), filled=True)) + obstacles.append( + pyglet_rendering.make_polygon( + to_pixel( + self.hazards[i]['vertices'], + shift=[screen_width / 2, screen_height / 2], + ), + filled=True, + ), + ) self.viewer.add_geom(obstacles[i]) # Make Goal goal = pyglet_rendering.make_circle(radius=to_pixel(0.1, shift=0), filled=True) - goal_trans = pyglet_rendering.Transform(translation=(to_pixel(self.goal_pos[0], shift=screen_width/2), to_pixel(self.goal_pos[1], shift=screen_height/2))) + goal_trans = pyglet_rendering.Transform( + translation=( + to_pixel(self.goal_pos[0], shift=screen_width / 2), + to_pixel(self.goal_pos[1], shift=screen_height / 2), + ), + ) goal.add_attr(goal_trans) goal.set_color(0.0, 0.5, 0.0) self.viewer.add_geom(goal) # Make Robot self.robot = pyglet_rendering.make_circle(radius=to_pixel(0.1), filled=True) - self.robot_trans = pyglet_rendering.Transform(translation=(to_pixel(self.state[0], shift=screen_width/2), to_pixel(self.state[1], shift=screen_height/2))) + self.robot_trans = pyglet_rendering.Transform( + translation=( + to_pixel(self.state[0], shift=screen_width / 2), + to_pixel(self.state[1], shift=screen_height / 2), + ), + ) self.robot_trans.set_rotation(self.state[2]) self.robot.add_attr(self.robot_trans) self.robot.set_color(0.5, 0.5, 0.8) @@ -234,20 +265,18 @@ def render(self, mode='human', close=False): if self.state 
is None: return None - self.robot_trans.set_translation(to_pixel(self.state[0], shift=screen_width/2), to_pixel(self.state[1], shift=screen_height/2)) + self.robot_trans.set_translation( + to_pixel(self.state[0], shift=screen_width / 2), + to_pixel(self.state[1], shift=screen_height / 2), + ) self.robot_trans.set_rotation(self.state[2]) - return self.viewer.render(return_rgb_array=mode == "rgb_array") - - def get_obs(self): - """Given the state, this function returns it to an observation akin to the one obtained by calling env.step + return self.viewer.render(return_rgb_array=mode == 'rgb_array') - Parameters - ---------- + def get_obs(self) -> np.ndarray: + """Given the state, this function returns corresponding observation. - Returns - ------- - observation : ndarray + Returns: Observation: [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, exp(-dist2goal)] """ @@ -255,36 +284,43 @@ def get_obs(self): goal_dist = np.linalg.norm(rel_loc) goal_compass = self.obs_compass() # compass to the goal - return np.array([self.state[0], self.state[1], np.cos(self.state[2]), np.sin(self.state[2]), goal_compass[0], goal_compass[1], np.exp(-goal_dist)]) - - def _get_dynamics(self): - """Get affine CBFs for a given environment. - - Parameters - ---------- - - Returns - ------- - get_f : callable - Drift dynamics of the continuous system x' = f(x) + g(x)u - get_g : callable - Control dynamics of the continuous system x' = f(x) + g(x)u + return np.array( + [ + self.state[0], + self.state[1], + np.cos(self.state[2]), + np.sin(self.state[2]), + goal_compass[0], + goal_compass[1], + np.exp(-goal_dist), + ], + ) + + def _get_dynamics(self) -> tuple[Callable, Callable]: + """Get affine Control Barrier Function (CBF) dynamics for a given environment. + + This method provides access to the system's drift and control dynamics, formulated for continuous systems of the form x' = f(x) + g(x)u, where 'x' is the state vector and 'u' is the control vector. 
+ + Returns: + get_f : Callable[[np.ndarray], np.ndarray] + Function to compute the drift dynamics 'f(x)' of the system. + + get_g : Callable[[np.ndarray], np.ndarray] + Function to compute the control dynamics 'g(x)' of the system. """ - def get_f(state): - f_x = np.zeros(state.shape) - return f_x + def get_f(state: np.ndarray) -> np.ndarray: + """Function to compute the drift dynamics 'f(x)' of the system.""" + return np.zeros(state.shape) - def get_g(state): + def get_g(state: np.ndarray) -> np.ndarray: + """Function to compute the control dynamics 'g(x)' of the system.""" theta = state[2] - g_x = np.array([[np.cos(theta), 0], - [np.sin(theta), 0], - [ 0, 1.0]]) - return g_x + return np.array([[np.cos(theta), 0], [np.sin(theta), 0], [0, 1.0]]) return get_f, get_g - def obs_compass(self): + def obs_compass(self) -> np.ndarray: """ Return a robot-centric compass observation of a list of positions. Compass is a normalized (unit-lenght) egocentric XY vector, @@ -297,33 +333,36 @@ def obs_compass(self): # Get ego vector in world frame vec = self.goal_pos - self.state[:2] # Rotate into frame - R = np.array([[np.cos(self.state[2]), -np.sin(self.state[2])], [np.sin(self.state[2]), np.cos(self.state[2])]]) + R = np.array( + [ + [np.cos(self.state[2]), -np.sin(self.state[2])], + [np.sin(self.state[2]), np.cos(self.state[2])], + ], + ) vec = np.matmul(vec, R) # Normalize vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 return vec - def _goal_dist(self): + def _goal_dist(self) -> np.ndarray: return np.linalg.norm(self.goal_pos - self.state[:2]) - def close(self): + def close(self) -> None: if self.viewer: self.viewer.close() self.viewer = None - def get_random_hazard_locations(self, n_hazards: int, hazard_radius: float): + def get_random_hazard_locations(self, n_hazards: int, hazard_radius: float) -> None: """ - Parameters - ---------- + Parameters:--- n_hazards : int Number of hazards to create hazard_radius : float Radius of hazards - Returns - ------- - hazards_locs : 
ndarray + Returns: + hazards_locs : np.ndarray Numpy array of shape (n_hazards, 2) containing xy locations of hazards. """ @@ -335,28 +374,64 @@ def get_random_hazard_locations(self, n_hazards: int, hazard_radius: float): hazards = [] hazards_centers = np.zeros((n_hazards, 2)) n = 0 # Number of hazards actually placed - for i in range(n_hazards): + for _ in range(n_hazards): successfully_placed = False - iter = 0 + iteration = 0 hazard_type = np.random.randint(3) # 0-> Circle 1->Square 2->Triangle - radius = hazard_radius * (1-0.2*2.0*(np.random.random() - 0.5)) - while not successfully_placed and iter < 100: - hazards_centers[n] = (buffered_bds[1] - buffered_bds[0]) * np.random.random(2) + buffered_bds[0] - successfully_placed = np.all(np.linalg.norm(hazards_centers[:n] - hazards_centers[[n]], axis=1) > 3.5*hazard_radius) - successfully_placed = np.logical_and(successfully_placed, np.linalg.norm(self.goal_pos - hazards_centers[n]) > 2.0*hazard_radius) - successfully_placed = np.logical_and(successfully_placed, np.all(np.linalg.norm(self.initial_state[:, :2] - hazards_centers[[n]], axis=1) > 2.0*hazard_radius)) - iter += 1 + radius = hazard_radius * (1 - 0.2 * 2.0 * (np.random.random() - 0.5)) + while not successfully_placed and iteration < 100: + hazards_centers[n] = (buffered_bds[1] - buffered_bds[0]) * np.random.random( + 2, + ) + buffered_bds[0] + successfully_placed = np.all( + np.linalg.norm(hazards_centers[:n] - hazards_centers[[n]], axis=1) + > 3.5 * hazard_radius, + ) + successfully_placed = np.logical_and( + successfully_placed, + np.linalg.norm(self.goal_pos - hazards_centers[n]) > 2.0 * hazard_radius, + ) + successfully_placed = np.logical_and( + successfully_placed, + np.all( + np.linalg.norm(self.initial_state[:, :2] - hazards_centers[[n]], axis=1) + > 2.0 * hazard_radius, + ), + ) + iteration += 1 if not successfully_placed: continue if hazard_type == 0: # Circle hazards.append({'type': 'circle', 'location': hazards_centers[n], 'radius': radius}) 
elif hazard_type == 1: # Square - hazards.append({'type': 'polygon', 'vertices': np.array( - [[-radius, -radius], [-radius, radius], [radius, radius], [radius, -radius]])}) + hazards.append( + { + 'type': 'polygon', + 'vertices': np.array( + [ + [-radius, -radius], + [-radius, radius], + [radius, radius], + [radius, -radius], + ], + ), + }, + ) hazards[-1]['vertices'] += hazards_centers[n] else: # Triangle - hazards.append({'type': 'polygon', 'vertices': np.array( - [[-radius, -radius], [-radius, radius], [radius, radius], [radius, -radius]])}) + hazards.append( + { + 'type': 'polygon', + 'vertices': np.array( + [ + [-radius, -radius], + [-radius, radius], + [radius, radius], + [radius, -radius], + ], + ), + }, + ) # Pick a vertex and delete it idx = np.random.randint(4) hazards[-1]['vertices'] = np.delete(hazards[-1]['vertices'], idx, axis=0) diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index 8732d6e34..2f17f852b 100644 --- a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -301,6 +301,39 @@ def __load_model_and_env( ) self._actor = actor_builder.build_actor(actor_type) self._actor.load_state_dict(model_params['pi']) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + from omnisafe.common.barrier_comp import BarrierCompensator + + self.compensator = BarrierCompensator( + obs_dim=observation_space.shape[0], + act_dim=action_space.shape[0], + cfgs=self._cfgs['compensator_cfgs'], + ) + model_path = os.path.join(save_dir, 'torch_save', model_name) + try: + model_params = torch.load(model_path) + except FileNotFoundError as error: + raise FileNotFoundError( + 'The model is not found in the save directory.', + ) from error + self.compensator.load_state_dict(model_params['compensator']) + if self._cfgs['algo'] == 'SACRCBF': + from omnisafe.common.robust_barrier_solver import CBFQPLayer + from omnisafe.common.robust_gp_model import DynamicsModel + + epoch = model_name.split('.pt')[0].split('-')[-1] + self.solver = CBFQPLayer( + 
env=self._env, + device=self._cfgs['train_cfgs']['device'], + gamma_b=self._cfgs['cbf_cfgs']['gamma_b'], + k_d=self._cfgs['cbf_cfgs']['k_d'], + l_p=self._cfgs['cbf_cfgs']['l_p'], + ) + self.dynamics_model = DynamicsModel(env=self._env) + self.dynamics_model.load_disturbance_models( + save_dir=os.path.join(self._save_dir, 'gp_model_save'), + epoch=epoch, + ) if self._cfgs['algo'] in ['CRABS']: self._init_crabs(model_params) @@ -377,11 +410,23 @@ def load_saved( # load the config self._save_dir = save_dir self._model_name = model_name + epoch = model_name.split('.pt')[0].split('-')[-1] self.__load_cfgs(save_dir) self.__set_render_mode(render_mode) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + from omnisafe.common.barrier_solver import PendulumSolver + + self.solver = PendulumSolver() + path = os.path.join( + save_dir, + 'gp_model_save', + f'gaussian_process_regressor_{epoch}.pkl', + ) + self.solver.build_gp_model(save_dir=path) + env_kwargs = { 'env_id': self._cfgs['env_id'], 'num_envs': 1, @@ -452,6 +497,32 @@ def evaluate( raise ValueError( 'The policy must be provided or created before evaluating the agent.', ) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + approx_compensating_act = self.compensator(obs=obs) + compensated_act_mean_raw = act + approx_compensating_act + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) + compensating_act = self.solver.control_barrier( + compensated_act_mean_raw, + f, + g, + x, + std, + ) + act = compensated_act_mean_raw + compensating_act + + if self._cfgs['algo'] == 'SACRCBF': + state_batch = self.dynamics_model.get_state(obs) + mean_pred_batch, sigma_pred_batch = self.dynamics_model.predict_disturbance( + state_batch, + ) + safe_act = self.solver.get_safe_action( + state_batch, + act, + mean_pred_batch, + sigma_pred_batch, + ) + act = safe_act + obs, rew, cost, terminated, truncated, _ = self._env.step(act) if 'Saute' in self._cfgs['algo'] or 'Simmer' 
in self._cfgs['algo']: self._safety_obs -= cost.unsqueeze(-1) / self._safety_budget diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index 80c68e1be..75358134c 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -16,12 +16,12 @@ from __future__ import annotations +from omnisafe.models.actor.beta_learning_actor import BetaLearningActor from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor from omnisafe.models.actor.mlp_actor import MLPActor from omnisafe.models.actor.perturbation_actor import PerturbationActor from omnisafe.models.actor.vae_actor import VAE -from omnisafe.models.actor.beta_learning_actor import BetaLearningActor from omnisafe.models.base import Actor from omnisafe.typing import Activation, ActorType, InitFunction, OmnisafeSpace diff --git a/omnisafe/models/actor/beta_learning_actor.py b/omnisafe/models/actor/beta_learning_actor.py index 8f9675934..e0ee6b3e9 100644 --- a/omnisafe/models/actor/beta_learning_actor.py +++ b/omnisafe/models/actor/beta_learning_actor.py @@ -18,22 +18,19 @@ import torch import torch.nn as nn -import numpy as np +from torch.distributions import Beta, Distribution -from torch.distributions import Distribution, Beta - -from omnisafe.models.actor.gaussian_actor import GaussianActor +from omnisafe.models.base import Actor from omnisafe.typing import Activation, InitFunction, OmnisafeSpace from omnisafe.utils.model import build_mlp_network -from omnisafe.models.base import Actor # pylint: disable-next=too-many-instance-attributes class BetaLearningActor(Actor): - + """Initialize an instance of :class:`BetaLearningActor`.""" _current_dist: Beta - + def __init__( self, obs_space: OmnisafeSpace, @@ -42,30 +39,30 @@ def __init__( activation: Activation = 'relu', weight_initialization_mode: InitFunction = 'kaiming_uniform', ) -> None: - 
"""Initialize an instance of :class:`GaussianLearningActor`.""" + """Initialize an instance of :class:`BetaLearningActor`.""" super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode) - + self.mean: nn.Module = build_mlp_network( sizes=[self._obs_dim, self._hidden_sizes[0], self._hidden_sizes[0]], activation=activation, output_activation='tanh', weight_initialization_mode=weight_initialization_mode, ) - + self.alpha_net: nn.Module = build_mlp_network( sizes=[self._hidden_sizes[-1], self._act_dim], activation='identity', output_activation='softplus', weight_initialization_mode=weight_initialization_mode, ) - + self.beta_net: nn.Module = build_mlp_network( sizes=[self._hidden_sizes[-1], self._act_dim], activation='identity', output_activation='softplus', weight_initialization_mode=weight_initialization_mode, ) - + def _distribution(self, obs: torch.Tensor) -> Beta: """Get the distribution of the actor. @@ -80,8 +77,8 @@ def _distribution(self, obs: torch.Tensor) -> Beta: The normal distribution of the mean and standard deviation from the actor. 
""" mean = self.mean(obs) - alphas = 1.0+self.alpha_net(mean) - betas = 1.0+self.beta_net(mean) + alphas = 1.0 + self.alpha_net(mean) + betas = 1.0 + self.beta_net(mean) return Beta(alphas, betas) def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor: diff --git a/pyproject.toml b/pyproject.toml index a74b46723..350414746 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,11 @@ dependencies = [ "matplotlib >= 3.7.1", "gdown >= 4.6.0", "pytorch_lightning >= 2.2.2", + "cvxopt== 1.3.2", + "gpytorch== 1.11", + "joblib == 1.3.2", + "qpth == 0.0.16", + "scikit_learn == 1.3.2" ] dynamic = ["version", "entry-points"] diff --git a/requirements.txt b/requirements.txt index 0abf5e41a..03fec36c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,8 @@ seaborn >= 0.12.2 pandas >= 1.5.3 matplotlib >= 3.7.1 gdown >= 4.6.0 +cvxopt==1.3.2 +gpytorch==1.11 +joblib==1.3.2 +qpth==0.0.16 +scikit_learn==1.3.2 From 71ffe782ccb28768f3aeba93e7ed9c5a6bfcab29 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Fri, 3 May 2024 22:06:04 +0800 Subject: [PATCH 03/18] wip --- .pre-commit-config.yaml | 4 +- docs/source/spelling_wordlist.txt | 27 +++ omnisafe/adapter/barrier_function_adapter.py | 43 +++-- .../adapter/beta_barrier_function_adapter.py | 61 +++--- .../offpolicy_barrier_function_adapter.py | 84 +++++--- .../robust_barrier_function_adapter.py | 32 +++- omnisafe/algorithms/off_policy/ddpg.py | 2 +- omnisafe/algorithms/off_policy/ddpg_cbf.py | 26 +-- omnisafe/algorithms/off_policy/sac_rcbf.py | 6 +- .../on_policy/barrier_function/ppo_cbf.py | 10 +- .../on_policy/barrier_function/trpo_cbf.py | 36 +++- .../on_policy/base/policy_gradient.py | 22 ++- omnisafe/common/barrier_comp.py | 5 +- omnisafe/common/barrier_solver.py | 37 ++-- omnisafe/common/robust_barrier_solver.py | 126 +++++------- omnisafe/common/robust_gp_model.py | 86 +++++---- omnisafe/common/utils.py | 55 ------ omnisafe/configs/on-policy/TRPOCBF.yaml | 4 +- 
omnisafe/envs/barrier_function_env.py | 24 +-- omnisafe/envs/robust_barrier_function_env.py | 19 +- omnisafe/envs/unicycle_env.py | 180 +----------------- omnisafe/evaluator.py | 47 +++-- omnisafe/utils/tools.py | 37 ++++ 23 files changed, 446 insertions(+), 527 deletions(-) delete mode 100644 omnisafe/common/utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 99b01f43f..42e2956f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -117,5 +117,7 @@ repos: ^omnisafe/common/control_barrier_function/crabs/models.py$| ^omnisafe/common/control_barrier_function/crabs/optimizers.py$| ^omnisafe/common/control_barrier_function/crabs/utils.py$| - ^conftest.py$ + ^conftest.py$| + ^omnisafe/envs/unicycle_env.py| + ^setup.py$ ) diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 460cabd1a..46e297388 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -486,3 +486,30 @@ UpdateDynamics mathbb meger Jupyter +compensator +CBF +Vectorize +gp +optim +cvx +QP +gpytorch +ExactGP +RBF +parallelization +compensators +thetadot +VK +Sharma +Kosaraju +Seetharaman +Sadler +Suttle +Cheng +Orosz +JW +Burdick +Vipul +Sivaranjani +Vijay +suttle diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py index 735ff690e..a91218b48 100644 --- a/omnisafe/adapter/barrier_function_adapter.py +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -18,6 +18,7 @@ import torch from rich.progress import track +from sklearn.gaussian_process import GaussianProcessRegressor from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter from omnisafe.common.barrier_comp import BarrierCompensator @@ -46,8 +47,8 @@ class BarrierFunctionAdapter(OnPolicyAdapter): def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: """Initialize an instance of :class:`BarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) - 
self.solver = None - self.compensator = None + self.solver: PendulumSolver + self.compensator: BarrierCompensator self.first_iter = 1 def _wrapper( @@ -85,16 +86,15 @@ def _wrapper( def set_solver(self, solver: PendulumSolver) -> None: """Set the barrier function solver for Pendulum environment.""" - self.solver: PendulumSolver = solver + self.solver = solver def set_compensator(self, compensator: BarrierCompensator) -> None: """Set the action compensator.""" - self.compensator: BarrierCompensator = compensator + self.compensator = compensator def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" - self.solver.GP_model_prev = self.solver.GP_model.copy() - self.solver.build_GP_model() + self.solver.reset_gp_model() def rollout( # pylint: disable=too-many-locals self, @@ -103,7 +103,7 @@ def rollout( # pylint: disable=too-many-locals buffer: VectorOnPolicyBuffer, logger: Logger, ) -> None: - """Rollout the environment and store the data in the buffer. + """Rollout the environment with barrier function controller. Args: steps_per_epoch (int): Number of steps per epoch. 
@@ -117,8 +117,6 @@ def rollout( # pylint: disable=too-many-locals self.reset_gp_model() obs, _ = self.reset() - while abs(self._env.unwrapped.state[0]) > 1: - obs, _ = self._env.reset() path_obs = [] path_act = [] for step in track( @@ -135,9 +133,9 @@ def rollout( # pylint: disable=too-many-locals compensated_act_mean_raw = act_mean + approx_compensating_act if self.first_iter: - [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model=False) + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) else: - [f, g, x, std] = self.solver.get_GP_dynamics(obs, use_prev_model=True) + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=True) compensating_act = self.solver.control_barrier( compensated_act_mean_raw, @@ -150,16 +148,15 @@ def rollout( # pylint: disable=too-many-locals compensated_act_mean = compensated_act_mean_raw + compensating_act final_act = torch.normal(compensated_act_mean, act_std) - logp = agent.actor.log_prob(final_act).detach() - path_obs.append(obs.detach().cpu().squeeze().numpy()) - path_act.append(final_act.detach().cpu().squeeze().numpy()) + logp = agent.actor.log_prob(final_act) + + path_obs.append(obs) + path_act.append(final_act) next_obs, reward, cost, terminated, truncated, info = self.step(final_act) self._log_value(reward=reward, cost=cost, info=info) - if self._cfgs.algo_cfgs.use_cost: - logger.store({'Value/cost': value_c}) logger.store({'Value/reward': value_r}) logger.store({'Metrics/angle': cost}) @@ -202,13 +199,19 @@ def rollout( # pylint: disable=too-many-locals self._ep_cost[idx] = 0.0 self._ep_len[idx] = 0.0 - if step < 650: - self.solver.update_GP_dynamics(obs=path_obs, act=path_act) + if step < self._cfgs.algo_cfgs.update_dynamics_steps: + self.solver.update_gp_dynamics( + obs=torch.cat(path_obs), # type: ignore + act=torch.cat(path_act), # type: ignore + ) path_obs = [] path_act = [] obs, _ = self.reset() - while abs(self._env.unwrapped.state[0]) > 1: - obs, _ = self._env.reset() 
buffer.finish_path(last_value_r, last_value_c, idx) self.first_iter = 0 + + @property + def gp_models(self) -> list[GaussianProcessRegressor]: + """Return the gp models to be saved.""" + return self.solver.gp_models diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index ee8ccc298..844c0b4ce 100644 --- a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""BarrierFunction Adapter for OmniSafe.""" +"""Barrier Function Adapter for OmniSafe.""" from __future__ import annotations +from typing import Callable + import numpy as np import torch from rich.progress import track @@ -28,17 +30,28 @@ from omnisafe.utils.config import Config -def cbf(state: np.ndarray | None = None, eta: float = 0.99) -> tuple[np.ndarray, np.ndarray]: - """ - Calculates CBF constraint set at a given state. Default is - the current state. +# pylint: disable-next=too-many-locals +def cbf(state: np.ndarray, eta: float = 0.99) -> tuple[np.ndarray, np.ndarray]: + """Calculates the Control Barrier Function (CBF) constraints. + + Args: + state (np.ndarray): A numpy array containing the pendulum's current angular position + (theta) and angular velocity (thetadot). + eta (float): A scaling factor used to adjust the safety bounds. + + Returns: + tuple containing two elements: 1. The minimum control torque that keeps the pendulum within + the safety bounds. 2. The maximum control torque that keeps the pendulum within the safety + bounds. + + Raises: + ValueError: If the `eta` value is not within the open interval (0, 1).
""" g = 9.8 m = 1 length = 1 tau = 5e-2 theta_safety_bounds = [-1.0, 1.0] - thetadot_safety_bounds = [-np.inf, np.inf] torque_bounds = [-15.0, 15.0] if (eta > 1 - 1e-3) or (eta < 1e-5): raise ValueError('eta should be inside (0, 1)') @@ -47,7 +60,7 @@ def cbf(state: np.ndarray | None = None, eta: float = 0.99) -> tuple[np.ndarray, theta, thetadot = state[0], state[1] theta_min, theta_max = theta_safety_bounds[0], theta_safety_bounds[1] - thetadot_min, thetadot_max = thetadot_safety_bounds[0], thetadot_safety_bounds[1] + thetadot_min, thetadot_max = -np.inf, np.inf u_min1 = (1 / c2) * ( ((1 / (tau**2)) * (-eta * (theta - theta_min) - tau * thetadot)) - c1 * np.sin(theta) ) @@ -61,27 +74,21 @@ def cbf(state: np.ndarray | None = None, eta: float = 0.99) -> tuple[np.ndarray, u_min = max(u_min1, u_min2, torque_bounds[0]) u_max = min(u_max1, u_max2, torque_bounds[1]) - u_min = torque_bounds[0] - u_max = torque_bounds[1] + return (u_min, u_max) - return [u_min, u_max] - -def vectorize_f(f: callable) -> callable: - """Converts a function `f` that operates on 1D numpy arrays and outputs pairs of scalars, - into a vectorized function that accepts batches of torch tensorized arrays and outputs - pairs of torch tensors. +def vectorize_f(f: Callable) -> Callable: + """Vectorize the function. Args: - f (callable): A function that accepts 1D numpy arrays and returns a tuple (lower_bound, upper_bound), where both are scalars. + f (callable): A function that accepts 1D numpy arrays and returns a tuple (lower_bound, upper_bound). Returns: callable: A vectorized function that can process batches of torch tensors and return pairs of torch tensors. """ def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """ - Inner function to process the torch tensor batch. + """Inner function to process the torch tensor batch. Args: obs (torch.Tensor): A batch of observations as torch tensors. 
@@ -94,13 +101,13 @@ def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if len(obs.shape) == 1: batch_size = 1 lbs, ubs = f(obs) - lbs = np.array(lbs) - ubs = np.array(ubs) + lbs = torch.as_tensor(lbs) + ubs = torch.as_tensor(ubs) else: batch_size = obs.shape[0] - lbs = np.zeros([batch_size, 1]) - ubs = np.zeros([batch_size, 1]) + lbs = torch.zeros([batch_size, 1]) + ubs = torch.zeros([batch_size, 1]) for i in range(batch_size): lbs[i], ubs[i] = f(obs[i]) @@ -129,10 +136,7 @@ class BetaBarrierFunctionAdapter(OnPolicyAdapter): def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: """Initialize an instance of :class:`BarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) - self.solver = None - self.compensator = None - self.first_iter = 1 - self.constraint_fn = vectorize_f(cbf) + self.constraint_fn: Callable = vectorize_f(cbf) def _wrapper( self, @@ -183,8 +187,6 @@ def rollout( # pylint: disable=too-many-locals """ self._reset_log() obs, _ = self.reset() - while abs(self._env.unwrapped.state[0]) > 1: - obs, _ = self._env.reset() for step in track( range(steps_per_epoch), description=f'Processing rollout for epoch: {logger.current_epoch}...', @@ -240,7 +242,4 @@ def rollout( # pylint: disable=too-many-locals self._ep_cost[idx] = 0.0 self._ep_len[idx] = 0.0 obs, _ = self.reset() - while abs(self._env.unwrapped.state[0]) > 1: - obs, _ = self._env.reset() buffer.finish_path(last_value_r, last_value_c, idx) - self.first_iter = 0 diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py index e1353884b..49bf7909c 100644 --- a/omnisafe/adapter/offpolicy_barrier_function_adapter.py +++ b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""BarrierFunction Adapter for OmniSafe.""" +"""BarrierFunction OffPolicy Adapter for OmniSafe.""" from __future__ import annotations +from typing import Any + import torch from sklearn.gaussian_process import GaussianProcessRegressor @@ -30,14 +32,24 @@ class OffPolicyBarrierFunctionAdapter(OffPolicyAdapter): + """OffPolicy Barrier Function Adapter for OmniSafe. + + :class:`OffPolicyBarrierFunctionAdapter` is used to adapt the environment with CBF controller. + + Args: + env_id (str): The environment id. + num_envs (int): The number of environments. + seed (int): The random seed. + cfgs (Config): The configuration. + """ def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: """Initialize an instance of :class:`BarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) - self.solver = None - self.compensator = None - self.first_iter = 1 - self.episode_rollout = {} + self.solver: PendulumSolver + self.compensator: BarrierCompensator + self.first_iter: int = 1 + self.episode_rollout: dict[str, Any] = {} self.episode_rollout['obs'] = [] self.episode_rollout['final_act'] = [] self.episode_rollout['approx_compensating_act'] = [] @@ -100,16 +112,15 @@ def eval_policy( # pylint: disable=too-many-locals def set_solver(self, solver: PendulumSolver) -> None: """Set the barrier function solver for Pendulum environment.""" - self.solver: PendulumSolver = solver + self.solver = solver def set_compensator(self, compensator: BarrierCompensator) -> None: """Set the action compensator.""" - self.compensator: BarrierCompensator = compensator + self.compensator = compensator def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" - self.solver.gp_model_prev = self.solver.gp_model.copy() - self.solver.build_gp_model() + self.solver.reset_gp_model() def rollout( # pylint: disable=too-many-locals self, @@ -119,13 
+130,23 @@ def rollout( # pylint: disable=too-many-locals logger: Logger, use_rand_action: bool, ) -> None: + """Rollout in off-policy manner with barrier function controller. + + Args: + rollout_step (int): Number of rollout steps. + agent (ConstraintActorCritic): Constraint actor-critic, including actor, reward critic, + and cost critic. + buffer (VectorOnPolicyBuffer): Vector on-policy buffer. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. + use_rand_action (bool): Whether to use random action. + """ for _ in range(rollout_step): if use_rand_action: - act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device) + act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device) # type: ignore else: act = agent.actor.predict(self._current_obs, deterministic=False) - final_act = self.get_safe_action(obs=self._current_obs, act=act) + final_act = self.get_safe_action(self._current_obs, act) self.episode_rollout['obs'].append(self._current_obs) self.episode_rollout['final_act'].append(final_act) @@ -146,15 +167,15 @@ def rollout( # pylint: disable=too-many-locals for idx, done in enumerate(torch.logical_or(terminated, truncated)): if done: self._log_metrics(logger, idx) - compensator_loss = self.compensator.train( + compensator_loss = self.compensator.update( torch.cat(self.episode_rollout['obs']), torch.cat(self.episode_rollout['approx_compensating_act']), torch.cat(self.episode_rollout['compensating_act']), ) logger.store({'Value/Loss_compensator': compensator_loss.item()}) self.solver.update_gp_dynamics( - obs=torch.cat(self.episode_rollout['obs']), - act=torch.cat(self.episode_rollout['final_act']), + obs=torch.cat(self.episode_rollout['obs']), # type: ignore + act=torch.cat(self.episode_rollout['final_act']), # type: ignore ) self.episode_rollout['obs'] = [] @@ -168,30 +189,41 @@ def rollout( # pylint: disable=too-many-locals if not self.first_iter: self.reset_gp_model() - @torch.no_grad def 
get_safe_action( self, obs: torch.Tensor, act: torch.Tensor, is_eval: bool = False, ) -> torch.Tensor: - approx_compensating_act = self.compensator(obs=self._current_obs) - compensated_act_mean_raw = act + approx_compensating_act + """Computes a safe action by applying compensatory actions. + + Args: + obs (torch.Tensor): The current observation from the environment. + act (torch.Tensor): The proposed action to be evaluated for safety. + is_eval (bool, optional): A flag to indicate whether this is an evaluation phase, defaulting to False. - if self.first_iter: - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) - else: - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=True) + Returns: + torch.Tensor: The safe action to be executed in the environment. + """ + with torch.no_grad(): + approx_compensating_act = self.compensator(obs=self._current_obs) + compensated_act_mean_raw = act + approx_compensating_act + + if self.first_iter: + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) + else: + [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=True) - compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) - safe_act = compensated_act_mean_raw + compensating_act + compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) + safe_act = compensated_act_mean_raw + compensating_act - if not is_eval: - self.episode_rollout['compensating_act'].append(compensating_act) - self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) + if not is_eval: + self.episode_rollout['compensating_act'].append(compensating_act) + self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) return safe_act @property def gp_models(self) -> list[GaussianProcessRegressor]: + """Return the gp models to be saved.""" return self.solver.gp_models diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py 
b/omnisafe/adapter/robust_barrier_function_adapter.py index 843676c7f..f56674319 100644 --- a/omnisafe/adapter/robust_barrier_function_adapter.py +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -32,12 +32,22 @@ class RobustBarrierFunctionAdapter(OffPolicyAdapter): + """Off Policy Robust Barrier Function Adapter for OmniSafe. + + :class:`RobustBarrierFunctionAdapter` is used to adapt the environment with RCBF controller. + + Args: + env_id (str): The environment id. + num_envs (int): The number of environments. + seed (int): The random seed. + cfgs (Config): The configuration. + """ def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: """Initialize an instance of :class:`BarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) - self.solver = None - self.compensator = None + self.solver: CBFQPLayer + self.dynamics_model: DynamicsModel self._current_steps = 0 self._num_episodes = 0 @@ -70,13 +80,13 @@ def _wrapper( def set_solver(self, solver: CBFQPLayer) -> None: """Set the barrier function solver for Pendulum environment.""" - self.solver: CBFQPLayer = solver - self.solver.env = self._env + self.solver = solver + self.solver.env = self._env # type: ignore def set_dynamics_model(self, dynamics_model: DynamicsModel) -> None: """Set the dynamics model.""" self.dynamics_model = dynamics_model - self.dynamics_model.env = self._env + self.dynamics_model.env = self._env # type: ignore def eval_policy( # pylint: disable=too-many-locals self, @@ -143,7 +153,7 @@ def rollout( # pylint: disable=too-many-locals state = self.dynamics_model.get_state(self._current_obs) self._current_steps += 1 if use_rand_action: - act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device) + act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device) # type: ignore else: act = agent.step(self._current_obs, deterministic=False) @@ -182,12 +192,21 @@ def rollout( # pylint: 
disable=too-many-locals @property def safe_action_space(self) -> OmnisafeSpace: + """Return the action space in the safe domain.""" if hasattr(self._env, 'safe_action_space'): return self._env.safe_action_space return self._env.action_space def get_safe_action(self, obs: torch.Tensor, act: torch.Tensor) -> torch.Tensor: + """Computes a safe action by applying robust barrier function. + Args: + obs (torch.Tensor): The current observation from the environment. + act (torch.Tensor): The proposed action to be evaluated for safety. + + Returns: + torch.Tensor: The safe action to be executed in the environment. + """ state_batch = self.dynamics_model.get_state(obs) mean_pred_batch, sigma_pred_batch = self.dynamics_model.predict_disturbance(state_batch) @@ -199,4 +218,5 @@ def get_safe_action(self, obs: torch.Tensor, act: torch.Tensor) -> torch.Tensor: ) def __getattr__(self, name: str) -> Any: + """Return the unwrapped environment attributes.""" return getattr(self._env, name) diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index f0c633220..2d6bad948 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -558,7 +558,7 @@ def _log_when_not_update(self) -> None: }, ) - def _log_what_to_save(self) -> dict[str, Any]: + def _log_what_to_save(self) -> None: """Define what need to be saved below.""" what_to_save: dict[str, Any] = {} diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py index ad1306d5b..32b27be1d 100644 --- a/omnisafe/algorithms/off_policy/ddpg_cbf.py +++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py @@ -13,6 +13,7 @@ # limitations under the License. 
# ============================================================================== """Implementation of the DDPG algorithm with Control Barrier Function.""" +# mypy: ignore-errors from __future__ import annotations @@ -34,12 +35,13 @@ @registry.register # pylint: disable-next=too-many-instance-attributes, too-few-public-methods class DDPGCBF(DDPG): - """The Soft Actor-Critic algorithm with Control Barrier Function. + """The DDPG algorithm with CBF. References: - - Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor - - Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. - - URL: `DDPG `_ + - Title: End-to-end safe reinforcement learning through barrier functions for + safety-critical continuous control tasks + - Authors: R Cheng, G Orosz, RM Murray, JW Burdick. + - URL: `DDPGCBF `_ """ def _init_env(self) -> None: @@ -95,14 +97,14 @@ def _init(self) -> None: ) def _init_log(self) -> None: - # """Log the DDPGRCBF specific information. - - # +----------------------------+--------------------------+ - # | Things to log | Description | - # +============================+==========================+ - # | Metrics/LagrangeMultiplier | The Lagrange multiplier. | - # +----------------------------+--------------------------+ - # """ + """Log the DDPGCBF specific information. + + +----------------------------+---------------------------------+ + | Things to log | Description | + +============================+=================================+ + | Value/Loss_compensator | The Loss of action compensator. 
| + +----------------------------+---------------------------------+ + """ super()._init_log() self._logger.register_key('Value/Loss_compensator') diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py index 9fbd20a39..1e9547369 100644 --- a/omnisafe/algorithms/off_policy/sac_rcbf.py +++ b/omnisafe/algorithms/off_policy/sac_rcbf.py @@ -13,7 +13,7 @@ # limitations under the License. # ============================================================================== """Implementation of the Soft Actor-Critic algorithm with Robust Control Barrier Function.""" - +# mypy: ignore-errors from __future__ import annotations @@ -175,8 +175,8 @@ def _specific_save(self) -> None: train_y = self._env.dynamics_model.train_y disturb_estimators = self._env.dynamics_model.disturb_estimators weights = [] - for i in range(len(disturb_estimators)): - weights.append(disturb_estimators[i].model.state_dict()) + for disturb_estimator in disturb_estimators: + weights.append(disturb_estimator.model.state_dict()) torch.save(weights, os.path.join(path, f'gp_models_{self._logger.current_epoch}.pkl')) torch.save( train_x, diff --git a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py index 24b27d939..b77c36c76 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of the PPO algorithm with Control Barrier Function.""" +"""Implementation of the PPO algorithm with Control Barrier Function and Beta Actor.""" from __future__ import annotations @@ -26,6 +26,14 @@ @registry.register class PPOBetaCBF(PPO): + """The PPO algorithm with CBF and Beta Actor. 
+ + References: + - Title: Sampling-based Safe Reinforcement Learning for Nonlinear Dynamical Systems + - Authors: Wesley A. Suttle, Vipul K. Sharma, Krishna C. Kosaraju, S. Sivaranjani, Ji Liu, + Vijay Gupta, Brian M. Sadler. + - URL: `PPOBetaCBF `_ + """ def _init_log(self) -> None: super()._init_log() diff --git a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py index 3fceec4f7..72238e41a 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py @@ -13,9 +13,13 @@ # limitations under the License. # ============================================================================== """Implementation of the TRPO algorithm with Control Barrier Function.""" +# mypy: ignore-errors from __future__ import annotations +import os + +import joblib import torch from torch.utils.data import DataLoader, TensorDataset @@ -25,14 +29,30 @@ from omnisafe.common.barrier_comp import BarrierCompensator from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.utils import distributed +from omnisafe.utils.distributed import get_rank @registry.register class TRPOCBF(TRPO): + """The TRPO algorithm with CBF. + + References: + - Title: End-to-end safe reinforcement learning through barrier functions for + safety-critical continuous control tasks + - Authors: R Cheng, G Orosz, RM Murray, JW Burdick. + - URL: `TRPOCBF `_ + """ def _init_log(self) -> None: + """Log the TRPOCBF specific information. + + +----------------------------+---------------------------------+ + | Things to log | Description | + +============================+=================================+ + | Value/Loss_compensator | The Loss of action compensator. 
| + +----------------------------+---------------------------------+ + """ super()._init_log() - self._logger.register_key('Metrics/angle', min_and_max=True) self._logger.register_key('Value/Loss_compensator') def _init_env(self) -> None: @@ -110,7 +130,7 @@ def _update(self) -> None: ) self._update_actor(obs, act, logp, adv_r, adv_c) - compensator_loss = self._env.compensator.train( + compensator_loss = self._env.compensator.update( observation=obs, approx_compensating_act=approx_compensating_act, compensating_act=compensating_act, @@ -138,3 +158,15 @@ def _update(self) -> None: 'Value/Loss_compensator': compensator_loss.item(), }, ) + + def _specific_save(self) -> None: + """Save some algorithms specific models per epoch.""" + super()._specific_save() + if get_rank() == 0: + path = os.path.join( + self._logger.log_dir, + 'gp_model_save', + f'gaussian_process_regressor_{self._logger.current_epoch}.pkl', + ) + os.makedirs(os.path.dirname(path), exist_ok=True) + joblib.dump(self._env.gp_models, path) diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index e0792d6ab..831076de6 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -180,12 +180,7 @@ def _init_log(self) -> None: config=self._cfgs, ) - what_to_save: dict[str, Any] = {} - what_to_save['pi'] = self._actor_critic.actor - if self._cfgs.algo_cfgs.obs_normalize: - obs_normalizer = self._env.save()['obs_normalizer'] - what_to_save['obs_normalizer'] = obs_normalizer - self._logger.setup_torch_saver(what_to_save) + self._log_what_to_save() self._logger.torch_save() self._logger.register_key( @@ -296,6 +291,7 @@ def learn(self) -> tuple[float, float, float]: epoch + 1 ) == self._cfgs.train_cfgs.epochs: self._logger.torch_save() + self._specific_save() ep_ret = self._logger.get_stats('Metrics/EpRet')[0] ep_cost = self._logger.get_stats('Metrics/EpCost')[0] @@ -586,3 
+582,17 @@ def _loss_pi( }, ) return loss + + def _log_what_to_save(self) -> None: + """Define what need to be saved below.""" + what_to_save: dict[str, Any] = {} + + what_to_save['pi'] = self._actor_critic.actor + if self._cfgs.algo_cfgs.obs_normalize: + obs_normalizer = self._env.save()['obs_normalizer'] + what_to_save['obs_normalizer'] = obs_normalizer + + self._logger.setup_torch_saver(what_to_save) + + def _specific_save(self) -> None: + """Save some algorithms specific models per epoch.""" diff --git a/omnisafe/common/barrier_comp.py b/omnisafe/common/barrier_comp.py index 1a27d5863..40381ccd3 100644 --- a/omnisafe/common/barrier_comp.py +++ b/omnisafe/common/barrier_comp.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""Implementation of Compensator Used in Control Barrier Function.""" + from __future__ import annotations @@ -43,6 +45,7 @@ class BarrierCompensator(torch.nn.Module): """ def __init__(self, obs_dim: int, act_dim: int, cfgs: Config) -> None: + """Initialize the action compensator.""" super().__init__() self._cfgs: Config = cfgs self.model: torch.nn.Module = build_mlp_network( @@ -63,7 +66,7 @@ def forward(self, obs: torch.Tensor) -> torch.Tensor: """ return self.model(obs) - def train( + def update( self, observation: torch.Tensor, approx_compensating_act: torch.Tensor, diff --git a/omnisafe/common/barrier_solver.py b/omnisafe/common/barrier_solver.py index b00af906e..ea287b4ad 100644 --- a/omnisafe/common/barrier_solver.py +++ b/omnisafe/common/barrier_solver.py @@ -14,6 +14,10 @@ # ============================================================================== """Implementation of the Control Barrier Function Solver.""" +# pylint: disable=invalid-name,wrong-spelling-in-docstring +# mypy: ignore-errors + + from __future__ import annotations import warnings @@ -27,6 +31,7 @@ from 
sklearn.gaussian_process.kernels import ConstantKernel as C +# pylint: disable-next=too-many-instance-attributes class PendulumSolver: """Solver for the pendulum problem using Gaussian Process models. @@ -38,6 +43,7 @@ class PendulumSolver: device (str): Device to run the computations on. """ + # pylint: disable-next=invalid-name def __init__( self, action_size: int = 1, @@ -63,9 +69,11 @@ def __init__( self._device = device self._gamma_b = 0.5 self._kd = 1.5 + self.gp_model_prev: list[GaussianProcessRegressor, GaussianProcessRegressor] + self.gp_model: list[GaussianProcessRegressor, GaussianProcessRegressor] + self._build_barrier() self.build_gp_model() - self.gp_model_prev = None warnings.filterwarnings('ignore') def build_gp_model(self, save_dir: str | None = None) -> None: @@ -80,6 +88,7 @@ def build_gp_model(self, save_dir: str | None = None) -> None: else: gp_list = joblib.load(save_dir) self.gp_model = gp_list + self.gp_model_prev = gp_list.copy() @property def gp_models(self) -> list[GaussianProcessRegressor]: @@ -95,7 +104,7 @@ def _build_barrier(self) -> None: self.h3 = np.array([-1, 0.01]) self.h4 = np.array([-1, -0.01]) - def control_barrier( + def control_barrier( # pylint: disable=invalid-name self, original_action: torch.Tensor, f: np.ndarray, @@ -103,9 +112,7 @@ def control_barrier( x: np.ndarray, std: np.ndarray, ) -> torch.Tensor: - """ - Adjusts the original action using a control barrier function to ensure - that the action complies with the system's physical constraints. + """Adjusts the original action using a control barrier function. Args: original_action (torch.Tensor): The original action proposed by the RL algorithm. @@ -117,7 +124,6 @@ def control_barrier( Returns: torch.Tensor: The adjusted action that respects the system's constraints. 
""" - # Define gamma for the barrier function gamma_b = 0.5 kd = 1.5 @@ -196,10 +202,9 @@ def control_barrier( return torch.as_tensor(u_bar[0], dtype=torch.float32, device=self._device).unsqueeze(dim=0) + # pylint: disable-next=attribute-defined-outside-init,import-outside-toplevel,invalid-name def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: - """ - Calculates the dynamics of the system based on the current observation - and the original action. + """Calculates the dynamics of the system. Args: obs (list[float]): The current observation of the system state. @@ -208,7 +213,6 @@ def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: Returns: np.ndarray: The calculated dynamics of the system. """ - dt = 0.05 # Time step G = 10 # Gravitational constant m = 2 # Mass @@ -233,8 +237,7 @@ def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: return np.squeeze(f) def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: - """ - Updates the Gaussian Process (GP) dynamics model based on observed states and actions. + """Updates the Gaussian Process (GP) dynamics model based on observed states and actions. Args: obs (np.ndarray): Observed states. @@ -260,8 +263,7 @@ def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: self.gp_model[1].fit(S, err[:, 1]) def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: - """ - Retrieves the gp dynamics based on the current observation. + """Retrieves the gp dynamics based on the current observation. Args: obs (torch.Tensor): Current state observation. 
@@ -278,7 +280,7 @@ def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.nd obs = np.squeeze(obs) theta = np.arctan2(obs[1], obs[0]) theta_dot = obs[2] - x = np.array([theta, theta_dot]) # 这个x估计就对应state + x = np.array([theta, theta_dot]) f_nom = np.array( [ -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 @@ -307,3 +309,8 @@ def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.nd np.squeeze(x), np.array([np.squeeze(std1), np.squeeze(std2)]), ] + + def reset_gp_model(self) -> None: + """Reset the gaussian processing model of barrier function solver.""" + self.gp_model_prev = self.gp_model.copy() + self.build_gp_model() diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py index 639ae8d3a..3e14d002c 100644 --- a/omnisafe/common/robust_barrier_solver.py +++ b/omnisafe/common/robust_barrier_solver.py @@ -1,19 +1,46 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Robust Control Barrier Function Solver for OmniSafe."""
+
+
+# mypy: ignore-errors
+# pylint: disable=invalid-name,wrong-spelling-in-docstring
 from __future__ import annotations
 
 from typing import Any
 
 import gymnasium as gym
-import numpy as np
 import torch
 from qpth.qp import QPFunction
 
-from omnisafe.common.utils import sort_vertices_cclockwise, to_tensor
+from omnisafe.utils.tools import to_tensor
 
 
 DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}}
 
 
 class CBFQPLayer:
+    """CBFQPLayer for robust control barrier function solver.
+
+    Args:
+        env (gym.Env): The Gym environment to interact with.
+        device (str, optional): The device type, such as 'cpu' or 'gpu'. Defaults to 'cpu'.
+        gamma_b (float, optional): The gamma parameter. Defaults to 20.
+        k_d (float, optional): The confidence parameter desired. Defaults to 3.0.
+        l_p (float, optional): Some additional layer parameter, purpose unspecified. Defaults to 0.03.
+    """
 
     def __init__(
         self,
@@ -23,15 +50,7 @@ def __init__(
         k_d: float = 3.0,
         l_p: float = 0.03,
     ) -> None:
-        """Initializes a CBFLayer instance with specified parameters and environment.
-
-        Args:
-            env (gym.Env): The Gym environment to interact with.
-            device (str, optional): The device type, such as 'cpu' or 'gpu'. Defaults to 'cpu'.
-            gamma_b (float, optional): The gamma parameter of the control barrier certificate. Defaults to 20.
-            k_d (float, optional): The confidence parameter desired (e.g., 2.0 corresponds to ~95% confidence). Defaults to 3.0.
-            l_p (float, optional): Some additional layer parameter, purpose unspecified. Defaults to 0.03.
-        """
+        """Initializes a CBFLayer instance with specified parameters and environment."""
         self.device = torch.device(device)
         self.env = env
         self.u_min, self.u_max = self.get_control_bounds()
@@ -54,7 +73,6 @@ def get_safe_action(
         action_batch (torch.Tensor): Nominal action batch, tensor or ndarray.
mean_pred_batch (torch.Tensor): Mean disturbance predictions, tensor or ndarray. sigma_batch (torch.Tensor): Standard deviations of disturbances, tensor or ndarray. - cbf_info_batch (torch.Tensor, optional): Additional control barrier function information batch, tensor or ndarray. Returns: torch.Tensor: Safe actions adjusted for given constraints and uncertainties. @@ -96,15 +114,14 @@ def solve_qp( subject to G[u,eps]^T <= h Args: - Ps (torch.Tensor): Quadratic cost matrix for each problem, with shape (batch_size, n_u+1, n_u+1). - qs (torch.Tensor): Linear cost vector for each problem, with shape (batch_size, n_u+1). - Gs (torch.Tensor): Inequality constraint matrix for each problem, with shape (batch_size, num_ineq_constraints, n_u+1). - hs (torch.Tensor): Inequality constraint vector for each problem, with shape (batch_size, num_ineq_constraints). + Ps (torch.Tensor): Quadratic cost matrix for each problem. + qs (torch.Tensor): Linear cost vector for each problem. + Gs (torch.Tensor): Inequality constraint matrix for each problem. + hs (torch.Tensor): Inequality constraint vector for each problem. Returns: The safe action for each problem, omitting the slack variable, with dimension (batch_size, n_u). """ - Ghs = torch.cat((Gs, hs.unsqueeze(2)), -1) Ghs_norm = torch.max(torch.abs(Ghs), dim=2, keepdim=True)[0] Gs /= Ghs_norm @@ -139,8 +156,8 @@ def cbf_layer( Args: Qs (torch.Tensor): Quadratic cost matrix for each problem. ps (torch.Tensor): Linear cost vector for each problem. - Gs (torch.Tensor): Inequality constraint matrix for each problem, shape (batch_size, num_ineq_constraints, num_vars). - hs (torch.Tensor): Inequality constraint vector for each problem, shape (batch_size, num_ineq_constraints). + Gs (torch.Tensor): Inequality constraint matrix for each problem. + hs (torch.Tensor): Inequality constraint vector for each problem. As (torch.Tensor, optional): Equality constraint matrix. Defaults to None. 
bs (torch.Tensor, optional): Equality constraint vector. Defaults to None. solver_args (dict, optional): Dictionary of solver arguments. Defaults to None. @@ -148,7 +165,6 @@ def cbf_layer( Returns: Result of the QP solver for each problem. """ - if solver_args is None: solver_args = {} @@ -165,6 +181,7 @@ def cbf_layer( bs, ).float() + # pylint: disable-next=too-many-locals def get_cbf_qp_constraints( self, state_batch: torch.Tensor, @@ -180,10 +197,10 @@ def get_cbf_qp_constraints( subject to G[u,eps]^T <= h Args: - state_batch (torch.Tensor): Current state batch. Refer to `dynamics.py` for specifics on each dynamic. + state_batch (torch.Tensor): Current state batch. action_batch (torch.Tensor): Nominal control input batch. - mean_pred_batch (torch.Tensor): Mean disturbance prediction state batch, dimensions (n_s, n_u). - sigma_pred_batch (torch.Tensor): Standard deviation of the additive disturbance after undergoing the output dynamics. + mean_pred_batch (torch.Tensor): Mean disturbance prediction state batch. + sigma_pred_batch (torch.Tensor): Standard deviation of the additive disturbance. gamma_b (float, optional): CBF parameter for the class-Kappa function. Defaults to 1.0. 
Returns: @@ -246,65 +263,15 @@ def get_cbf_qp_constraints( hs = 1e3 * torch.ones((batch_size, num_cbfs), device=self.device) dhdps = torch.zeros((batch_size, num_cbfs, 2), device=self.device) hazards = self.env.hazards - for i in range(len(hazards)): - if hazards[i]['type'] == 'circle': - obs_loc = to_tensor(hazards[i]['location'], torch.FloatTensor, self.device) + for i, hazard in enumerate(hazards): + if hazard['type'] == 'circle': + obs_loc = to_tensor(hazard['location'], torch.FloatTensor, self.device) hs[:, i] = 0.5 * ( - torch.sum((ps - obs_loc) ** 2, dim=1) - (hazards[i]['radius'] + buffer) ** 2 + torch.sum((ps - obs_loc) ** 2, dim=1) - (hazard['radius'] + buffer) ** 2 ) dhdps[:, i, :] = ps - obs_loc - elif hazards[i]['type'] == 'polygon': - vertices = sort_vertices_cclockwise(hazards[i]['vertices']) - segments = np.diff(vertices, axis=0, append=vertices[[0]]) - segments = to_tensor(segments, torch.FloatTensor, self.device) - vertices = to_tensor(vertices, torch.FloatTensor, self.device) - for j in range(segments.shape[0]): - dot_products = torch.matmul( - ps - vertices[j : j + 1], - segments[j], - ) / torch.sum(segments[j] ** 2) - mask0_ = dot_products < 0 - mask1_ = dot_products > 1 - mask_ = torch.logical_and(dot_products >= 0, dot_products <= 1) - dists2seg = torch.zeros(batch_size) - if mask0_.sum() > 0: - dists2seg[mask0_] = torch.linalg.norm(ps[mask0_] - vertices[[j]], dim=1) - if mask1_.sum() > 0: - dists2seg[mask1_] = torch.linalg.norm( - ps[mask1_] - vertices[[(j + 1) % segments.shape[0]]], - dim=1, - ) - if mask_.sum() > 0: - dists2seg[mask_] = torch.linalg.norm( - dot_products[mask_, None] * segments[j].tile((torch.sum(mask_), 1)) - + vertices[[j]] - - ps[mask_], - dim=1, - ) - hs_ = 0.5 * ((dists2seg**2) + 0.5 * buffer) - dhdps_ = torch.zeros((batch_size, 2)) - if mask0_.sum() > 0: - dhdps_[mask0_] = ps[mask0_] - vertices[[j]] - if mask1_.sum() > 0: - dhdps_[mask1_] = ps[mask1_] - vertices[[(j + 1) % segments.shape[0]]] - if mask_.sum() > 0: - 
normal_vec = torch.tensor([segments[j][1], -segments[j][0]]) - normal_vec /= torch.linalg.norm(normal_vec) - dhdps_[mask_] = (ps[mask_] - vertices[j]).matmul( - normal_vec, - ) * normal_vec.view((1, 2)).repeat(torch.sum(mask_), 1) - idxs_to_update = torch.nonzero(hs[:, i] - hs_ > 0) - # Update the actual hs to be used in the constraints - if idxs_to_update.shape[0] > 0: - hs[idxs_to_update, i] = hs_[idxs_to_update] - # Compute dhdhps for those indices - dhdps[idxs_to_update, i, :] = dhdps_[idxs_to_update, :] else: - raise Exception( - 'Only obstacles of type `circle` or `polygon` are supported, got: {}'.format( - hazards[i]['type'], - ), - ) + raise NotImplementedError n_u = action_batch.shape[1] num_constraints = num_cbfs + 2 * n_u @@ -345,12 +312,11 @@ def get_cbf_qp_constraints( return P, q, G, h def get_control_bounds(self) -> tuple[torch.Tensor, torch.Tensor]: - """ + """Obtain the action bounds. Returns: Action bounds, i.e., min control input and max control input. """ - u_min = torch.tensor(self.env.safe_action_space.low).to(self.device) u_max = torch.tensor(self.env.safe_action_space.high).to(self.device) diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py index 3380d1f2d..885a50389 100644 --- a/omnisafe/common/robust_gp_model.py +++ b/omnisafe/common/robust_gp_model.py @@ -1,3 +1,21 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implementation of Dynamics Model Based on GPyTorch.""" +# mypy: ignore-errors + + from __future__ import annotations import os @@ -14,8 +32,8 @@ from gpytorch.means import ZeroMean from gpytorch.priors import NormalPrior -from omnisafe.common.utils import to_numpy, to_tensor from omnisafe.typing import DEVICE_CPU +from omnisafe.utils.tools import to_tensor DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}} @@ -23,8 +41,7 @@ class BaseGPy(gpytorch.models.ExactGP): - """ - A Gaussian Process (GP) model using a zero mean function and a scaled RBF kernel with priors. + """A Gaussian Process (GP) model using a zero mean function and a scaled RBF kernel with priors. This class extends gpytorch.models.ExactGP, specifically designed for use in disturbance estimation tasks. @@ -57,6 +74,7 @@ def __init__( self.covar_module.base_kernel.lengthscale = 1e5 self.covar_module.outputscale = prior_std + 1e-6 + # pylint: disable=arguments-differ def forward(self, x: torch.Tensor) -> MultivariateNormal: """Forward pass through the GP model to produce a multivariate normal distribution. @@ -85,7 +103,7 @@ class GPyDisturbanceEstimator: train_x (torch.Tensor): Training data features. If not a tensor, it will be converted. train_y (torch.Tensor): Training data targets. If not a tensor, it will be converted. prior_std (float): Standard deviation of the prior distribution. - likelihood (Optional[gpytorch.likelihoods.Likelihood]): A GPyTorch likelihood. If None, a default GaussianLikelihood is used. + likelihood (Optional[gpytorch.likelihoods.Likelihood]): A GPyTorch likelihood. device (Optional[torch.device]): The torch device. Defaults to CPU if None. 
""" @@ -97,6 +115,7 @@ def __init__( likelihood: gpytorch.likelihoods.Likelihood | None = None, device: torch.device = DEVICE_CPU, ) -> None: + """Initialize the GPyDisturbanceEstimator.""" self.device = device if device else torch.device('cpu') if not torch.is_tensor(train_x): @@ -140,8 +159,7 @@ def train(self, training_iter: int, verbose: bool = False) -> None: optimizer.step() def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: - """ - Makes predictions on new data. + """Makes predictions on new data. Args: test_x (torch.Tensor): Test data features. If not a tensor, it will be converted. @@ -174,6 +192,7 @@ def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: return pred_dict +# pylint: disable-next=too-many-instance-attributes class DynamicsModel: """Initializes the DynamicsModel with a gym environment. @@ -191,6 +210,7 @@ def __init__( l_p: float = 0.03, device: str = 'cpu', ) -> None: + """Initializes the DynamicsModel with a gym environment.""" self.env = env self.get_f, self.get_g = self.get_dynamics() self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] @@ -209,8 +229,7 @@ def __init__( self.device = torch.device(device) def predict_next_state(self, state_batch: np.ndarray, u_batch: np.ndarray) -> np.ndarray: - """ - Predicts the next state given the current state and action batch. + """Predicts the next state given the current state and action batch. Args: state_batch (np.ndarray): The batch of current states. @@ -241,7 +260,7 @@ def get_dynamics(self) -> tuple[Callable, Callable]: """Retrieves the dynamics functions for drift and control based on the environment's dynamics mode. Returns: - tuple: A tuple containing two callables, `get_f` and `get_g`, which compute the drift and control dynamics respectively. + tuple: A tuple containing two callable methods, `get_f` and `get_g`. 
""" if self.env.dynamics_mode == 'Unicycle': @@ -257,27 +276,23 @@ def get_g(state_batch: np.ndarray) -> np.ndarray: return g_x else: - raise Exception('Unknown Dynamics mode.') + raise NotImplementedError('Unknown Dynamics mode.') return get_f, get_g - def get_state(self, obs: np.ndarray) -> np.ndarray: - """ - Processes the raw observations from the environment and returns the corresponding state representation. + def get_state(self, obs: torch.Tensor) -> torch.Tensor: + """Processes the raw observations from the environment. Args: - obs (np.ndarray): The environment observations. + obs (torch.Tensor): The environment observations. Returns: - np.ndarray: The processed state of the system. + torch.Tensor: The processed state of the system. """ expand_dims = len(obs.shape) == 1 - is_tensor = torch.is_tensor(obs) - - if is_tensor: - dtype = obs.dtype - device = obs.device - obs = obs.cpu().numpy() if obs.is_cuda else obs.numpy() + dtype = obs.dtype + device = obs.device + obs = obs.cpu().numpy() if obs.is_cuda else obs.numpy() if expand_dims: obs = np.expand_dims(obs, 0) @@ -289,14 +304,12 @@ def get_state(self, obs: np.ndarray) -> np.ndarray: state_batch[:, 1] = obs[:, 1] state_batch[:, 2] = theta else: - raise Exception('Unknown dynamics') + raise NotImplementedError('Unknown dynamics') if expand_dims: state_batch = state_batch.squeeze(0) - if is_tensor: - return torch.tensor(state_batch, dtype=dtype, device=device) - return state_batch + return torch.tensor(state_batch, dtype=dtype, device=device) def append_transition( self, @@ -371,23 +384,18 @@ def fit_gp_model(self, training_iter: int = 70) -> None: self._train_x = train_x self._train_y = train_y - def predict_disturbance(self, test_x: np.ndarray) -> tuple: + def predict_disturbance(self, test_x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """Predicts the disturbance at the queried states using the trained Gaussian Process models. 
Args: - test_x (np.ndarray): The state for which to predict disturbances, shape (n_test, n_s). + test_x (torch.Tensor): The state for which to predict disturbances, shape (n_test, n_s). Returns: - tuple: A tuple of arrays (means, variances) where means is the predicted mean disturbance - and variances is the corresponding variance, shape (n_test, n_s). + tuple: A tuple of arrays (means, variances). """ - - is_tensor = torch.is_tensor(test_x) - - if is_tensor: - dtype = test_x.dtype - device = test_x.device - test_x = to_numpy(test_x) + dtype = test_x.dtype + device = test_x.device + test_x = test_x.cpu().detach().double().numpy() expand_dims = len(test_x.shape) == 1 if expand_dims: @@ -414,11 +422,7 @@ def predict_disturbance(self, test_x: np.ndarray) -> tuple: means = means.squeeze(0) f_std = f_std.squeeze(0) - return ( - (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) - if is_tensor - else (means, f_std) - ) + return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) def load_disturbance_models(self, save_dir: str, epoch: str) -> None: """Loads the disturbance models and their training data. diff --git a/omnisafe/common/utils.py b/omnisafe/common/utils.py deleted file mode 100644 index ec36fe157..000000000 --- a/omnisafe/common/utils.py +++ /dev/null @@ -1,55 +0,0 @@ -import numpy as np -import torch - - -def to_numpy(x: torch.Tensor) -> np.ndarray: - """Convert a torch tensor to a numpy array. - - Args: - x (torch.Tensor): A torch tensor to be converted. - - Returns: - np.ndarray: A numpy array representation of the input tensor. - """ - return x.cpu().detach().double().numpy() - - -def to_tensor( - x: np.ndarray, - dtype: torch.dtype, - device: torch.device, - requires_grad: bool = False, -) -> torch.Tensor: - """Convert a numpy array to a torch tensor of specified type and device. - - Args: - x (np.ndarray): A numpy array to be converted. - dtype (torch.dtype): The desired data type for the tensor. 
- device (torch.device): The device to store the tensor on. - requires_grad (bool): If True, gradients will be computed for operations involving this tensor. - - Returns: - torch.Tensor: A torch tensor representation of the input array. - """ - if type(x).__module__ != 'numpy': - return x - return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) - - -def sort_vertices_cclockwise(vertices: np.ndarray) -> np.ndarray: - """Sort vertices of a 2D convex polygon in counter-clockwise direction. - - Args: - vertices (np.ndarray): An array of shape (n_v, 2) where n_v is the number of vertices. - - Returns: - np.ndarray: An array of vertices sorted in counter-clockwise direction. - """ - assert vertices.shape[1] == 2, f'Vertices must each have dimension 2, got {vertices.shape[1]}' - - # Sort vertices - polygon_center = vertices.sum(axis=0, keepdims=True) / vertices.shape[0] # (1, d) - rel_vecs = vertices - polygon_center - thetas = np.arctan2(rel_vecs[:, 1], rel_vecs[:, 0]) - idxs = np.argsort(thetas) - return vertices[idxs, :] diff --git a/omnisafe/configs/on-policy/TRPOCBF.yaml b/omnisafe/configs/on-policy/TRPOCBF.yaml index 8fecee0d4..9d1b67ec0 100644 --- a/omnisafe/configs/on-policy/TRPOCBF.yaml +++ b/omnisafe/configs/on-policy/TRPOCBF.yaml @@ -82,6 +82,8 @@ defaults: fvp_obs: None # The sub-sampling rate of the observation fvp_sample_freq: 1 + # The max steps to update dynamics model + update_dynamics_steps: 650 # logger configurations logger_cfgs: # use wandb for logging @@ -91,7 +93,7 @@ defaults: # use tensorboard for logging use_tensorboard: True # save model frequency - save_model_freq: 100 + save_model_freq: 10 # save logger path log_dir: "./runs" # save model path diff --git a/omnisafe/envs/barrier_function_env.py b/omnisafe/envs/barrier_function_env.py index d664e749b..01477c1fe 100644 --- a/omnisafe/envs/barrier_function_env.py +++ b/omnisafe/envs/barrier_function_env.py @@ -33,8 +33,8 @@ class BarrierFunctionEnv(CMDP): """Interface of 
control barrier function-based environments. .. warning:: - Since environments based on control barrier functions require special judgment and control of environmental dynamics, - they do not support the use of vectorized environments for parallelization. + Since environments based on control barrier functions require special judgment and control + of environmental dynamics, they do not support the use of vectorized environments. Attributes: need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. @@ -84,7 +84,7 @@ def __init__( else: raise NotImplementedError('Only support num_envs=1 now.') self._device = torch.device(device) - self._episodic_violation = [] + self._episodic_violation: list[float] = [] self._num_envs = num_envs self._metadata = self._env.metadata self.env_spec_log = {'Metrics/Max_angle_violation': 0.0} @@ -96,17 +96,16 @@ def _env_specific_setting(self) -> None: We have organized these adjustments and encapsulated them in this function. """ if self._env_id == 'Pendulum-v1': - self._env.unwrapped.max_torque = 15.0 - self._env.unwrapped.max_speed = 60.0 + self._env.unwrapped.max_torque = 15.0 # type: ignore + self._env.unwrapped.max_speed = 60.0 # type: ignore self._env.unwrapped.action_space = spaces.Box( - low=-self._env.unwrapped.max_torque, - high=self._env.unwrapped.max_torque, + low=-self._env.unwrapped.max_torque, # type: ignore + high=self._env.unwrapped.max_torque, # type: ignore shape=(1,), ) - high = np.array([1.0, 1.0, self._env.unwrapped.max_speed]) + high = np.array([1.0, 1.0, self._env.unwrapped.max_speed]) # type: ignore self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) - self._env.dt = 0.05 - self._env.dynamics_mode = 'Pendulum' + self._env.dt = 0.05 # type: ignore def step( self, @@ -146,7 +145,7 @@ def step( for x in (obs, reward, terminated, truncated) ) cost = torch.abs(torch.atan2(obs[1], obs[0])).to(self._device) - self._episodic_violation.append(cost) + 
self._episodic_violation.append(cost.item()) if 'final_observation' in info: info['final_observation'] = np.array( @@ -194,7 +193,7 @@ def reset( """ obs, info = self._env.reset(seed=seed, options=options) if self._env_id == 'Pendulum-v1': - while self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0: + while self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0: # type: ignore obs, info = self._env.reset(options=options) return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info @@ -220,4 +219,5 @@ def close(self) -> None: @property def unwrapped(self) -> gymnasium.Env: + """Return the original interface of environment.""" return self._env.unwrapped diff --git a/omnisafe/envs/robust_barrier_function_env.py b/omnisafe/envs/robust_barrier_function_env.py index 1f1c10418..9bce446ce 100644 --- a/omnisafe/envs/robust_barrier_function_env.py +++ b/omnisafe/envs/robust_barrier_function_env.py @@ -18,10 +18,8 @@ from typing import Any, ClassVar -import gymnasium import numpy as np import torch -from gymnasium import spaces from omnisafe.envs.core import CMDP, env_register from omnisafe.envs.unicycle_env import UnicycleEnv @@ -33,8 +31,9 @@ class RobustBarrierFunctionEnv(CMDP): """Interface of control barrier function-based environments. .. warning:: - Since environments based on control barrier functions require special judgment and control of environmental dynamics, - they do not support the use of vectorized environments for parallelization. + Since environments based on control barrier functions require special judgment and control + of environmental dynamics, they do not support the use of vectorized environments for + parallelization. Attributes: need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. @@ -168,17 +167,6 @@ def set_seed(self, seed: int) -> None: """ self.reset(seed=seed) - def sample_action(self) -> torch.Tensor: - """Sample a random action. - - Returns: - A random action. 
- """ - return torch.normal( - torch.zeros(self.action_space.shape), - torch.ones(self.action_space.shape), - ) - def render(self) -> Any: """Render the environment. @@ -192,4 +180,5 @@ def close(self) -> None: self._env.close() def __getattr__(self, name: str) -> Any: + """Return the unwrapped environment attributes.""" return getattr(self._env, name) diff --git a/omnisafe/envs/unicycle_env.py b/omnisafe/envs/unicycle_env.py index 4fca58eed..dd0515fba 100644 --- a/omnisafe/envs/unicycle_env.py +++ b/omnisafe/envs/unicycle_env.py @@ -1,3 +1,5 @@ +# pylint: disable=all +# mypy: ignore-errors from __future__ import annotations from collections.abc import Iterable @@ -9,15 +11,6 @@ def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: - """Convert measurements from centimeters to pixels. - - Args: - meas_cm (list[float] | float): A single measurement or a list of measurements in centimeters. - shift (int, optional): An integer value that is added to the converted measurement(s). Default is 0. - - Returns: - float | np.ndarray: The measurement converted to pixels. - """ if isinstance(meas_cm, Iterable): return 1.5 * 37.795 * meas_cm + np.array(shift) @@ -25,7 +18,6 @@ def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: class UnicycleEnv(gym.Env): - """Custom Environment that follows SafetyGym interface""" def __init__(self) -> None: @@ -84,41 +76,11 @@ def step( self, action: np.ndarray, ) -> tuple[np.ndarray, float, float, bool, bool, dict[str, Any]]: - """ - Advance the environment state based on the action taken by the agent. - - Parameters: - action(np.ndarray): Control action taken by the agent. - - Returns: - A tuple containing: - - new_obs : np.ndarray, the new observation structured as [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, dist2goal]. - - reward : float, reward received after taking the action. - - cost : float, cost incurred after taking the action. 
- - terminated : bool, whether the episode has terminated. - - truncated : bool, whether the episode was truncated. - - info : dict, additional information about the environment's state. - """ action = np.clip(action, -1.0, 1.0) state, reward, cost, terminated, truncated, info = self._step(action) return self.get_obs(), reward, cost, terminated, truncated, info def _step(self, action: np.ndarray) -> tuple: - """ - Update the internal state based on the action, considering dynamics and disturbances. - - Parameters: - action(np.ndarray): Control action taken by the agent. - - Returns: - A tuple containing: - - state : np.ndarray, new internal state of the agent. - - reward : float, reward collected during this transition. - - cost : float, cost incurred during this transition. - - terminated : bool, whether the episode has terminated. - - truncated : bool, whether the episode was truncated due to reaching a step limit. - - info : dict, additional information relevant to the environment. - """ self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) @@ -143,29 +105,9 @@ def _step(self, action: np.ndarray) -> tuple: return self.state, reward, cost, terminated, truncated, {} def goal_met(self) -> bool: - """ - Check if the current goal has been met in this step. - - Returns: - True if the agent has reached the goal, False otherwise. - """ return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: - """ - Reset the environment to an initial state. - - Parameters: - seed : int, optional - Seed for random number generator. - options : dict, optional - Additional options to customize the environment reset. - - Returns: - A tuple containing: - - observation : np.ndarray, the first observation after reset. - - info : dict, additional information about the reset state. 
- """ self.episode_step = 0 if self.rand_init: @@ -178,16 +120,6 @@ def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: return self.get_obs(), {} def render(self, mode: str = 'human') -> np.ndarray: - """Render the environment to the screen - - Parameters:--- - mode : str - close : bool - - Returns: - - """ - if mode != 'human' and mode != 'rgb_array': rel_loc = self.goal_pos - self.state[:2] theta_error = np.arctan2(rel_loc[1], rel_loc[0]) - self.state[2] @@ -297,17 +229,6 @@ def get_obs(self) -> np.ndarray: ) def _get_dynamics(self) -> tuple[Callable, Callable]: - """Get affine Control Barrier Function (CBF) dynamics for a given environment. - - This method provides access to the system's drift and control dynamics, formulated for continuous systems of the form x' = f(x) + g(x)u, where 'x' is the state vector and 'u' is the control vector. - - Returns: - get_f : Callable[[np.ndarray], np.ndarray] - Function to compute the drift dynamics 'f(x)' of the system. - - get_g : Callable[[np.ndarray], np.ndarray] - Function to compute the control dynamics 'g(x)' of the system. - """ def get_f(state: np.ndarray) -> np.ndarray: """Function to compute the drift dynamics 'f(x)' of the system.""" @@ -321,15 +242,6 @@ def get_g(state: np.ndarray) -> np.ndarray: return get_f, get_g def obs_compass(self) -> np.ndarray: - """ - Return a robot-centric compass observation of a list of positions. - Compass is a normalized (unit-lenght) egocentric XY vector, - from the agent to the object. - This is equivalent to observing the egocentric XY angle to the target, - projected into the sin/cos space we use for joints. - (See comment on joint observation for why we do this.) 
- """ - # Get ego vector in world frame vec = self.goal_pos - self.state[:2] # Rotate into frame @@ -351,91 +263,3 @@ def close(self) -> None: if self.viewer: self.viewer.close() self.viewer = None - - def get_random_hazard_locations(self, n_hazards: int, hazard_radius: float) -> None: - """ - - Parameters:--- - n_hazards : int - Number of hazards to create - hazard_radius : float - Radius of hazards - - Returns: - hazards_locs : np.ndarray - Numpy array of shape (n_hazards, 2) containing xy locations of hazards. - """ - - # Create buffer with boundaries - buffered_bds = np.copy(self.bds) - buffered_bds[0] = buffered_bds[0] + hazard_radius - buffered_bds[1] -= hazard_radius - - hazards = [] - hazards_centers = np.zeros((n_hazards, 2)) - n = 0 # Number of hazards actually placed - for _ in range(n_hazards): - successfully_placed = False - iteration = 0 - hazard_type = np.random.randint(3) # 0-> Circle 1->Square 2->Triangle - radius = hazard_radius * (1 - 0.2 * 2.0 * (np.random.random() - 0.5)) - while not successfully_placed and iteration < 100: - hazards_centers[n] = (buffered_bds[1] - buffered_bds[0]) * np.random.random( - 2, - ) + buffered_bds[0] - successfully_placed = np.all( - np.linalg.norm(hazards_centers[:n] - hazards_centers[[n]], axis=1) - > 3.5 * hazard_radius, - ) - successfully_placed = np.logical_and( - successfully_placed, - np.linalg.norm(self.goal_pos - hazards_centers[n]) > 2.0 * hazard_radius, - ) - successfully_placed = np.logical_and( - successfully_placed, - np.all( - np.linalg.norm(self.initial_state[:, :2] - hazards_centers[[n]], axis=1) - > 2.0 * hazard_radius, - ), - ) - iteration += 1 - if not successfully_placed: - continue - if hazard_type == 0: # Circle - hazards.append({'type': 'circle', 'location': hazards_centers[n], 'radius': radius}) - elif hazard_type == 1: # Square - hazards.append( - { - 'type': 'polygon', - 'vertices': np.array( - [ - [-radius, -radius], - [-radius, radius], - [radius, radius], - [radius, -radius], - ], - ), - 
},
-                )
-                hazards[-1]['vertices'] += hazards_centers[n]
-            else:  # Triangle
-                hazards.append(
-                    {
-                        'type': 'polygon',
-                        'vertices': np.array(
-                            [
-                                [-radius, -radius],
-                                [-radius, radius],
-                                [radius, radius],
-                                [radius, -radius],
-                            ],
-                        ),
-                    },
-                )
-                # Pick a vertex and delete it
-                idx = np.random.randint(4)
-                hazards[-1]['vertices'] = np.delete(hazards[-1]['vertices'], idx, axis=0)
-                hazards[-1]['vertices'] += hazards_centers[n]
-            n += 1
-
-        self.hazards = hazards
diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py
index 2f17f852b..c94c38389 100644
--- a/omnisafe/evaluator.py
+++ b/omnisafe/evaluator.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 # ==============================================================================
 """Implementation of Evaluator."""
+# mypy: ignore-errors
+
 
 from __future__ import annotations
 
@@ -47,6 +49,10 @@
 from omnisafe.common.control_barrier_function.crabs.optimizers import Barrier
 from omnisafe.common.control_barrier_function.crabs.utils import Normalizer as CRABSNormalizer
 from omnisafe.common.control_barrier_function.crabs.utils import create_model_and_trainer
+from omnisafe.common.barrier_comp import BarrierCompensator
+from omnisafe.common.barrier_solver import PendulumSolver
+from omnisafe.common.robust_barrier_solver import CBFQPLayer
+from omnisafe.common.robust_gp_model import DynamicsModel
 from omnisafe.envs.core import CMDP, make
 from omnisafe.envs.wrapper import ActionRepeat, ActionScale, ObsNormalize, TimeLimit
 from omnisafe.models.actor import ActorBuilder
@@ -94,6 +100,9 @@
         self._safety_obs = torch.ones(1)
         self._cost_count = torch.zeros(1)
         self.__set_render_mode(render_mode)
+        self._dynamics_model: DynamicsModel | None = None
+        self._solver: PendulumSolver | CBFQPLayer | None = None
+        
self._compensator = None def __set_render_mode(self, render_mode: str) -> None: """Set the render mode. @@ -130,7 +142,7 @@ def __load_cfgs(self, save_dir: str) -> None: self._dict_cfgs = kwargs self._cfgs = Config.dict2config(kwargs) - # pylint: disable-next=too-many-branches + # pylint: disable-next=attribute-defined-outside-init,import-outside-toplevel,too-many-branches,too-many-locals def __load_model_and_env( self, save_dir: str, @@ -302,9 +314,7 @@ def __load_model_and_env( self._actor = actor_builder.build_actor(actor_type) self._actor.load_state_dict(model_params['pi']) if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': - from omnisafe.common.barrier_comp import BarrierCompensator - - self.compensator = BarrierCompensator( + self._compensator = BarrierCompensator( obs_dim=observation_space.shape[0], act_dim=action_space.shape[0], cfgs=self._cfgs['compensator_cfgs'], @@ -316,21 +326,18 @@ def __load_model_and_env( raise FileNotFoundError( 'The model is not found in the save directory.', ) from error - self.compensator.load_state_dict(model_params['compensator']) + self._compensator.load_state_dict(model_params['compensator']) if self._cfgs['algo'] == 'SACRCBF': - from omnisafe.common.robust_barrier_solver import CBFQPLayer - from omnisafe.common.robust_gp_model import DynamicsModel - epoch = model_name.split('.pt')[0].split('-')[-1] - self.solver = CBFQPLayer( + self._solver = CBFQPLayer( env=self._env, device=self._cfgs['train_cfgs']['device'], gamma_b=self._cfgs['cbf_cfgs']['gamma_b'], k_d=self._cfgs['cbf_cfgs']['k_d'], l_p=self._cfgs['cbf_cfgs']['l_p'], ) - self.dynamics_model = DynamicsModel(env=self._env) - self.dynamics_model.load_disturbance_models( + self._dynamics_model = DynamicsModel(env=self._env) + self._dynamics_model.load_disturbance_models( save_dir=os.path.join(self._save_dir, 'gp_model_save'), epoch=epoch, ) @@ -417,15 +424,14 @@ def load_saved( self.__set_render_mode(render_mode) if self._cfgs['algo'] == 'DDPGCBF' or 
self._cfgs['algo'] == 'TRPOCBF': - from omnisafe.common.barrier_solver import PendulumSolver - self.solver = PendulumSolver() + self._solver = PendulumSolver() path = os.path.join( save_dir, 'gp_model_save', f'gaussian_process_regressor_{epoch}.pkl', ) - self.solver.build_gp_model(save_dir=path) + self._solver.build_gp_model(save_dir=path) env_kwargs = { 'env_id': self._cfgs['env_id'], @@ -441,6 +447,7 @@ def load_saved( self.__load_model_and_env(save_dir, model_name, env_kwargs) + # pylint: disable-next=too-many-locals def evaluate( self, num_episodes: int = 10, @@ -498,10 +505,10 @@ def evaluate( 'The policy must be provided or created before evaluating the agent.', ) if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': - approx_compensating_act = self.compensator(obs=obs) + approx_compensating_act = self._compensator(obs=obs) compensated_act_mean_raw = act + approx_compensating_act - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) - compensating_act = self.solver.control_barrier( + [f, g, x, std] = self._solver.get_gp_dynamics(obs, use_prev_model=False) + compensating_act = self._solver.control_barrier( compensated_act_mean_raw, f, g, @@ -511,11 +518,11 @@ def evaluate( act = compensated_act_mean_raw + compensating_act if self._cfgs['algo'] == 'SACRCBF': - state_batch = self.dynamics_model.get_state(obs) - mean_pred_batch, sigma_pred_batch = self.dynamics_model.predict_disturbance( + state_batch = self._dynamics_model.get_state(obs) + mean_pred_batch, sigma_pred_batch = self._dynamics_model.predict_disturbance( state_batch, ) - safe_act = self.solver.get_safe_action( + safe_act = self._solver.get_safe_action( state_batch, act, mean_pred_batch, diff --git a/omnisafe/utils/tools.py b/omnisafe/utils/tools.py index 2c0c626eb..7c7a10ceb 100644 --- a/omnisafe/utils/tools.py +++ b/omnisafe/utils/tools.py @@ -356,3 +356,40 @@ def get_device(device: torch.device | str | int = DEVICE_CPU) -> torch.device: return 
torch.device('cpu') return device + + +def to_tensor( + x: np.ndarray, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool = False, +) -> torch.Tensor: + """Convert a numpy array to a torch tensor of specified type and device. + + Args: + x (np.ndarray): A numpy array to be converted. + dtype (torch.dtype): The desired data type for the tensor. + device (torch.device): The device to store the tensor on. + requires_grad (bool): If True, gradients will be computed for operations involving this tensor. + + Returns: + torch.Tensor: A torch tensor representation of the input array. + """ + return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) + + +def sort_vertices_cclockwise(vertices: np.ndarray) -> np.ndarray: + """Sort vertices of a 2D convex polygon in counter-clockwise direction. + + Args: + vertices (np.ndarray): An array of shape (n_v, 2) where n_v is the number of vertices. + + Returns: + np.ndarray: An array of vertices sorted in counter-clockwise direction. 
+ """ + assert vertices.shape[1] == 2, f'Vertices must each have dimension 2, got {vertices.shape[1]}' + polygon_center = vertices.sum(axis=0, keepdims=True) / vertices.shape[0] # (1, d) + rel_vecs = vertices - polygon_center + thetas = np.arctan2(rel_vecs[:, 1], rel_vecs[:, 0]) + idxs = np.argsort(thetas) + return vertices[idxs, :] From 23f66d13dd38920c1785cc66b761d528760c0e4e Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Mon, 6 May 2024 15:13:50 +0800 Subject: [PATCH 04/18] chore: update pytest --- .pre-commit-config.yaml | 2 +- omnisafe/adapter/barrier_function_adapter.py | 28 +- .../adapter/beta_barrier_function_adapter.py | 18 +- .../offpolicy_barrier_function_adapter.py | 2 +- .../robust_barrier_function_adapter.py | 2 +- omnisafe/algorithms/off_policy/__init__.py | 15 +- omnisafe/algorithms/off_policy/ddpg_cbf.py | 3 - .../on_policy/barrier_function/trpo_cbf.py | 7 +- omnisafe/common/robust_barrier_solver.py | 2 + omnisafe/envs/__init__.py | 5 +- omnisafe/envs/classic_control/__init__.py | 3 + .../envs_from_cbf.py} | 16 +- .../envs_from_rcbf.py} | 188 ++++++++++++- omnisafe/envs/unicycle_env.py | 265 ------------------ omnisafe/evaluator.py | 30 +- tests/test_policy.py | 51 +++- 16 files changed, 333 insertions(+), 304 deletions(-) rename omnisafe/envs/{barrier_function_env.py => classic_control/envs_from_cbf.py} (95%) rename omnisafe/envs/{robust_barrier_function_env.py => classic_control/envs_from_rcbf.py} (51%) delete mode 100644 omnisafe/envs/unicycle_env.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42e2956f9..63f378224 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -114,10 +114,10 @@ repos: ^tests/| ^setup.py$| ^omnisafe/envs/classic_control/envs_from_crabs.py$| + ^omnisafe/envs/classic_control/envs_from_rcbf.py| ^omnisafe/common/control_barrier_function/crabs/models.py$| ^omnisafe/common/control_barrier_function/crabs/optimizers.py$| 
^omnisafe/common/control_barrier_function/crabs/utils.py$| ^conftest.py$| - ^omnisafe/envs/unicycle_env.py| ^setup.py$ ) diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py index a91218b48..469f4e7cd 100644 --- a/omnisafe/adapter/barrier_function_adapter.py +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -71,18 +71,23 @@ def _wrapper( """ assert not obs_normalize, 'Barrier function does not support observation normalization!' if self._env.need_time_limit_wrapper: - self._env = TimeLimit(self._env, time_limit=1000, device=self._device) - self._eval_env = TimeLimit(self._eval_env, time_limit=1000, device=self._device) + assert ( + self._env.max_episode_steps + ), 'You must define max_episode_steps as an integer\ + \nor cancel the use of the time_limit wrapper.' + self._env = TimeLimit( + self._env, + time_limit=self._env.max_episode_steps, + device=self._device, + ) if self._env.need_auto_reset_wrapper: self._env = AutoReset(self._env, device=self._device) - self._eval_env = AutoReset(self._eval_env, device=self._device) if reward_normalize: self._env = RewardNormalize(self._env, device=self._device) if cost_normalize: self._env = CostNormalize(self._env, device=self._device) if self._env.num_envs == 1: self._env = Unsqueeze(self._env, device=self._device) - self._eval_env = Unsqueeze(self._eval_env, device=self._device) def set_solver(self, solver: PendulumSolver) -> None: """Set the barrier function solver for Pendulum environment.""" @@ -96,7 +101,7 @@ def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" self.solver.reset_gp_model() - def rollout( # pylint: disable=too-many-locals + def rollout( # pylint: disable=too-many-locals,too-many-branches self, steps_per_epoch: int, agent: ConstraintActorCritic, @@ -158,7 +163,6 @@ def rollout( # pylint: disable=too-many-locals self._log_value(reward=reward, cost=cost, info=info) logger.store({'Value/reward': 
value_r}) - logger.store({'Metrics/angle': cost}) buffer.store( obs=obs, @@ -174,15 +178,21 @@ def rollout( # pylint: disable=too-many-locals obs = next_obs epoch_end = step >= steps_per_epoch + + if epoch_end: + num_dones = int(terminated.contiguous().sum()) + if self._env.num_envs - num_dones: + logger.log( + f'\nWarning: trajectory cut off when rollout by epoch\ + in {self._env.num_envs - num_dones} of {self._env.num_envs} environments.', + ) + for idx, (done, time_out) in enumerate(zip(terminated, truncated)): if epoch_end or done or time_out: last_value_r = torch.zeros(1) last_value_c = torch.zeros(1) if not done: if epoch_end: - logger.log( - f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.', - ) _, last_value_r, last_value_c, _ = agent.step(obs[idx]) if time_out: _, last_value_r, last_value_c, _ = agent.step( diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index 844c0b4ce..d5738e02d 100644 --- a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Barrier Function Adapter for OmniSafe.""" +"""Barrier Function Adapter with Beta Distribution for OmniSafe.""" from __future__ import annotations @@ -25,7 +25,7 @@ from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter from omnisafe.common.buffer import VectorOnPolicyBuffer from omnisafe.common.logger import Logger -from omnisafe.envs.wrapper import CostNormalize, RewardNormalize, Unsqueeze +from omnisafe.envs.wrapper import AutoReset, CostNormalize, RewardNormalize, TimeLimit, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic from omnisafe.utils.config import Config @@ -157,13 +157,24 @@ def _wrapper( cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True. """ assert not obs_normalize, 'Barrier function does not support observation normalization!' + if self._env.need_time_limit_wrapper: + assert ( + self._env.max_episode_steps + ), 'You must define max_episode_steps as an integer\ + \nor cancel the use of the time_limit wrapper.' 
+ self._env = TimeLimit( + self._env, + time_limit=self._env.max_episode_steps, + device=self._device, + ) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env, device=self._device) if reward_normalize: self._env = RewardNormalize(self._env, device=self._device) if cost_normalize: self._env = CostNormalize(self._env, device=self._device) if self._env.num_envs == 1: self._env = Unsqueeze(self._env, device=self._device) - self._eval_env = Unsqueeze(self._eval_env, device=self._device) def rollout( # pylint: disable=too-many-locals self, @@ -203,7 +214,6 @@ def rollout( # pylint: disable=too-many-locals if self._cfgs.algo_cfgs.use_cost: logger.store({'Value/cost': value_c}) logger.store({'Value/reward': value_r}) - logger.store({'Metrics/angle': info.get('original_cost', cost).cpu()}) buffer.store( obs=obs, diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py index 49bf7909c..f40a7add9 100644 --- a/omnisafe/adapter/offpolicy_barrier_function_adapter.py +++ b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -68,7 +68,6 @@ def _wrapper( self._env = CostNormalize(self._env, device=self._device) if self._env.num_envs == 1: self._env = Unsqueeze(self._env, device=self._device) - self._eval_env = Unsqueeze(self._eval_env, device=self._device) def eval_policy( # pylint: disable=too-many-locals self, @@ -83,6 +82,7 @@ def eval_policy( # pylint: disable=too-many-locals agent (ConstraintActorCritic): Agent. logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. 
""" + assert self._eval_env for _ in range(episode): ep_ret, ep_cost, ep_len = 0.0, 0.0, 0 obs, _ = self._eval_env.reset() diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py index f56674319..8da2cf658 100644 --- a/omnisafe/adapter/robust_barrier_function_adapter.py +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -76,7 +76,6 @@ def _wrapper( self._env = CostNormalize(self._env, device=self._device) if self._env.num_envs == 1: self._env = Unsqueeze(self._env, device=self._device) - self._eval_env = Unsqueeze(self._eval_env, device=self._device) def set_solver(self, solver: CBFQPLayer) -> None: """Set the barrier function solver for Pendulum environment.""" @@ -101,6 +100,7 @@ def eval_policy( # pylint: disable=too-many-locals agent (ConstraintActorCritic): Agent. logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. """ + assert self._eval_env for _ in range(episode): ep_ret, ep_cost, ep_len = 0.0, 0.0, 0 obs, _ = self._eval_env.reset() diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py index 5a297c49f..1e14ebd26 100644 --- a/omnisafe/algorithms/off_policy/__init__.py +++ b/omnisafe/algorithms/off_policy/__init__.py @@ -28,4 +28,17 @@ from omnisafe.algorithms.off_policy.td3_pid import TD3PID -__all__ = ['DDPG', 'TD3', 'SAC', 'DDPGLag', 'TD3Lag', 'SACLag', 'DDPGPID', 'TD3PID', 'SACPID', 'SACRCBF', 'DDPGCBF', 'CRABS'] +__all__ = [ + 'DDPG', + 'TD3', + 'SAC', + 'DDPGLag', + 'TD3Lag', + 'SACLag', + 'DDPGPID', + 'TD3PID', + 'SACPID', + 'SACRCBF', + 'DDPGCBF', + 'CRABS', +] diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py index 32b27be1d..de556372b 100644 --- a/omnisafe/algorithms/off_policy/ddpg_cbf.py +++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py @@ -126,8 +126,5 @@ def _log_what_to_save(self) -> dict[str, Any]: what_to_save['pi'] = self._actor_critic.actor 
what_to_save['compensator'] = self._env.compensator - if self._cfgs.algo_cfgs.obs_normalize: - obs_normalizer = self._env.save()['obs_normalizer'] - what_to_save['obs_normalizer'] = obs_normalizer self._logger.setup_torch_saver(what_to_save) diff --git a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py index 72238e41a..8125151d6 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py @@ -112,7 +112,6 @@ def _update(self) -> None: act, logp, target_value_r, - target_value_c, adv_r, adv_c, approx_compensating_act, @@ -122,7 +121,6 @@ def _update(self) -> None: data['act'], data['logp'], data['target_value_r'], - data['target_value_c'], data['adv_r'], data['adv_c'], data['approx_compensating_act'], @@ -136,7 +134,7 @@ def _update(self) -> None: compensating_act=compensating_act, ) dataloader = DataLoader( - dataset=TensorDataset(obs, target_value_r, target_value_c), + dataset=TensorDataset(obs, target_value_r), batch_size=self._cfgs.algo_cfgs.batch_size, shuffle=True, ) @@ -145,11 +143,8 @@ def _update(self) -> None: for ( obs, target_value_r, - target_value_c, ) in dataloader: self._update_reward_critic(obs, target_value_r) - if self._cfgs.algo_cfgs.use_cost: - self._update_cost_critic(obs, target_value_c) self._logger.store( { diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py index 3e14d002c..62499352b 100644 --- a/omnisafe/common/robust_barrier_solver.py +++ b/omnisafe/common/robust_barrier_solver.py @@ -19,6 +19,7 @@ # pylint: disable=invalid-name,wrong-spelling-in-docstring from __future__ import annotations +import warnings from typing import Any import gymnasium as gym @@ -58,6 +59,7 @@ def __init__( self.k_d = k_d self.l_p = l_p self.action_dim = env.action_space.shape[0] + warnings.filterwarnings('ignore') def get_safe_action( self, diff --git 
a/omnisafe/envs/__init__.py b/omnisafe/envs/__init__.py index c21b1973c..fb1bf03b1 100644 --- a/omnisafe/envs/__init__.py +++ b/omnisafe/envs/__init__.py @@ -15,15 +15,14 @@ """Environment API for OmniSafe.""" from omnisafe.envs import classic_control -from omnisafe.envs.barrier_function_env import BarrierFunctionEnv +from omnisafe.envs.classic_control.envs_from_cbf import BarrierFunctionEnv +from omnisafe.envs.classic_control.envs_from_rcbf import RobustBarrierFunctionEnv from omnisafe.envs.core import CMDP, env_register, make, support_envs from omnisafe.envs.crabs_env import CRABSEnv from omnisafe.envs.custom_env import CustomEnv from omnisafe.envs.meta_drive_env import SafetyMetaDriveEnv from omnisafe.envs.barrier_function_env import BarrierFunctionEnv from omnisafe.envs.mujoco_env import MujocoEnv -from omnisafe.envs.robust_barrier_function_env import RobustBarrierFunctionEnv from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv from omnisafe.envs.safety_gymnasium_modelbased import SafetyGymnasiumModelBased from omnisafe.envs.safety_isaac_gym_env import SafetyIsaacGymEnv -from omnisafe.envs.robust_barrier_function_env import RobustBarrierFunctionEnv diff --git a/omnisafe/envs/classic_control/__init__.py b/omnisafe/envs/classic_control/__init__.py index d899a41de..9d5a3ba99 100644 --- a/omnisafe/envs/classic_control/__init__.py +++ b/omnisafe/envs/classic_control/__init__.py @@ -13,4 +13,7 @@ # limitations under the License. 
# ============================================================================== """Environment implementations from papers.""" + from omnisafe.envs.classic_control import envs_from_crabs +from omnisafe.envs.classic_control.envs_from_cbf import BarrierFunctionEnv +from omnisafe.envs.classic_control.envs_from_rcbf import RobustBarrierFunctionEnv diff --git a/omnisafe/envs/barrier_function_env.py b/omnisafe/envs/classic_control/envs_from_cbf.py similarity index 95% rename from omnisafe/envs/barrier_function_env.py rename to omnisafe/envs/classic_control/envs_from_cbf.py index 01477c1fe..c46012b8d 100644 --- a/omnisafe/envs/barrier_function_env.py +++ b/omnisafe/envs/classic_control/envs_from_cbf.py @@ -14,6 +14,9 @@ # ============================================================================== """Interface of control barrier function-based environments.""" +# mypy: ignore-errors +# pylint: disable=all + from __future__ import annotations from typing import Any, ClassVar @@ -72,7 +75,11 @@ def __init__( super().__init__(env_id) self._env_id = env_id if num_envs == 1: - self._env = gymnasium.make(id=env_id, autoreset=False) + self._env = gymnasium.make( + id=env_id, + autoreset=False, + render_mode=kwargs.get('render_mode'), + ) self._env_specific_setting() assert isinstance(self._env.action_space, Box), 'Only support Box action space.' 
assert isinstance( @@ -103,7 +110,7 @@ def _env_specific_setting(self) -> None: high=self._env.unwrapped.max_torque, # type: ignore shape=(1,), ) - high = np.array([1.0, 1.0, self._env.unwrapped.max_speed]) # type: ignore + high = np.array([1.0, 1.0, self._env.unwrapped.max_speed], dtype=np.float32) # type: ignore self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) self._env.dt = 0.05 # type: ignore @@ -197,6 +204,11 @@ def reset( obs, info = self._env.reset(options=options) return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info + @property + def max_episode_steps(self) -> int: + """The max steps per episode.""" + return self._env.spec.max_episode_steps + def set_seed(self, seed: int) -> None: """Set the seed for the environment. diff --git a/omnisafe/envs/robust_barrier_function_env.py b/omnisafe/envs/classic_control/envs_from_rcbf.py similarity index 51% rename from omnisafe/envs/robust_barrier_function_env.py rename to omnisafe/envs/classic_control/envs_from_rcbf.py index 9bce446ce..e97aaaea5 100644 --- a/omnisafe/envs/robust_barrier_function_env.py +++ b/omnisafe/envs/classic_control/envs_from_rcbf.py @@ -14,18 +14,202 @@ # ============================================================================== """Interface of control barrier function-based environments.""" +# mypy: ignore-errors +# pylint: disable=all + from __future__ import annotations -from typing import Any, ClassVar +from collections.abc import Iterable +from typing import Any, Callable, ClassVar +import gymnasium import numpy as np import torch +from gymnasium import spaces from omnisafe.envs.core import CMDP, env_register -from omnisafe.envs.unicycle_env import UnicycleEnv from omnisafe.typing import Box +def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: + if isinstance(meas_cm, Iterable): + return 1.5 * 37.795 * meas_cm + np.array(shift) + + return 1.5 * 37.795 * meas_cm + shift + + +class UnicycleEnv(gymnasium.Env): + + def 
__init__(self) -> None: + + super().__init__() + + self.dynamics_mode = 'Unicycle' + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,)) + self.safe_action_space = spaces.Box(low=-2.5, high=2.5, shape=(2,)) + self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,)) + self.bds = np.array([[-3.0, -3.0], [3.0, 3.0]]) + + self.dt = 0.02 + self.max_episode_steps = 1000 + self.reward_goal = 1.0 + self.goal_size = 0.3 + # Initialize Env + self.state = None + self.episode_step = 0 + self.initial_state = np.array( + [[-2.5, -2.5, 0.0], [-2.5, 2.5, 0.0], [-2.5, 0.0, 0.0], [2.5, -2.5, np.pi / 2]], + ) + self.goal_pos = np.array([2.5, 2.5]) + self.rand_init = False + + self.reset() + + # Get Dynamics + self.get_f, self.get_g = self._get_dynamics() + # Disturbance + self.disturb_mean = np.zeros((3,)) + self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 + + # Build Hazards + self.hazards = [] + + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([0.0, 0.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, 1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, -1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, -1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, 1.0])}, + ) + + # Viewer + self.viewer = None + + def step( + self, + action: np.ndarray, + ) -> tuple[np.ndarray, float, float, bool, bool, dict[str, Any]]: + """Step the environment.""" + action = np.clip(action, -1.0, 1.0) + state, reward, cost, terminated, truncated, info = self._step(action) + return self.get_obs(), reward, cost, terminated, truncated, info + + def _step(self, action: np.ndarray) -> tuple: + """The details of step dynamics.""" + self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) + self.state -= self.dt * 
0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) + + self.episode_step += 1 + + dist_goal = self._goal_dist() + reward = self.last_goal_dist - dist_goal + self.last_goal_dist = dist_goal + terminated = False + if self.goal_met(): + reward += self.reward_goal + terminated = True + truncated = self.episode_step >= self.max_episode_steps + + cost = 0.0 + for hazard in self.hazards: + if hazard['type'] == 'circle': + cost += 0.1 * ( + np.sum((self.state[:2] - hazard['location']) ** 2) < hazard['radius'] ** 2 + ) + + return self.state, reward, cost, terminated, truncated, {} + + def goal_met(self) -> bool: + return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size + + def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: + self.episode_step = 0 + + if self.rand_init: + self.state = np.copy(self.initial_state[np.random.randint(self.initial_state.shape[0])]) + else: + self.state = np.copy(self.initial_state[0]) + + self.last_goal_dist = self._goal_dist() + + return self.get_obs(), {} + + def render(self, mode: str = 'human') -> np.ndarray: + """Get the image of the running environment.""" + raise NotImplementedError + + def get_obs(self) -> np.ndarray: + """Given the state, this function returns corresponding observation. + + Returns: + Observation: np.ndarray. 
+ """ + + rel_loc = self.goal_pos - self.state[:2] + goal_dist = np.linalg.norm(rel_loc) + goal_compass = self.obs_compass() # compass to the goal + + return np.array( + [ + self.state[0], + self.state[1], + np.cos(self.state[2]), + np.sin(self.state[2]), + goal_compass[0], + goal_compass[1], + np.exp(-goal_dist), + ], + ) + + def obs_compass(self) -> np.ndarray: + """Return a robot-centric compass observation of a list of positions.""" + + # Get ego vector in world frame + vec = self.goal_pos - self.state[:2] + # Rotate into frame + R = np.array( + [ + [np.cos(self.state[2]), -np.sin(self.state[2])], + [np.sin(self.state[2]), np.cos(self.state[2])], + ], + ) + vec = np.matmul(vec, R) + # Normalize + vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 + return vec + + def _get_dynamics(self) -> tuple[Callable, Callable]: + + def get_f(state: np.ndarray) -> np.ndarray: + """Function to compute the drift dynamics 'f(x)' of the system.""" + return np.zeros(state.shape) + + def get_g(state: np.ndarray) -> np.ndarray: + """Function to compute the control dynamics 'g(x)' of the system.""" + theta = state[2] + return np.array([[np.cos(theta), 0], [np.sin(theta), 0], [0, 1.0]]) + + return get_f, get_g + + def _goal_dist(self) -> np.ndarray: + """Calculate the distance between the goal.""" + return np.linalg.norm(self.goal_pos - self.state[:2]) + + def close(self) -> None: + """Close the instance of environment.""" + if self.viewer: + self.viewer.close() + self.viewer = None + + @env_register class RobustBarrierFunctionEnv(CMDP): """Interface of control barrier function-based environments. 
diff --git a/omnisafe/envs/unicycle_env.py b/omnisafe/envs/unicycle_env.py deleted file mode 100644 index dd0515fba..000000000 --- a/omnisafe/envs/unicycle_env.py +++ /dev/null @@ -1,265 +0,0 @@ -# pylint: disable=all -# mypy: ignore-errors -from __future__ import annotations - -from collections.abc import Iterable -from typing import Any, Callable - -import gymnasium as gym -import numpy as np -from gymnasium import spaces - - -def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: - if isinstance(meas_cm, Iterable): - return 1.5 * 37.795 * meas_cm + np.array(shift) - - return 1.5 * 37.795 * meas_cm + shift - - -class UnicycleEnv(gym.Env): - - def __init__(self) -> None: - - super().__init__() - - self.dynamics_mode = 'Unicycle' - self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,)) - self.safe_action_space = spaces.Box(low=-2.5, high=2.5, shape=(2,)) - self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,)) - self.bds = np.array([[-3.0, -3.0], [3.0, 3.0]]) - - self.dt = 0.02 - self.max_episode_steps = 1000 - self.reward_goal = 1.0 - self.goal_size = 0.3 - # Initialize Env - self.state = None - self.episode_step = 0 - self.initial_state = np.array( - [[-2.5, -2.5, 0.0], [-2.5, 2.5, 0.0], [-2.5, 0.0, 0.0], [2.5, -2.5, np.pi / 2]], - ) - self.goal_pos = np.array([2.5, 2.5]) - self.rand_init = False - - self.reset() - - # Get Dynamics - self.get_f, self.get_g = self._get_dynamics() - # Disturbance - self.disturb_mean = np.zeros((3,)) - self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 - - # Build Hazards - self.hazards = [] - - self.hazards.append( - {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([0.0, 0.0])}, - ) - self.hazards.append( - {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, 1.0])}, - ) - self.hazards.append( - {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, -1.0])}, - ) - self.hazards.append( - {'type': 'circle', 'radius': 0.6, 'location': 1.5 * 
np.array([1.0, -1.0])}, - ) - self.hazards.append( - {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, 1.0])}, - ) - - # Viewer - self.viewer = None - - def step( - self, - action: np.ndarray, - ) -> tuple[np.ndarray, float, float, bool, bool, dict[str, Any]]: - action = np.clip(action, -1.0, 1.0) - state, reward, cost, terminated, truncated, info = self._step(action) - return self.get_obs(), reward, cost, terminated, truncated, info - - def _step(self, action: np.ndarray) -> tuple: - self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) - self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) - - self.episode_step += 1 - - dist_goal = self._goal_dist() - reward = self.last_goal_dist - dist_goal - self.last_goal_dist = dist_goal - terminated = False - if self.goal_met(): - reward += self.reward_goal - terminated = True - truncated = self.episode_step >= self.max_episode_steps - - cost = 0.0 - for hazard in self.hazards: - if hazard['type'] == 'circle': - cost += 0.1 * ( - np.sum((self.state[:2] - hazard['location']) ** 2) < hazard['radius'] ** 2 - ) - - return self.state, reward, cost, terminated, truncated, {} - - def goal_met(self) -> bool: - return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size - - def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: - self.episode_step = 0 - - if self.rand_init: - self.state = np.copy(self.initial_state[np.random.randint(self.initial_state.shape[0])]) - else: - self.state = np.copy(self.initial_state[0]) - - self.last_goal_dist = self._goal_dist() - - return self.get_obs(), {} - - def render(self, mode: str = 'human') -> np.ndarray: - if mode != 'human' and mode != 'rgb_array': - rel_loc = self.goal_pos - self.state[:2] - theta_error = np.arctan2(rel_loc[1], rel_loc[0]) - self.state[2] - print( - f'Ep_step = {self.episode_step}, \tState = {self.state}, \tDist2Goal = {self._goal_dist()}, alignment_error 
= {theta_error}', - ) - - screen_width = 600 - screen_height = 400 - - if self.viewer is None: - from envs import pyglet_rendering - - self.viewer = pyglet_rendering.Viewer(screen_width, screen_height) - # Draw obstacles - obstacles = [] - for i in range(len(self.hazards)): - if self.hazards[i]['type'] == 'circle': - obstacles.append( - pyglet_rendering.make_circle( - radius=to_pixel(self.hazards[i]['radius'], shift=0), - filled=True, - ), - ) - obs_trans = pyglet_rendering.Transform( - translation=( - to_pixel(self.hazards[i]['location'][0], shift=screen_width / 2), - to_pixel(self.hazards[i]['location'][1], shift=screen_height / 2), - ), - ) - obstacles[i].set_color(1.0, 0.0, 0.0) - obstacles[i].add_attr(obs_trans) - elif self.hazards[i]['type'] == 'polygon': - obstacles.append( - pyglet_rendering.make_polygon( - to_pixel( - self.hazards[i]['vertices'], - shift=[screen_width / 2, screen_height / 2], - ), - filled=True, - ), - ) - self.viewer.add_geom(obstacles[i]) - - # Make Goal - goal = pyglet_rendering.make_circle(radius=to_pixel(0.1, shift=0), filled=True) - goal_trans = pyglet_rendering.Transform( - translation=( - to_pixel(self.goal_pos[0], shift=screen_width / 2), - to_pixel(self.goal_pos[1], shift=screen_height / 2), - ), - ) - goal.add_attr(goal_trans) - goal.set_color(0.0, 0.5, 0.0) - self.viewer.add_geom(goal) - - # Make Robot - self.robot = pyglet_rendering.make_circle(radius=to_pixel(0.1), filled=True) - self.robot_trans = pyglet_rendering.Transform( - translation=( - to_pixel(self.state[0], shift=screen_width / 2), - to_pixel(self.state[1], shift=screen_height / 2), - ), - ) - self.robot_trans.set_rotation(self.state[2]) - self.robot.add_attr(self.robot_trans) - self.robot.set_color(0.5, 0.5, 0.8) - self.viewer.add_geom(self.robot) - self.robot_orientation = pyglet_rendering.Line(start=(0.0, 0.0), end=(15.0, 0.0)) - self.robot_orientation.linewidth.stroke = 2 - self.robot_orientation.add_attr(self.robot_trans) - self.robot_orientation.set_color(0, 
0, 0) - self.viewer.add_geom(self.robot_orientation) - - if self.state is None: - return None - - self.robot_trans.set_translation( - to_pixel(self.state[0], shift=screen_width / 2), - to_pixel(self.state[1], shift=screen_height / 2), - ) - self.robot_trans.set_rotation(self.state[2]) - - return self.viewer.render(return_rgb_array=mode == 'rgb_array') - - def get_obs(self) -> np.ndarray: - """Given the state, this function returns corresponding observation. - - Returns: - Observation: [pos_x, pos_y, cos(theta), sin(theta), xdir2goal, ydir2goal, exp(-dist2goal)] - """ - - rel_loc = self.goal_pos - self.state[:2] - goal_dist = np.linalg.norm(rel_loc) - goal_compass = self.obs_compass() # compass to the goal - - return np.array( - [ - self.state[0], - self.state[1], - np.cos(self.state[2]), - np.sin(self.state[2]), - goal_compass[0], - goal_compass[1], - np.exp(-goal_dist), - ], - ) - - def _get_dynamics(self) -> tuple[Callable, Callable]: - - def get_f(state: np.ndarray) -> np.ndarray: - """Function to compute the drift dynamics 'f(x)' of the system.""" - return np.zeros(state.shape) - - def get_g(state: np.ndarray) -> np.ndarray: - """Function to compute the control dynamics 'g(x)' of the system.""" - theta = state[2] - return np.array([[np.cos(theta), 0], [np.sin(theta), 0], [0, 1.0]]) - - return get_f, get_g - - def obs_compass(self) -> np.ndarray: - # Get ego vector in world frame - vec = self.goal_pos - self.state[:2] - # Rotate into frame - R = np.array( - [ - [np.cos(self.state[2]), -np.sin(self.state[2])], - [np.sin(self.state[2]), np.cos(self.state[2])], - ], - ) - vec = np.matmul(vec, R) - # Normalize - vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 - return vec - - def _goal_dist(self) -> np.ndarray: - return np.linalg.norm(self.goal_pos - self.state[:2]) - - def close(self) -> None: - if self.viewer: - self.viewer.close() - self.viewer = None diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index c94c38389..a1240a334 100644 --- 
a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -39,7 +39,6 @@ SafeARCPlanner, ) from omnisafe.common import Normalizer -<<<<<<< HEAD from omnisafe.common.control_barrier_function.crabs.models import ( AddGaussianNoise, CrabsCore, @@ -50,12 +49,10 @@ from omnisafe.common.control_barrier_function.crabs.optimizers import Barrier from omnisafe.common.control_barrier_function.crabs.utils import Normalizer as CRABSNormalizer from omnisafe.common.control_barrier_function.crabs.utils import create_model_and_trainer -======= from omnisafe.common.barrier_comp import BarrierCompensator from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.robust_barrier_solver import CBFQPLayer from omnisafe.common.robust_gp_model import DynamicsModel ->>>>>>> wip from omnisafe.envs.core import CMDP, make from omnisafe.envs.wrapper import ActionRepeat, ActionScale, ObsNormalize, TimeLimit from omnisafe.models.actor import ActorBuilder @@ -648,6 +645,33 @@ def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branc ).reshape( -1, # to make sure the shape is (act_dim,) ) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + approx_compensating_act = self._compensator(obs=obs) + compensated_act_mean_raw = act + approx_compensating_act + [f, g, x, std] = self._solver.get_gp_dynamics(obs, use_prev_model=False) + compensating_act = self._solver.control_barrier( + compensated_act_mean_raw, + f, + g, + x, + std, + ) + act = compensated_act_mean_raw + compensating_act + + if self._cfgs['algo'] == 'SACRCBF': + state_batch = self._dynamics_model.get_state(obs) + mean_pred_batch, sigma_pred_batch = ( + self._dynamics_model.predict_disturbance( + state_batch, + ) + ) + safe_act = self._solver.get_safe_action( + state_batch, + act, + mean_pred_batch, + sigma_pred_batch, + ) + act = safe_act elif self._planner is not None: act = self._planner.output_action( obs.unsqueeze(0).to('cpu'), diff --git a/tests/test_policy.py 
b/tests/test_policy.py index 79810d0b9..8492e2193 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -38,6 +38,8 @@ pid_lagrange_policy = ['TRPOPID', 'CPPOPID'] early_terminated_policy = ['TRPOEarlyTerminated', 'PPOEarlyTerminated'] offline_policy = ['BCQ', 'BCQLag', 'CRR', 'CCRR', 'VAEBC'] +cbf_policy = ['TRPOCBF', 'DDPGCBF', 'PPOBetaCBF'] +auto_alpha = [True, False] model_cfgs = { 'linear_lr_decay': True, @@ -52,6 +54,52 @@ optim_case = [0, 1, 2, 3, 4] +@helpers.parametrize(algo=cbf_policy) +def test_cbf(algo): + env_id = 'Pendulum-v1' + + custom_cfgs = { + 'train_cfgs': { + 'total_steps': 200, + 'vector_env_nums': 1, + 'torch_threads': 4, + }, + 'algo_cfgs': { + 'steps_per_epoch': 200, + }, + 'logger_cfgs': { + 'use_wandb': False, + 'save_model_freq': 1, + }, + } + agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs) + agent.learn() + + +@helpers.parametrize(auto_alpha=auto_alpha) +def test_rcbf(auto_alpha): + env_id = 'Unicycle' + + custom_cfgs = { + 'train_cfgs': { + 'total_steps': 1000, + 'vector_env_nums': 1, + 'torch_threads': 4, + }, + 'algo_cfgs': { + 'start_learning_steps': 998, + 'update_iters': 1, + 'auto_alpha': auto_alpha, + }, + 'logger_cfgs': { + 'use_wandb': False, + 'save_model_freq': 1, + }, + } + agent = omnisafe.Agent('SACRCBF', env_id, custom_cfgs=custom_cfgs) + agent.learn() + + @helpers.parametrize(optim_case=optim_case) def test_cpo(optim_case): agent = omnisafe.Agent('CPO', 'Test-v0', custom_cfgs={}) @@ -337,9 +385,6 @@ def test_off_lag_policy(algo): agent.learn() -auto_alpha = [True, False] - - @helpers.parametrize(auto_alpha=auto_alpha) def test_sac_policy(auto_alpha): """Test sac algorithms.""" From 259975af3dfac3b601c071c3d6a2c976dec3b3a0 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Wed, 8 May 2024 23:13:40 +0800 Subject: [PATCH 05/18] chore: update pytest --- omnisafe/envs/__init__.py | 5 +- .../envs_from_cbf.py => cbf_env.py} | 0 omnisafe/envs/classic_control/__init__.py | 4 +- 
.../envs/classic_control/envs_from_rcbf.py | 168 +--------------- omnisafe/envs/rcbf_env.py | 187 ++++++++++++++++++ omnisafe/evaluator.py | 4 +- 6 files changed, 193 insertions(+), 175 deletions(-) rename omnisafe/envs/{classic_control/envs_from_cbf.py => cbf_env.py} (100%) create mode 100644 omnisafe/envs/rcbf_env.py diff --git a/omnisafe/envs/__init__.py b/omnisafe/envs/__init__.py index fb1bf03b1..095a1134c 100644 --- a/omnisafe/envs/__init__.py +++ b/omnisafe/envs/__init__.py @@ -15,14 +15,13 @@ """Environment API for OmniSafe.""" from omnisafe.envs import classic_control -from omnisafe.envs.classic_control.envs_from_cbf import BarrierFunctionEnv -from omnisafe.envs.classic_control.envs_from_rcbf import RobustBarrierFunctionEnv +from omnisafe.envs.cbf_env import BarrierFunctionEnv from omnisafe.envs.core import CMDP, env_register, make, support_envs from omnisafe.envs.crabs_env import CRABSEnv from omnisafe.envs.custom_env import CustomEnv from omnisafe.envs.meta_drive_env import SafetyMetaDriveEnv -from omnisafe.envs.barrier_function_env import BarrierFunctionEnv from omnisafe.envs.mujoco_env import MujocoEnv +from omnisafe.envs.rcbf_env import RobustBarrierFunctionEnv from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv from omnisafe.envs.safety_gymnasium_modelbased import SafetyGymnasiumModelBased from omnisafe.envs.safety_isaac_gym_env import SafetyIsaacGymEnv diff --git a/omnisafe/envs/classic_control/envs_from_cbf.py b/omnisafe/envs/cbf_env.py similarity index 100% rename from omnisafe/envs/classic_control/envs_from_cbf.py rename to omnisafe/envs/cbf_env.py diff --git a/omnisafe/envs/classic_control/__init__.py b/omnisafe/envs/classic_control/__init__.py index 9d5a3ba99..9c8e7b35a 100644 --- a/omnisafe/envs/classic_control/__init__.py +++ b/omnisafe/envs/classic_control/__init__.py @@ -14,6 +14,4 @@ # ============================================================================== """Environment implementations from papers.""" -from 
omnisafe.envs.classic_control import envs_from_crabs -from omnisafe.envs.classic_control.envs_from_cbf import BarrierFunctionEnv -from omnisafe.envs.classic_control.envs_from_rcbf import RobustBarrierFunctionEnv +from omnisafe.envs.classic_control import envs_from_crabs, envs_from_rcbf diff --git a/omnisafe/envs/classic_control/envs_from_rcbf.py b/omnisafe/envs/classic_control/envs_from_rcbf.py index e97aaaea5..bdf469876 100644 --- a/omnisafe/envs/classic_control/envs_from_rcbf.py +++ b/omnisafe/envs/classic_control/envs_from_rcbf.py @@ -20,16 +20,12 @@ from __future__ import annotations from collections.abc import Iterable -from typing import Any, Callable, ClassVar +from typing import Any, Callable import gymnasium import numpy as np -import torch from gymnasium import spaces -from omnisafe.envs.core import CMDP, env_register -from omnisafe.typing import Box - def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: if isinstance(meas_cm, Iterable): @@ -171,10 +167,7 @@ def get_obs(self) -> np.ndarray: def obs_compass(self) -> np.ndarray: """Return a robot-centric compass observation of a list of positions.""" - - # Get ego vector in world frame vec = self.goal_pos - self.state[:2] - # Rotate into frame R = np.array( [ [np.cos(self.state[2]), -np.sin(self.state[2])], @@ -182,7 +175,6 @@ def obs_compass(self) -> np.ndarray: ], ) vec = np.matmul(vec, R) - # Normalize vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 return vec @@ -208,161 +200,3 @@ def close(self) -> None: if self.viewer: self.viewer.close() self.viewer = None - - -@env_register -class RobustBarrierFunctionEnv(CMDP): - """Interface of control barrier function-based environments. - - .. warning:: - Since environments based on control barrier functions require special judgment and control - of environmental dynamics, they do not support the use of vectorized environments for - parallelization. - - Attributes: - need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. 
- need_time_limit_wrapper (bool): Whether to use time limit wrapper. - """ - - need_auto_reset_wrapper = True - need_time_limit_wrapper = False - _support_envs: ClassVar[list[str]] = [ - 'Unicycle', - ] - - def __init__( - self, - env_id: str, - num_envs: int = 1, - device: str = 'cpu', - **kwargs: Any, - ) -> None: - """Initialize the environment. - - Args: - env_id (str): Environment id. - num_envs (int, optional): Number of environments. Defaults to 1. - device (torch.device, optional): Device to store the data. Defaults to 'cpu'. - - Keyword Args: - render_mode (str, optional): The render mode, ranging from ``human``, ``rgb_array``, ``rgb_array_list``. - Defaults to ``rgb_array``. - camera_name (str, optional): The camera name. - camera_id (int, optional): The camera id. - width (int, optional): The width of the rendered image. Defaults to 256. - height (int, optional): The height of the rendered image. Defaults to 256. - """ - super().__init__(env_id) - self._env_id = env_id - if num_envs == 1: - if self._env_id == 'Unicycle': - self._env = UnicycleEnv() - else: - raise NotImplementedError('Only support Unicycle now.') - assert isinstance(self._env.action_space, Box), 'Only support Box action space.' - assert isinstance( - self._env.observation_space, - Box, - ), 'Only support Box observation space.' - self._action_space = self._env.action_space - self._observation_space = self._env.observation_space - else: - raise NotImplementedError('Only support num_envs=1 now.') - self._device = torch.device(device) - - self._num_envs = num_envs - self._metadata = self._env.metadata - - def step( - self, - action: torch.Tensor, - ) -> tuple[ - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - torch.Tensor, - dict[str, Any], - ]: - """Step the environment. - - .. note:: - - OmniSafe use auto reset wrapper to reset the environment when the episode is - terminated. So the ``obs`` will be the first observation of the next episode. 
- And the true ``final_observation`` in ``info`` will be stored in the ``final_observation`` key of ``info``. - - Args: - action (torch.Tensor): Action to take. - - Returns: - observation: Agent's observation of the current environment. - reward: Amount of reward returned after previous action. - cost: Amount of cost returned after previous action. - terminated: Whether the episode has ended. - truncated: Whether the episode has been truncated due to a time limit. - info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). - """ - obs, reward, cost, terminated, truncated, info = self._env.step( - action.detach().cpu().numpy(), - ) - obs, reward, cost, terminated, truncated = ( - torch.as_tensor(x, dtype=torch.float32, device=self._device) - for x in (obs, reward, cost, terminated, truncated) - ) - if 'final_observation' in info: - info['final_observation'] = np.array( - [ - array if array is not None else np.zeros(obs.shape[-1]) - for array in info['final_observation'] - ], - ) - info['final_observation'] = torch.as_tensor( - info['final_observation'], - dtype=torch.float32, - device=self._device, - ) - - return obs, reward, cost, terminated, truncated, info - - def reset( - self, - seed: int | None = None, - options: dict[str, Any] | None = None, - ) -> tuple[torch.Tensor, dict]: - """Reset the environment. - - Args: - seed (int, optional): The random seed. Defaults to None. - options (dict[str, Any], optional): The options for the environment. Defaults to None. - - Returns: - observation: Agent's observation of the current environment. - info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). - """ - obs, info = self._env.reset(seed=seed, options=options) - return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info - - def set_seed(self, seed: int) -> None: - """Set the seed for the environment. - - Args: - seed (int): Seed to set. 
- """ - self.reset(seed=seed) - - def render(self) -> Any: - """Render the environment. - - Returns: - Rendered environment. - """ - return self._env.render() - - def close(self) -> None: - """Close the environment.""" - self._env.close() - - def __getattr__(self, name: str) -> Any: - """Return the unwrapped environment attributes.""" - return getattr(self._env, name) diff --git a/omnisafe/envs/rcbf_env.py b/omnisafe/envs/rcbf_env.py new file mode 100644 index 000000000..f97586dc3 --- /dev/null +++ b/omnisafe/envs/rcbf_env.py @@ -0,0 +1,187 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Interface of control barrier function-based environments.""" + +# mypy: ignore-errors +# pylint: disable=all + +from __future__ import annotations + +from typing import Any, ClassVar + +import numpy as np +import torch + +from omnisafe.envs.classic_control.envs_from_rcbf import UnicycleEnv +from omnisafe.envs.core import CMDP, env_register +from omnisafe.typing import Box + + +@env_register +class RobustBarrierFunctionEnv(CMDP): + """Interface of control barrier function-based environments. + + .. warning:: + Since environments based on control barrier functions require special judgment and control + of environmental dynamics, they do not support the use of vectorized environments for + parallelization. 
+ + Attributes: + need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. + need_time_limit_wrapper (bool): Whether to use time limit wrapper. + """ + + need_auto_reset_wrapper = True + need_time_limit_wrapper = False + _support_envs: ClassVar[list[str]] = [ + 'Unicycle', + ] + + def __init__( + self, + env_id: str, + num_envs: int = 1, + device: str = 'cpu', + **kwargs: Any, + ) -> None: + """Initialize the environment. + + Args: + env_id (str): Environment id. + num_envs (int, optional): Number of environments. Defaults to 1. + device (torch.device, optional): Device to store the data. Defaults to 'cpu'. + + Keyword Args: + render_mode (str, optional): The render mode, ranging from ``human``, ``rgb_array``, ``rgb_array_list``. + Defaults to ``rgb_array``. + camera_name (str, optional): The camera name. + camera_id (int, optional): The camera id. + width (int, optional): The width of the rendered image. Defaults to 256. + height (int, optional): The height of the rendered image. Defaults to 256. + """ + super().__init__(env_id) + self._env_id = env_id + if num_envs == 1: + if self._env_id == 'Unicycle': + self._env = UnicycleEnv() + else: + raise NotImplementedError('Only support Unicycle now.') + assert isinstance(self._env.action_space, Box), 'Only support Box action space.' + assert isinstance( + self._env.observation_space, + Box, + ), 'Only support Box observation space.' + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + else: + raise NotImplementedError('Only support num_envs=1 now.') + self._device = torch.device(device) + + self._num_envs = num_envs + self._metadata = self._env.metadata + + def step( + self, + action: torch.Tensor, + ) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict[str, Any], + ]: + """Step the environment. + + .. note:: + + OmniSafe use auto reset wrapper to reset the environment when the episode is + terminated. 
So the ``obs`` will be the first observation of the next episode. + And the true ``final_observation`` in ``info`` will be stored in the ``final_observation`` key of ``info``. + + Args: + action (torch.Tensor): Action to take. + + Returns: + observation: Agent's observation of the current environment. + reward: Amount of reward returned after previous action. + cost: Amount of cost returned after previous action. + terminated: Whether the episode has ended. + truncated: Whether the episode has been truncated due to a time limit. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + obs, reward, cost, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, cost, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, cost, terminated, truncated) + ) + if 'final_observation' in info: + info['final_observation'] = np.array( + [ + array if array is not None else np.zeros(obs.shape[-1]) + for array in info['final_observation'] + ], + ) + info['final_observation'] = torch.as_tensor( + info['final_observation'], + dtype=torch.float32, + device=self._device, + ) + + return obs, reward, cost, terminated, truncated, info + + def reset( + self, + seed: int | None = None, + options: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, dict]: + """Reset the environment. + + Args: + seed (int, optional): The random seed. Defaults to None. + options (dict[str, Any], optional): The options for the environment. Defaults to None. + + Returns: + observation: Agent's observation of the current environment. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + obs, info = self._env.reset(seed=seed, options=options) + return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info + + def set_seed(self, seed: int) -> None: + """Set the seed for the environment. 
+ + Args: + seed (int): Seed to set. + """ + self.reset(seed=seed) + + def render(self) -> Any: + """Render the environment. + + Returns: + Rendered environment. + """ + return self._env.render() + + def close(self) -> None: + """Close the environment.""" + self._env.close() + + def __getattr__(self, name: str) -> Any: + """Return the unwrapped environment attributes.""" + return getattr(self._env, name) diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index a1240a334..90535d931 100644 --- a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -39,6 +39,8 @@ SafeARCPlanner, ) from omnisafe.common import Normalizer +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.control_barrier_function.crabs.models import ( AddGaussianNoise, CrabsCore, @@ -49,8 +51,6 @@ from omnisafe.common.control_barrier_function.crabs.optimizers import Barrier from omnisafe.common.control_barrier_function.crabs.utils import Normalizer as CRABSNormalizer from omnisafe.common.control_barrier_function.crabs.utils import create_model_and_trainer -from omnisafe.common.barrier_comp import BarrierCompensator -from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.robust_barrier_solver import CBFQPLayer from omnisafe.common.robust_gp_model import DynamicsModel from omnisafe.envs.core import CMDP, make From 08e926c1f7bb3a6c0098abbbcee9799c28264955 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Wed, 8 May 2024 23:44:04 +0800 Subject: [PATCH 06/18] chore: update pytest --- .pre-commit-config.yaml | 2 +- omnisafe/adapter/barrier_function_adapter.py | 12 ++++++------ .../adapter/beta_barrier_function_adapter.py | 16 ++++++---------- .../offpolicy_barrier_function_adapter.py | 2 +- .../adapter/robust_barrier_function_adapter.py | 4 ++-- omnisafe/algorithms/off_policy/sac_rcbf.py | 2 +- .../on_policy/barrier_function/ppo_cbf.py | 4 ---- omnisafe/envs/cbf_env.py | 2 +- 
omnisafe/envs/classic_control/envs_from_rcbf.py | 16 +++++----------- omnisafe/envs/rcbf_env.py | 2 +- 10 files changed, 24 insertions(+), 38 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 63f378224..4b40fedd1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -114,7 +114,7 @@ repos: ^tests/| ^setup.py$| ^omnisafe/envs/classic_control/envs_from_crabs.py$| - ^omnisafe/envs/classic_control/envs_from_rcbf.py| + ^omnisafe/envs/classic_control/envs_from_rcbf.py$| ^omnisafe/common/control_barrier_function/crabs/models.py$| ^omnisafe/common/control_barrier_function/crabs/optimizers.py$| ^omnisafe/common/control_barrier_function/crabs/utils.py$| diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py index 469f4e7cd..80b45eecf 100644 --- a/omnisafe/adapter/barrier_function_adapter.py +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""BarrierFunction Adapter for OmniSafe.""" +"""Barrier Function Adapter for OmniSafe.""" from __future__ import annotations @@ -31,11 +31,11 @@ class BarrierFunctionAdapter(OnPolicyAdapter): - """BarrierFunction Adapter for OmniSafe. + """Barrier Function Adapter for OmniSafe. - The BarrierFunction Adapter is used to establish the logic of interaction between agents and the - environment based on control barrier functions. Its key feature is the introduction of action - compensators and barrier function solvers. 
+ The Barrier Function Adapter is used to establish the logic of interaction between agents and + the environment based on control barrier functions. Its key feature is the introduction of + action compensators and barrier function solvers. Args: env_id (str): The environment id. diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index d5738e02d..f0bc50af8 100644 --- a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -120,11 +120,7 @@ def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: class BetaBarrierFunctionAdapter(OnPolicyAdapter): - """BarrierFunction Adapter for OmniSafe. - - The BarrierFunction Adapter is used to establish the logic of interaction between agents and the - environment based on control barrier functions. Its key feature is the introduction of action - compensators and barrier function solvers. + """Barrier Function Adapter with Beta Distribution for OmniSafe. Args: env_id (str): The environment id. @@ -134,7 +130,7 @@ class BetaBarrierFunctionAdapter(OnPolicyAdapter): """ def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: - """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + """Initialize an instance of :class:`BetaBarrierFunctionAdapte`.""" super().__init__(env_id, num_envs, seed, cfgs) self.constraint_fn: Callable = vectorize_f(cbf) @@ -147,9 +143,9 @@ def _wrapper( """Wrapper the environment. .. 
warning:: - Since solving the optimization problem requires obtaining physical quantities with practical - significance from state observations, the Barrier Function Adapter does not support - normalization of observations. + Since solving the optimization problem requires obtaining physical quantities with + practical significance from state observations, the Beta Barrier Function Adapter does + not support normalization of observations. Args: obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py index f40a7add9..e6bff40d6 100644 --- a/omnisafe/adapter/offpolicy_barrier_function_adapter.py +++ b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""BarrierFunction OffPolicy Adapter for OmniSafe.""" +"""OffPolicy Barrier Function Adapter for OmniSafe.""" from __future__ import annotations diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py index 8da2cf658..a7c6dc394 100644 --- a/omnisafe/adapter/robust_barrier_function_adapter.py +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""BarrierFunction Adapter for OmniSafe.""" +"""Robust Barrier Function Adapter for OmniSafe.""" from __future__ import annotations diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py index 1e9547369..e1a351020 100644 --- a/omnisafe/algorithms/off_policy/sac_rcbf.py +++ b/omnisafe/algorithms/off_policy/sac_rcbf.py @@ -37,7 +37,7 @@ class SACRCBF(SAC): """The Soft Actor-Critic algorithm with Robust Control Barrier Function. References: - - Title: Soft Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor + - Title: The Soft Actor-Critic algorithm with Robust Control Barrier Function - Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine. - URL: `SAC `_ """ diff --git a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py index b77c36c76..1b46857e6 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py @@ -35,10 +35,6 @@ class PPOBetaCBF(PPO): - URL: `PPOBetaCBF `_ """ - def _init_log(self) -> None: - super()._init_log() - self._logger.register_key('Value/Loss_compensator') - def _init_env(self) -> None: self._env: BetaBarrierFunctionAdapter = BetaBarrierFunctionAdapter( self._env_id, diff --git a/omnisafe/envs/cbf_env.py b/omnisafe/envs/cbf_env.py index c46012b8d..a46e91c94 100644 --- a/omnisafe/envs/cbf_env.py +++ b/omnisafe/envs/cbf_env.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/omnisafe/envs/classic_control/envs_from_rcbf.py b/omnisafe/envs/classic_control/envs_from_rcbf.py index bdf469876..33e13189c 100644 --- a/omnisafe/envs/classic_control/envs_from_rcbf.py +++ b/omnisafe/envs/classic_control/envs_from_rcbf.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ from __future__ import annotations -from collections.abc import Iterable from typing import Any, Callable import gymnasium @@ -27,17 +26,11 @@ from gymnasium import spaces -def to_pixel(meas_cm: list[float] | float, shift: int = 0) -> float: - if isinstance(meas_cm, Iterable): - return 1.5 * 37.795 * meas_cm + np.array(shift) - - return 1.5 * 37.795 * meas_cm + shift - - class UnicycleEnv(gymnasium.Env): + """Environment from `The Soft Actor-Critic algorithm with Robust Control Barrier Function`.""" def __init__(self) -> None: - + """Initialize the unicycle environment.""" super().__init__() self.dynamics_mode = 'Unicycle' @@ -124,9 +117,11 @@ def _step(self, action: np.ndarray) -> tuple: return self.state, reward, cost, terminated, truncated, {} def goal_met(self) -> bool: + """Return whether meeting the goal.""" return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: + """Reset the environment.""" self.episode_step = 0 if self.rand_init: @@ -148,7 +143,6 @@ def get_obs(self) -> np.ndarray: Returns: Observation: np.ndarray. 
""" - rel_loc = self.goal_pos - self.state[:2] goal_dist = np.linalg.norm(rel_loc) goal_compass = self.obs_compass() # compass to the goal diff --git a/omnisafe/envs/rcbf_env.py b/omnisafe/envs/rcbf_env.py index f97586dc3..a8fbdcd28 100644 --- a/omnisafe/envs/rcbf_env.py +++ b/omnisafe/envs/rcbf_env.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 6a18071c2746ab8e92148c81ebf510638d7127de Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 9 May 2024 18:09:01 +0800 Subject: [PATCH 07/18] chore: update pytest --- omnisafe/adapter/barrier_function_adapter.py | 6 +- .../adapter/beta_barrier_function_adapter.py | 24 ++----- omnisafe/algorithms/off_policy/ddpg.py | 1 + .../on_policy/base/policy_gradient.py | 1 + omnisafe/common/barrier_solver.py | 2 +- omnisafe/common/robust_gp_model.py | 67 +++++-------------- omnisafe/models/actor/actor_builder.py | 11 +-- tests/test_policy.py | 1 + 8 files changed, 37 insertions(+), 76 deletions(-) diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py index 80b45eecf..c5581400a 100644 --- a/omnisafe/adapter/barrier_function_adapter.py +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -60,9 +60,9 @@ def _wrapper( """Wrapper the environment. .. warning:: - Since solving the optimization problem requires obtaining physical quantities with practical - significance from state observations, the Barrier Function Adapter does not support - normalization of observations. + Since solving the optimization problem requires obtaining physical quantities with + practical significance from state observations, the Barrier Function Adapter does not + support normalization of observations. Args: obs_normalize (bool, optional): Whether to normalize the observation. 
Defaults to False. diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index f0bc50af8..22bab63ff 100644 --- a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -98,18 +98,11 @@ def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: """ obs = obs.cpu().detach().numpy() - if len(obs.shape) == 1: - batch_size = 1 - lbs, ubs = f(obs) - lbs = torch.as_tensor(lbs) - ubs = torch.as_tensor(ubs) - - else: - batch_size = obs.shape[0] - lbs = torch.zeros([batch_size, 1]) - ubs = torch.zeros([batch_size, 1]) - for i in range(batch_size): - lbs[i], ubs[i] = f(obs[i]) + batch_size = obs.shape[0] + lbs = torch.zeros([batch_size, 1]) + ubs = torch.zeros([batch_size, 1]) + for i in range(batch_size): + lbs[i], ubs[i] = f(obs[i]) lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) @@ -181,10 +174,6 @@ def rollout( # pylint: disable=too-many-locals ) -> None: """Rollout the environment and store the data in the buffer. - .. warning:: - As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically, - so the final observation will be stored in ``info['final_observation']``. - Args: steps_per_epoch (int): Number of steps per epoch. 
agent (ConstraintActorCritic): Constraint actor-critic, including actor , reward critic @@ -206,9 +195,6 @@ def rollout( # pylint: disable=too-many-locals next_obs, reward, cost, terminated, truncated, info = self.step(final_act) self._log_value(reward=reward, cost=cost, info=info) - - if self._cfgs.algo_cfgs.use_cost: - logger.store({'Value/cost': value_c}) logger.store({'Value/reward': value_r}) buffer.store( diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 2d6bad948..0ce31f286 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -190,6 +190,7 @@ def _init_log(self) -> None: self._log_what_to_save() self._logger.torch_save() + self._specific_save() self._logger.register_key( 'Metrics/EpRet', diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index 831076de6..826ff7c1a 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -182,6 +182,7 @@ def _init_log(self) -> None: self._log_what_to_save() self._logger.torch_save() + self._specific_save() self._logger.register_key( 'Metrics/EpRet', diff --git a/omnisafe/common/barrier_solver.py b/omnisafe/common/barrier_solver.py index ea287b4ad..e4471fb38 100644 --- a/omnisafe/common/barrier_solver.py +++ b/omnisafe/common/barrier_solver.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py index 885a50389..62679fd59 100644 --- a/omnisafe/common/robust_gp_model.py +++ b/omnisafe/common/robust_gp_model.py @@ -133,7 +133,7 @@ def __init__( self.model = self.model.to(self.device) warnings.filterwarnings('ignore') - def train(self, training_iter: int, verbose: bool = False) -> None: + def train(self, training_iter: int) -> None: """Trains the Gaussian Process model. Args: @@ -145,17 +145,11 @@ def train(self, training_iter: int, verbose: bool = False) -> None: optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model) - for i in range(training_iter): + for _ in range(training_iter): optimizer.zero_grad() output = self.model(self._train_x) loss = -mll(output, self._train_y) loss.backward() - if verbose: - print( - f'\tIter {i + 1}/{training_iter} - Loss: {loss.item():.3f} lengthscale: ' - f'{self.model.covar_module.base_kernel.lengthscale.item():.3f} noise: ' - f'{self.likelihood.noise.item():.3f}', - ) optimizer.step() def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: @@ -216,45 +210,27 @@ def __init__( self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] self.n_u = DYNAMICS_MODE[self.env.dynamics_mode]['n_u'] - self._disturb_estimators = None self.disturbance_history = {} self.history_counter = 0 self.max_history_count = gp_model_size self.disturbance_history['state'] = np.zeros((self.max_history_count, self.n_s)) self.disturbance_history['disturbance'] = np.zeros((self.max_history_count, self.n_s)) - self._train_x = None - self._train_y = None - - self.l_p = l_p + self._train_x = np.zeros((self.max_history_count, self.n_s)) + self._train_y = np.zeros((self.max_history_count, self.n_s)) + self._disturb_estimators = [] self.device = torch.device(device) - def predict_next_state(self, state_batch: np.ndarray, u_batch: np.ndarray) -> np.ndarray: - """Predicts the 
next state given the current state and action batch. - - Args: - state_batch (np.ndarray): The batch of current states. - u_batch (np.ndarray): The batch of actions applied. - - Returns: - np.ndarray: The batch of predicted next states. - """ - expand_dims = len(state_batch.shape) == 1 - if expand_dims: - state_batch = np.expand_dims(state_batch, axis=0) - - next_state_batch = state_batch + self.env.dt * ( - self.get_f(state_batch) - + (self.get_g(state_batch) @ np.expand_dims(u_batch, -1)).squeeze(-1) - ) - pred_mean, pred_std = self.predict_disturbance(state_batch) - next_state_batch += self.env.dt * pred_mean - - if expand_dims: - next_state_batch = next_state_batch.squeeze(0) - if pred_std is not None: - pred_std = pred_std.squeeze(0) - - return next_state_batch + for i in range(self.n_s): + self._disturb_estimators.append( + GPyDisturbanceEstimator( + np.zeros((self.max_history_count, self.n_s)), + np.zeros((self.max_history_count, self.n_s)), + MAX_STD[self.env.dynamics_mode][i], + device=self.device, + ), + ) + self._disturb_initialized = True + self.l_p = l_p def get_dynamics(self) -> tuple[Callable, Callable]: """Retrieves the dynamics functions for drift and control based on the environment's dynamics mode. @@ -324,13 +300,6 @@ def append_transition( u_batch (np.ndarray): The batch of actions applied, shape (n_u,) or (batch_size, n_u). next_state_batch (np.ndarray): The batch of next states, shape (n_s,) or (batch_size, n_s). 
""" - expand_dims = len(state_batch.shape) == 1 - - if expand_dims: - state_batch = np.expand_dims(state_batch, 0) - next_state_batch = np.expand_dims(next_state_batch, 0) - u_batch = np.expand_dims(u_batch, 0) - u_batch = np.expand_dims(u_batch, -1) disturbance_batch = ( next_state_batch @@ -380,7 +349,7 @@ def fit_gp_model(self, training_iter: int = 70) -> None: ), ) self._disturb_estimators[i].train(training_iter) - + self._disturb_initialized = False self._train_x = train_x self._train_y = train_y @@ -404,7 +373,7 @@ def predict_disturbance(self, test_x: torch.Tensor) -> tuple[torch.Tensor, torch means = np.zeros(test_x.shape) f_std = np.zeros(test_x.shape) - if self._disturb_estimators: + if not self._disturb_initialized: train_x_std = np.std(self._train_x, axis=0) train_y_std = np.std(self._train_y, axis=0) test_x = test_x / train_x_std diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index 75358134c..3f0b3e4a6 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -61,10 +61,13 @@ def build_actor( ) -> Actor: """Build actor network. - Currently, we support the following actor types: - - ``gaussian_learning``: Gaussian actor with learnable standard deviation parameters. - - ``gaussian_sac``: Gaussian actor with learnable standard deviation network. - - ``mlp``: Multi-layer perceptron actor, used in ``DDPG`` and ``TD3``. + This method supports multiple actor types, each corresponding to a different class: + - `gaussian_learning`: Returns a GaussianLearningActor with learnable std deviation parameters. + - `gaussian_sac`: Returns a GaussianSACActor with a learnable std deviation network. + - `mlp`: Returns an MLPActor, commonly used in DDPG and TD3 algorithms. + - `vae`: Returns a Variational Autoencoder (VAE) actor. + - `perturbation`: Returns a PerturbationActor. + - `beta`: Returns a BetaLearningActor. Args: actor_type (ActorType): Type of actor network, e.g. 
``gaussian_learning``. diff --git a/tests/test_policy.py b/tests/test_policy.py index 8492e2193..21ed70782 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -98,6 +98,7 @@ def test_rcbf(auto_alpha): } agent = omnisafe.Agent('SACRCBF', env_id, custom_cfgs=custom_cfgs) agent.learn() + agent.evaluate(num_episodes=1) @helpers.parametrize(optim_case=optim_case) From 38b0a5c60c09f6d0f84e59bddb2177227656a420 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Tue, 14 May 2024 15:07:20 +0800 Subject: [PATCH 08/18] style: fix comment --- .pre-commit-config.yaml | 3 +- .../adapter/beta_barrier_function_adapter.py | 2 +- .../robust_barrier_function_adapter.py | 4 +- omnisafe/algorithms/off_policy/sac_rcbf.py | 1 - omnisafe/algorithms/on_policy/base/ppo.py | 56 +++++++++++++++++++ omnisafe/common/barrier_comp.py | 1 - omnisafe/common/barrier_solver.py | 52 +++++++++-------- omnisafe/common/robust_barrier_solver.py | 17 +++--- omnisafe/common/robust_gp_model.py | 36 ++++++------ omnisafe/configs/off-policy/DDPGCBF.yaml | 1 - omnisafe/configs/off-policy/SACRCBF.yaml | 46 +++++++-------- omnisafe/configs/on-policy/IPO.yaml | 44 ++++++++++++--- omnisafe/configs/on-policy/TRPOCBF.yaml | 1 - .../envs/classic_control/envs_from_rcbf.py | 9 +-- omnisafe/envs/rcbf_env.py | 18 +----- omnisafe/evaluator.py | 1 - omnisafe/utils/tools.py | 14 ++--- 17 files changed, 181 insertions(+), 125 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b40fedd1..96e584f57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -118,6 +118,5 @@ repos: ^omnisafe/common/control_barrier_function/crabs/models.py$| ^omnisafe/common/control_barrier_function/crabs/optimizers.py$| ^omnisafe/common/control_barrier_function/crabs/utils.py$| - ^conftest.py$| - ^setup.py$ + ^conftest.py$ ) diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index 22bab63ff..9364b5282 100644 --- 
a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -30,7 +30,7 @@ from omnisafe.utils.config import Config -# # pylint: disable-next=too-many-locals +# pylint: disable-next=too-many-locals def cbf(state: np.ndarray, eta: float = 0.99) -> tuple[np.ndarray, np.ndarray]: """Calculates the Control Barrier Function (CBF) constraints. diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py index a7c6dc394..ade39d12f 100644 --- a/omnisafe/adapter/robust_barrier_function_adapter.py +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -32,7 +32,7 @@ class RobustBarrierFunctionAdapter(OffPolicyAdapter): - """Off Policy Robust Barrier Function Adapter for OmniSafe. + """Robust Barrier Function Adapter for OmniSafe. :class:`RobustBarrierFunctionAdapter` is used to adapt the environment with RCBF controller. @@ -44,7 +44,7 @@ class RobustBarrierFunctionAdapter(OffPolicyAdapter): """ def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: - """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + """Initialize an instance of :class:`RobustBarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) self.solver: CBFQPLayer self.dynamics_model: DynamicsModel diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py index e1a351020..fcb7dad26 100644 --- a/omnisafe/algorithms/off_policy/sac_rcbf.py +++ b/omnisafe/algorithms/off_policy/sac_rcbf.py @@ -53,7 +53,6 @@ def _init_env(self) -> None: env=self._env, device=self._cfgs.train_cfgs.device, gamma_b=self._cfgs.cbf_cfgs.gamma_b, - k_d=self._cfgs.cbf_cfgs.k_d, l_p=self._cfgs.cbf_cfgs.l_p, ) dynamics_model = DynamicsModel(env=self._env) diff --git a/omnisafe/algorithms/on_policy/base/ppo.py b/omnisafe/algorithms/on_policy/base/ppo.py index 463b286c8..69f0ce4e9 100644 --- a/omnisafe/algorithms/on_policy/base/ppo.py 
+++ b/omnisafe/algorithms/on_policy/base/ppo.py @@ -16,6 +16,8 @@ from __future__ import annotations +import torch + from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.policy_gradient import PolicyGradient @@ -29,3 +31,57 @@ class PPO(PolicyGradient): - Authors: John Schulman, Filip Wolski, Prafulla Dhariwal, Alec Radford, Oleg Klimov. - URL: `PPO `_ """ + + def _loss_pi( + self, + obs: torch.Tensor, + act: torch.Tensor, + logp: torch.Tensor, + adv: torch.Tensor, + ) -> torch.Tensor: + r"""Computing pi/actor loss. + + In Proximal Policy Optimization, the loss is defined as: + + .. math:: + + L^{CLIP} = \underset{s_t \sim \rho_{\theta}}{\mathbb{E}} \left[ + \min ( r_t A^{R}_{\pi_{\theta}} (s_t, a_t) , \text{clip} (r_t, 1 - \epsilon, 1 + \epsilon) + A^{R}_{\pi_{\theta}} (s_t, a_t) + \right] + + where :math:`r_t = \frac{\pi_{\theta}^{'} (a_t|s_t)}{\pi_{\theta} (a_t|s_t)}`, + :math:`\epsilon` is the clip parameter, and :math:`A^{R}_{\pi_{\theta}} (s_t, a_t)` is the + advantage. + + Args: + obs (torch.Tensor): The ``observation`` sampled from buffer. + act (torch.Tensor): The ``action`` sampled from buffer. + logp (torch.Tensor): The ``log probability`` of action sampled from buffer. + adv (torch.Tensor): The ``advantage`` processed. ``reward_advantage`` here. + + Returns: + The loss of pi/actor. 
+ """ + distribution = self._actor_critic.actor(obs) + logp_ = self._actor_critic.actor.log_prob(act) + std = self._actor_critic.actor.std + ratio = torch.exp(logp_ - logp) + ratio_cliped = torch.clamp( + ratio, + 1 - self._cfgs.algo_cfgs.clip, + 1 + self._cfgs.algo_cfgs.clip, + ) + loss = -torch.min(ratio * adv, ratio_cliped * adv).mean() + loss -= self._cfgs.algo_cfgs.entropy_coef * distribution.entropy().mean() + # useful extra info + entropy = distribution.entropy().mean().item() + self._logger.store( + { + 'Train/Entropy': entropy, + 'Train/PolicyRatio': ratio, + 'Train/PolicyStd': std, + 'Loss/Loss_pi': loss.mean().item(), + }, + ) + return loss diff --git a/omnisafe/common/barrier_comp.py b/omnisafe/common/barrier_comp.py index 40381ccd3..891932188 100644 --- a/omnisafe/common/barrier_comp.py +++ b/omnisafe/common/barrier_comp.py @@ -85,7 +85,6 @@ def update( Returns: torch.Tensor: The loss after training. """ - # Train the model for _ in range(self._cfgs.update_iters): target = approx_compensating_act + compensating_act self.optimizer.zero_grad() diff --git a/omnisafe/common/barrier_solver.py b/omnisafe/common/barrier_solver.py index e4471fb38..35221281e 100644 --- a/omnisafe/common/barrier_solver.py +++ b/omnisafe/common/barrier_solver.py @@ -52,15 +52,7 @@ def __init__( max_speed: float = 60.0, device: str = 'cpu', ) -> None: - """Initializes the PendulumSolver with specified parameters. - - Args: - action_size (int): Size of the action space. - observation_size (int): Size of the observation space. - torque_bound (float): Maximum torque bound. - max_speed (float): Maximum speed of the pendulum. - device (str): Device to run the computations on. 
- """ + """Initialize the PendulumSolver with specified parameters.""" self.action_size = action_size self.observation_size = observation_size self.torque_bound = torque_bound @@ -77,7 +69,7 @@ def __init__( warnings.filterwarnings('ignore') def build_gp_model(self, save_dir: str | None = None) -> None: - """Builds the Gaussian Process model.""" + """Build the Gaussian Process model.""" gp_list = [] noise = 0.01 for _ in range(self.observation_size - 1): @@ -96,7 +88,7 @@ def gp_models(self) -> list[GaussianProcessRegressor]: return self.gp_model def _build_barrier(self) -> None: - """Builds the barrier for the pendulum solver.""" + """Build the barrier for the pendulum solver.""" self.P = matrix(np.diag([1.0, 1e16]), tc='d') self.q = matrix(np.zeros(self.action_size + 1)) self.h1 = np.array([1, 0.01]) @@ -112,7 +104,7 @@ def control_barrier( # pylint: disable=invalid-name x: np.ndarray, std: np.ndarray, ) -> torch.Tensor: - """Adjusts the original action using a control barrier function. + """Adjust the original action using a control barrier function. Args: original_action (torch.Tensor): The original action proposed by the RL algorithm. @@ -124,12 +116,12 @@ def control_barrier( # pylint: disable=invalid-name Returns: torch.Tensor: The adjusted action that respects the system's constraints. 
""" - # Define gamma for the barrier function + # define gamma for the barrier function gamma_b = 0.5 kd = 1.5 u_rl = original_action.cpu().detach().numpy() - # Set up Quadratic Program to satisfy Control Barrier Function + # set up Quadratic Program to satisfy Control Barrier Function G = np.array( [ [ @@ -185,14 +177,14 @@ def control_barrier( # pylint: disable=invalid-name ) h = np.squeeze(h).astype(np.double) - # Convert numpy arrays to cvx matrices to set up QP + # convert numpy arrays to cvx matrices to set up QP G = matrix(G, tc='d') h = matrix(h, tc='d') solvers.options['show_progress'] = False sol = solvers.qp(self.P, self.q, G, h) u_bar = sol['x'] - # Check if the adjusted action is within bounds + # check if the adjusted action is within bounds if np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) - 0.001 >= self.torque_bound: u_bar[0] = self.torque_bound - u_rl print('Error in QP') @@ -204,7 +196,7 @@ def control_barrier( # pylint: disable=invalid-name # pylint: disable-next=attribute-defined-outside-init,import-outside-toplevel,invalid-name def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: - """Calculates the dynamics of the system. + """Calculate the dynamics of the system. Args: obs (list[float]): The current observation of the system state. @@ -213,15 +205,21 @@ def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: Returns: np.ndarray: The calculated dynamics of the system. 
""" - dt = 0.05 # Time step - G = 10 # Gravitational constant - m = 2 # Mass - length = 2 # Length + # time step + dt = 0.05 + # gravitational constant + G = 10 + # mass + m = 2 + # length + length = 2 - theta = np.arctan2(obs[1], obs[0]) # Calculate the angle - theta_dot = obs[2] # Angular velocity + # calculate the angle + theta = np.arctan2(obs[1], obs[0]) + # angular velocity + theta_dot = obs[2] - # Dynamics equations + # dynamics equations f = np.array( [ -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 @@ -237,7 +235,7 @@ def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: return np.squeeze(f) def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: - """Updates the Gaussian Process (GP) dynamics model based on observed states and actions. + """Update the Gaussian Process (GP) dynamics model based on observed states and actions. Args: obs (np.ndarray): Observed states. @@ -263,7 +261,7 @@ def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: self.gp_model[1].fit(S, err[:, 1]) def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: - """Retrieves the gp dynamics based on the current observation. + """Retrieve the GP dynamics based on the current observation. Args: obs (torch.Tensor): Current state observation. @@ -311,6 +309,6 @@ def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.nd ] def reset_gp_model(self) -> None: - """Reset the gaussian processing model of barrier function solver.""" + """Reset the gaussian process model of barrier function solver.""" self.gp_model_prev = self.gp_model.copy() self.build_gp_model() diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py index 62499352b..348a1a4c7 100644 --- a/omnisafe/common/robust_barrier_solver.py +++ b/omnisafe/common/robust_barrier_solver.py @@ -36,11 +36,19 @@ class CBFQPLayer: """CBFQLayer for robust control barrier function solver. 
Args: - env (gym.Env): The Gym environment to interact with. + env (gymnasium.Env): The Gymnasium environment to interact with. device (str, optional): The device type, such as 'cpu' or 'gpu'. Defaults to 'cpu'. gamma_b (float, optional): The gamma parameter. Defaults to 20. - k_d (float, optional): The confidence parameter desired. Defaults to 3.0. l_p (float, optional): Some additional layer parameter, purpose unspecified. Defaults to 0.03. + + Attributes: + device (torch.device): The device on which computations will be performed. + env (gym.Env): The Gym environment instance. + u_min (float): The minimum control bound. + u_max (float): The maximum control bound. + gamma_b (float): The gamma parameter. + l_p (float): An additional layer parameter. + action_dim (int): The dimensionality of the action space. """ def __init__( @@ -48,7 +56,6 @@ def __init__( env: gym.Env, device: str = 'cpu', gamma_b: float = 20, - k_d: float = 3.0, l_p: float = 0.03, ) -> None: """Initializes a CBFLayer instance with specified parameters and environment.""" @@ -56,7 +63,6 @@ def __init__( self.env = env self.u_min, self.u_max = self.get_control_bounds() self.gamma_b = gamma_b - self.k_d = k_d self.l_p = l_p self.action_dim = env.action_space.shape[0] warnings.filterwarnings('ignore') @@ -79,7 +85,6 @@ def get_safe_action( Returns: torch.Tensor: Safe actions adjusted for given constraints and uncertainties. 
""" - # Batch form adjustment if only a single data point is passed expand_dims = len(state_batch.shape) == 1 if expand_dims: state_batch = state_batch.unsqueeze(0) @@ -227,7 +232,6 @@ def get_cbf_qp_constraints( batch_size = state_batch.shape[0] gamma_b = self.gamma_b - # Expand dims state_batch = torch.unsqueeze(state_batch, -1).to(self.device) action_batch = torch.unsqueeze(action_batch, -1).to(self.device) mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1).to(self.device) @@ -261,7 +265,6 @@ def get_cbf_qp_constraints( sigma_theta_aug[:, 1, :] = sigma_pred_batch[:, 2, :] sigma_ps = torch.bmm(torch.abs(g_ps), sigma_theta_aug) + sigma_pred_batch[:, :2, :] - # Build RCBFs hs = 1e3 * torch.ones((batch_size, num_cbfs), device=self.device) dhdps = torch.zeros((batch_size, num_cbfs, 2), device=self.device) hazards = self.env.hazards diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py index 62679fd59..5a305140d 100644 --- a/omnisafe/common/robust_gp_model.py +++ b/omnisafe/common/robust_gp_model.py @@ -134,11 +134,11 @@ def __init__( warnings.filterwarnings('ignore') def train(self, training_iter: int) -> None: - """Trains the Gaussian Process model. + """Train the Gaussian Process model. Args: training_iter (int): Number of training iterations. - verbose (bool): If True, prints detailed logging information. + verbose (bool): If True, print detailed logging information. """ self.model.train() self.likelihood.train() @@ -153,7 +153,7 @@ def train(self, training_iter: int) -> None: optimizer.step() def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: - """Makes predictions on new data. + """Make predictions on new data. Args: test_x (torch.Tensor): Test data features. If not a tensor, it will be converted. 
@@ -188,13 +188,13 @@ def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: # pylint: disable-next=too-many-instance-attributes class DynamicsModel: - """Initializes the DynamicsModel with a gym environment. + """Initialize the DynamicsModel with a gymnasium environment. Args: - env (gym.Env): The gym environment to model dynamics for. - gp_model_size (int, optional): Maximum history count for disturbances. Defaults to 2000. - l_p (float, optional): Learning parameter. Defaults to 0.03. - device (str, optional): The device to perform computations on. Defaults to 'cpu'. + env (gym.Env): The gymnasium environment to model dynamics for. + gp_model_size (int, optional): Maximum history count for disturbances. Default to 2000. + l_p (float, optional): Learning parameter. Default to 0.03. + device (str, optional): The device to perform computations on. Default to 'cpu'. """ def __init__( @@ -204,7 +204,7 @@ def __init__( l_p: float = 0.03, device: str = 'cpu', ) -> None: - """Initializes the DynamicsModel with a gym environment.""" + """Initialize the DynamicsModel with a gymnasium environment.""" self.env = env self.get_f, self.get_g = self.get_dynamics() self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] @@ -233,7 +233,7 @@ def __init__( self.l_p = l_p def get_dynamics(self) -> tuple[Callable, Callable]: - """Retrieves the dynamics functions for drift and control based on the environment's dynamics mode. + """Retrieve the dynamics functions for drift and control based on the environment's dynamics mode. Returns: tuple: A tuple containing two callable methods, `get_f` and `get_g`. @@ -257,7 +257,7 @@ def get_g(state_batch: np.ndarray) -> np.ndarray: return get_f, get_g def get_state(self, obs: torch.Tensor) -> torch.Tensor: - """Processes the raw observations from the environment. + """Process the raw observations from the environment. Args: obs (torch.Tensor): The environment observations. 
@@ -293,7 +293,7 @@ def append_transition( u_batch: np.ndarray, next_state_batch: np.ndarray, ) -> None: - """Estimates the disturbance from the current dynamics transition and adds it to the buffer. + """Estimate the disturbance from the current dynamics transition and adds it to the buffer. Args: state_batch (np.ndarray): The batch of current states, shape (n_s,) or (batch_size, n_s). @@ -321,7 +321,7 @@ def append_transition( self.fit_gp_model() def fit_gp_model(self, training_iter: int = 70) -> None: - """Fits a Gaussian Process model to the disturbance data. + """Fit a Gaussian Process model to the disturbance data. Args: training_iter (int, optional): Number of training iterations for the GP model. Defaults to 70. @@ -354,7 +354,7 @@ def fit_gp_model(self, training_iter: int = 70) -> None: self._train_y = train_y def predict_disturbance(self, test_x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: - """Predicts the disturbance at the queried states using the trained Gaussian Process models. + """Predict the disturbance at the queried states using the trained Gaussian Process models. Args: test_x (torch.Tensor): The state for which to predict disturbances, shape (n_test, n_s). @@ -394,7 +394,7 @@ def predict_disturbance(self, test_x: torch.Tensor) -> tuple[torch.Tensor, torch return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) def load_disturbance_models(self, save_dir: str, epoch: str) -> None: - """Loads the disturbance models and their training data. + """Load the disturbance models and their training data. Args: save_dir (str): The directory where the model files are saved. 
@@ -420,15 +420,15 @@ def load_disturbance_models(self, save_dir: str, epoch: str) -> None: @property def train_x(self) -> np.ndarray: - """Returns the training data input features used for the disturbance estimators.""" + """Return the training data input features used for the disturbance estimators.""" return self._train_x @property def train_y(self) -> np.ndarray: - """Returns the training data labels used for the disturbance estimators.""" + """Return the training data labels used for the disturbance estimators.""" return self._train_y @property def disturb_estimators(self) -> list[GPyDisturbanceEstimator]: - """Provides access to the list of trained disturbance estimator models.""" + """Provide access to the list of trained disturbance estimator models.""" return self._disturb_estimators diff --git a/omnisafe/configs/off-policy/DDPGCBF.yaml b/omnisafe/configs/off-policy/DDPGCBF.yaml index 3eec4dced..f9d706305 100644 --- a/omnisafe/configs/off-policy/DDPGCBF.yaml +++ b/omnisafe/configs/off-policy/DDPGCBF.yaml @@ -105,7 +105,6 @@ defaults: # Size of hidden layers hidden_sizes: [400, 300] # Activation function - activation: relu # The learning rate of Critic network lr: 0.001 diff --git a/omnisafe/configs/off-policy/SACRCBF.yaml b/omnisafe/configs/off-policy/SACRCBF.yaml index 53c5e5a17..b4d182c50 100644 --- a/omnisafe/configs/off-policy/SACRCBF.yaml +++ b/omnisafe/configs/off-policy/SACRCBF.yaml @@ -38,9 +38,9 @@ defaults: update_cycle: 1 # number of iterations to update the policy update_iters: 1 - # The size of replay buffer + # size of replay buffer size: 1000000 - # The size of batch + # size of batch batch_size: 256 # normalize reward reward_normalize: False @@ -54,23 +54,23 @@ defaults: use_critic_norm: False # critic norm coefficient critic_norm_coeff: 0.001 - # The soft update coefficient + # soft update coefficient polyak: 0.005 - # The discount factor of GAE + # discount factor of GAE gamma: 0.99 - # Actor perdorm random action before 
`start_learning_steps` steps + # actor perform random action before `start_learning_steps` steps start_learning_steps: 5000 - # The delay step of policy update + # delay step of policy update policy_delay: 1 - # Whether to use the exploration noise + # whether to use the exploration noise use_exploration_noise: False - # The exploration noise + # exploration noise exploration_noise: 0.1 - # The policy noise + # policy noise policy_noise: 0.2 # policy_noise_clip policy_noise_clip: 0.5 - # The value of alpha + # value of alpha alpha: 0.2 # Whether to use auto alpha auto_alpha: True @@ -106,29 +106,29 @@ defaults: actor_type: gaussian_sac # linear learning rate decay linear_lr_decay: False - # Configuration of Actor network + # configuration of actor network actor: - # Size of hidden layers + # size of hidden layers hidden_sizes: [400, 300] - # Activation function + # activation function activation: relu - # The learning rate of Actor network + # learning rate of actor network lr: 0.0003 - # Configuration of Critic network + # configuration of critic network critic: - # The number of critic networks + # number of critic networks num_critics: 2 - # Size of hidden layers + # size of hidden layers hidden_sizes: [400, 300] - # Activation function + # activation function activation: relu - # The learning rate of Critic network + # learning rate of critic network lr: 0.0003 - # Dynamics model configurations + # dynamics model configurations dynamics_model_cfgs: - # The max number of episodes updating GP models + # max number of episodes updating GP models gp_max_episodes: 100 - # The size of gp model + # size of gp model gp_model_size: 2000 - # Whether to use the action compensator + # whether to use the action compensator use_compensator: False diff --git a/omnisafe/configs/on-policy/IPO.yaml b/omnisafe/configs/on-policy/IPO.yaml index e2a6869c3..807984252 100644 --- a/omnisafe/configs/on-policy/IPO.yaml +++ b/omnisafe/configs/on-policy/IPO.yaml @@ -27,17 +27,17 @@ 
defaults: # number of parallel agent, similar to a3c parallel: 1 # total number of steps to train - total_steps: 80_000 + total_steps: 10000000 # algorithm configurations algo_cfgs: # number of steps to update the policy - steps_per_epoch: 2000 + steps_per_epoch: 20000 # number of iterations to update the policy - update_iters: 10 + update_iters: 40 # batch size for each iteration - batch_size: 256 + batch_size: 64 # target kl divergence - target_kl: 0.005 + target_kl: 0.02 # entropy coefficient entropy_coef: 0.0 # normalize reward @@ -45,7 +45,7 @@ defaults: # normalize cost cost_normalize: False # normalize observation - obs_normalize: False + obs_normalize: True # early stop when kl divergence is bigger than target kl kl_early_stop: True # use max gradient norm @@ -57,11 +57,11 @@ defaults: # critic norm coefficient critic_norm_coef: 0.001 # reward discount factor - gamma: 0.995 + gamma: 0.99 # cost discount factor cost_gamma: 0.99 # lambda for gae - lam: 0.98 + lam: 0.95 # lambda for cost gae lam_c: 0.95 # clip ratio @@ -127,10 +127,36 @@ defaults: # lagrangian configurations lagrange_cfgs: # Tolerance of constraint violation - cost_limit: 1000.0 + cost_limit: 25.0 # Initial value of lagrangian multiplier lagrangian_multiplier_init: 0.001 # Learning rate of lagrangian multiplier lambda_lr: 0.035 # Type of lagrangian optimizer lambda_optimizer: "Adam" + +Pendulum-v1: + # training configurations + train_cfgs: + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 256 + # target kl divergence + target_kl: 0.005 + # normalize observation + obs_normalize: False + # reward discount factor + gamma: 0.995 + # lambda for gae + lam: 0.98 + # lagrangian configurations + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 1000.0 diff --git 
a/omnisafe/configs/on-policy/TRPOCBF.yaml b/omnisafe/configs/on-policy/TRPOCBF.yaml index 9d1b67ec0..2b0b16126 100644 --- a/omnisafe/configs/on-policy/TRPOCBF.yaml +++ b/omnisafe/configs/on-policy/TRPOCBF.yaml @@ -116,7 +116,6 @@ defaults: hidden_sizes: [64, 64] # activation function activation: relu - # out_activation: tanh # learning rate lr: ~ # critic network configurations diff --git a/omnisafe/envs/classic_control/envs_from_rcbf.py b/omnisafe/envs/classic_control/envs_from_rcbf.py index 33e13189c..211c8a352 100644 --- a/omnisafe/envs/classic_control/envs_from_rcbf.py +++ b/omnisafe/envs/classic_control/envs_from_rcbf.py @@ -43,7 +43,6 @@ def __init__(self) -> None: self.max_episode_steps = 1000 self.reward_goal = 1.0 self.goal_size = 0.3 - # Initialize Env self.state = None self.episode_step = 0 self.initial_state = np.array( @@ -54,13 +53,9 @@ def __init__(self) -> None: self.reset() - # Get Dynamics self.get_f, self.get_g = self._get_dynamics() - # Disturbance self.disturb_mean = np.zeros((3,)) self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 - - # Build Hazards self.hazards = [] self.hazards.append( @@ -78,8 +73,6 @@ def __init__(self) -> None: self.hazards.append( {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, 1.0])}, ) - - # Viewer self.viewer = None def step( @@ -145,7 +138,7 @@ def get_obs(self) -> np.ndarray: """ rel_loc = self.goal_pos - self.state[:2] goal_dist = np.linalg.norm(rel_loc) - goal_compass = self.obs_compass() # compass to the goal + goal_compass = self.obs_compass() return np.array( [ diff --git a/omnisafe/envs/rcbf_env.py b/omnisafe/envs/rcbf_env.py index a8fbdcd28..983528489 100644 --- a/omnisafe/envs/rcbf_env.py +++ b/omnisafe/envs/rcbf_env.py @@ -31,7 +31,7 @@ @env_register class RobustBarrierFunctionEnv(CMDP): - """Interface of control barrier function-based environments. + """Interface of robust control barrier function-based environments. .. 
warning:: Since environments based on control barrier functions require special judgment and control @@ -56,21 +56,7 @@ def __init__( device: str = 'cpu', **kwargs: Any, ) -> None: - """Initialize the environment. - - Args: - env_id (str): Environment id. - num_envs (int, optional): Number of environments. Defaults to 1. - device (torch.device, optional): Device to store the data. Defaults to 'cpu'. - - Keyword Args: - render_mode (str, optional): The render mode, ranging from ``human``, ``rgb_array``, ``rgb_array_list``. - Defaults to ``rgb_array``. - camera_name (str, optional): The camera name. - camera_id (int, optional): The camera id. - width (int, optional): The width of the rendered image. Defaults to 256. - height (int, optional): The height of the rendered image. Defaults to 256. - """ + """Initialize the robust control barrier function-based environments.""" super().__init__(env_id) self._env_id = env_id if num_envs == 1: diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index 90535d931..691d6aa86 100644 --- a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -330,7 +330,6 @@ def __load_model_and_env( env=self._env, device=self._cfgs['train_cfgs']['device'], gamma_b=self._cfgs['cbf_cfgs']['gamma_b'], - k_d=self._cfgs['cbf_cfgs']['k_d'], l_p=self._cfgs['cbf_cfgs']['l_p'], ) self._dynamics_model = DynamicsModel(env=self._env) diff --git a/omnisafe/utils/tools.py b/omnisafe/utils/tools.py index 7c7a10ceb..d5be5369d 100644 --- a/omnisafe/utils/tools.py +++ b/omnisafe/utils/tools.py @@ -367,13 +367,13 @@ def to_tensor( """Convert a numpy array to a torch tensor of specified type and device. Args: - x (np.ndarray): A numpy array to be converted. - dtype (torch.dtype): The desired data type for the tensor. - device (torch.device): The device to store the tensor on. - requires_grad (bool): If True, gradients will be computed for operations involving this tensor. + x (np.ndarray): A numpy array to be converted. 
+ dtype (torch.dtype): The desired data type for the tensor. + device (torch.device): The device to store the tensor on. + requires_grad (bool): If True, gradients will be computed for operations involving this tensor. Returns: - torch.Tensor: A torch tensor representation of the input array. + torch.Tensor: A torch tensor representation of the input array. """ return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) @@ -382,10 +382,10 @@ def sort_vertices_cclockwise(vertices: np.ndarray) -> np.ndarray: """Sort vertices of a 2D convex polygon in counter-clockwise direction. Args: - vertices (np.ndarray): An array of shape (n_v, 2) where n_v is the number of vertices. + vertices (np.ndarray): An array of shape (n_v, 2) where n_v is the number of vertices. Returns: - np.ndarray: An array of vertices sorted in counter-clockwise direction. + np.ndarray: An array of vertices sorted in counter-clockwise direction. """ assert vertices.shape[1] == 2, f'Vertices must each have dimension 2, got {vertices.shape[1]}' polygon_center = vertices.sum(axis=0, keepdims=True) / vertices.shape[0] # (1, d) From 483e42750d4eade310c5b07ec9fcfe16fc2084b7 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Tue, 14 May 2024 22:51:15 +0800 Subject: [PATCH 09/18] style: fix comment --- omnisafe/adapter/__init__.py | 3 +++ omnisafe/algorithms/off_policy/ddpg.py | 12 ++++++---- omnisafe/algorithms/off_policy/ddpg_cbf.py | 2 +- omnisafe/algorithms/off_policy/sac_rcbf.py | 2 +- .../on_policy/barrier_function/ppo_cbf.py | 23 ++----------------- .../on_policy/base/policy_gradient.py | 12 ++++++---- omnisafe/configs/off-policy/SACRCBF.yaml | 2 +- omnisafe/configs/on-policy/PPOBetaCBF.yaml | 2 +- omnisafe/configs/on-policy/TRPOCBF.yaml | 2 +- 9 files changed, 26 insertions(+), 34 deletions(-) diff --git a/omnisafe/adapter/__init__.py b/omnisafe/adapter/__init__.py index 02dab6709..873eccc33 100644 --- a/omnisafe/adapter/__init__.py +++ b/omnisafe/adapter/__init__.py 
@@ -14,12 +14,15 @@ # ============================================================================== """Adapter for the environment and the algorithm.""" +from omnisafe.adapter.barrier_function_adapter import BarrierFunctionAdapter from omnisafe.adapter.beta_barrier_function_adapter import BetaBarrierFunctionAdapter from omnisafe.adapter.early_terminated_adapter import EarlyTerminatedAdapter from omnisafe.adapter.modelbased_adapter import ModelBasedAdapter from omnisafe.adapter.offline_adapter import OfflineAdapter from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter +from omnisafe.adapter.offpolicy_barrier_function_adapter import OffPolicyBarrierFunctionAdapter from omnisafe.adapter.online_adapter import OnlineAdapter from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.adapter.robust_barrier_function_adapter import RobustBarrierFunctionAdapter from omnisafe.adapter.saute_adapter import SauteAdapter from omnisafe.adapter.simmer_adapter import SimmerAdapter diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py index 0ce31f286..0d698e5f2 100644 --- a/omnisafe/algorithms/off_policy/ddpg.py +++ b/omnisafe/algorithms/off_policy/ddpg.py @@ -188,7 +188,7 @@ def _init_log(self) -> None: config=self._cfgs, ) - self._log_what_to_save() + self._setup_torch_saver() self._logger.torch_save() self._specific_save() @@ -559,8 +559,12 @@ def _log_when_not_update(self) -> None: }, ) - def _log_what_to_save(self) -> None: - """Define what need to be saved below.""" + def _setup_torch_saver(self) -> None: + """Define what need to be saved below. + + OmniSafe's main storage interface is based on PyTorch. If you need to save models in other + formats, please use :meth:`_specific_save`. 
+ """ what_to_save: dict[str, Any] = {} what_to_save['pi'] = self._actor_critic.actor @@ -571,4 +575,4 @@ def _log_what_to_save(self) -> None: self._logger.setup_torch_saver(what_to_save) def _specific_save(self) -> None: - """Save some algorithms specific models per epoch.""" + """Save some algorithms specific models other than PyTorch format per epoch.""" diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py index de556372b..17dcacda3 100644 --- a/omnisafe/algorithms/off_policy/ddpg_cbf.py +++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py index fcb7dad26..e6c020770 100644 --- a/omnisafe/algorithms/off_policy/sac_rcbf.py +++ b/omnisafe/algorithms/off_policy/sac_rcbf.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py index 1b46857e6..4ab2f4d17 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py @@ -60,27 +60,8 @@ def _loss_pi( ) -> torch.Tensor: r"""Computing pi/actor loss. - In Proximal Policy Optimization, the loss is defined as: - - .. 
math:: - - L^{CLIP} = \underset{s_t \sim \rho_{\theta}}{\mathbb{E}} \left[ - \min ( r_t A^{R}_{\pi_{\theta}} (s_t, a_t) , \text{clip} (r_t, 1 - \epsilon, 1 + \epsilon) - A^{R}_{\pi_{\theta}} (s_t, a_t) - \right] - - where :math:`r_t = \frac{\pi_{\theta}^{'} (a_t|s_t)}{\pi_{\theta} (a_t|s_t)}`, - :math:`\epsilon` is the clip parameter, and :math:`A^{R}_{\pi_{\theta}} (s_t, a_t)` is the - advantage. - - Args: - obs (torch.Tensor): The ``observation`` sampled from buffer. - act (torch.Tensor): The ``action`` sampled from buffer. - logp (torch.Tensor): The ``log probability`` of action sampled from buffer. - adv (torch.Tensor): The ``advantage`` processed. ``reward_advantage`` here. - - Returns: - The loss of pi/actor. + This section of the logic is consistent with PPO, except that it does not record the + standard deviation of the actor distribution. """ distribution = self._actor_critic.actor(obs) logp_ = self._actor_critic.actor.log_prob(act) diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py index 826ff7c1a..cb144922a 100644 --- a/omnisafe/algorithms/on_policy/base/policy_gradient.py +++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py @@ -180,7 +180,7 @@ def _init_log(self) -> None: config=self._cfgs, ) - self._log_what_to_save() + self._setup_torch_saver() self._logger.torch_save() self._specific_save() @@ -584,8 +584,12 @@ def _loss_pi( ) return loss - def _log_what_to_save(self) -> None: - """Define what need to be saved below.""" + def _setup_torch_saver(self) -> None: + """Define what need to be saved below. + + OmniSafe's main storage interface is based on PyTorch. If you need to save models in other + formats, please use :meth:`_specific_save`. 
+ """ what_to_save: dict[str, Any] = {} what_to_save['pi'] = self._actor_critic.actor @@ -596,4 +600,4 @@ def _log_what_to_save(self) -> None: self._logger.setup_torch_saver(what_to_save) def _specific_save(self) -> None: - """Save some algorithms specific models per epoch.""" + """Save some algorithms specific models other than PyTorch format per epoch.""" diff --git a/omnisafe/configs/off-policy/SACRCBF.yaml b/omnisafe/configs/off-policy/SACRCBF.yaml index b4d182c50..f70327e6d 100644 --- a/omnisafe/configs/off-policy/SACRCBF.yaml +++ b/omnisafe/configs/off-policy/SACRCBF.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/omnisafe/configs/on-policy/PPOBetaCBF.yaml b/omnisafe/configs/on-policy/PPOBetaCBF.yaml index 4bd5f0f12..afb636e8b 100644 --- a/omnisafe/configs/on-policy/PPOBetaCBF.yaml +++ b/omnisafe/configs/on-policy/PPOBetaCBF.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/omnisafe/configs/on-policy/TRPOCBF.yaml b/omnisafe/configs/on-policy/TRPOCBF.yaml index 2b0b16126..c61d3df44 100644 --- a/omnisafe/configs/on-policy/TRPOCBF.yaml +++ b/omnisafe/configs/on-policy/TRPOCBF.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
From dd9068f6174cdd4cd142ac3e96d40966e39641c9 Mon Sep 17 00:00:00 2001 From: Gaiejj <524339208@qq.com> Date: Thu, 16 May 2024 15:17:56 +0800 Subject: [PATCH 10/18] style: fix pre-commit --- .pre-commit-config.yaml | 2 +- omnisafe/adapter/modelbased_adapter.py | 4 ++-- omnisafe/common/robust_barrier_solver.py | 3 ++- omnisafe/envs/safety_gymnasium_modelbased.py | 6 +++++- omnisafe/utils/plotter.py | 3 +-- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 96e584f57..51cbb81c1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: - id: debug-statements - id: double-quote-string-fixer - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.4.2 + rev: v0.4.4 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] diff --git a/omnisafe/adapter/modelbased_adapter.py b/omnisafe/adapter/modelbased_adapter.py index 8abbd90d7..6e2154531 100644 --- a/omnisafe/adapter/modelbased_adapter.py +++ b/omnisafe/adapter/modelbased_adapter.py @@ -269,8 +269,8 @@ def rollout( # pylint: disable=too-many-arguments,too-many-locals update_actor_critic_time = 0.0 update_dynamics_time = 0.0 - if use_eval: - eval_time = 0.0 + + eval_time = 0.0 epoch_steps = 0 diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py index 348a1a4c7..a871ccc4d 100644 --- a/omnisafe/common/robust_barrier_solver.py +++ b/omnisafe/common/robust_barrier_solver.py @@ -237,7 +237,6 @@ def get_cbf_qp_constraints( mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1).to(self.device) sigma_pred_batch = torch.unsqueeze(sigma_pred_batch, -1).to(self.device) if self.env.dynamics_mode == 'Unicycle': - num_cbfs = len(self.env.hazards) l_p = self.l_p buffer = 0.1 @@ -299,6 +298,8 @@ def get_cbf_qp_constraints( .to(self.device) ) q = torch.zeros((batch_size, n_u + 1)).to(self.device) + else: + raise NotImplementedError n_u = action_batch.shape[1] diff --git 
a/omnisafe/envs/safety_gymnasium_modelbased.py b/omnisafe/envs/safety_gymnasium_modelbased.py index fe5ae5071..2e1a00598 100644 --- a/omnisafe/envs/safety_gymnasium_modelbased.py +++ b/omnisafe/envs/safety_gymnasium_modelbased.py @@ -181,6 +181,8 @@ def get_cost_from_obs_tensor(self, obs: torch.Tensor, is_binary: bool = True) -> elif len(obs.shape) == 3: batch_size = obs.shape[0] * obs.shape[1] hazard_obs = obs[:, :, hazards_key].reshape(batch_size, -1, 2) + else: + raise NotImplementedError hazards_dist = torch.sqrt(torch.sum(torch.square(hazard_obs), dim=2)).reshape( batch_size, -1, @@ -497,8 +499,10 @@ def reset( self.get_lidar_from_coordinate(flat_coordinate_obs) info['obs_original'] = obs_original info['goal_met'] = False - obs = torch.as_tensor(flat_coordinate_obs, dtype=torch.float32, device=self._device) + else: + obs = torch.as_tensor(obs_original, dtype=torch.float32, device=self._device) + return obs, info def set_seed(self, seed: int) -> None: diff --git a/omnisafe/utils/plotter.py b/omnisafe/utils/plotter.py index 5bdbb7ec2..f24a97bb4 100644 --- a/omnisafe/utils/plotter.py +++ b/omnisafe/utils/plotter.py @@ -118,8 +118,7 @@ def plot_data( smoothed_x = np.convolve(x, y, 'same') / np.convolve(z, y, 'same') datum['Costs'] = smoothed_x - if isinstance(data, list): - data_to_plot = pd.concat(data, ignore_index=True) + data_to_plot = pd.concat(data, ignore_index=True) sns.lineplot( data=data_to_plot, x=xaxis, From d18b1ac2991851c9b9be89eea4d0afc1e2e5d26d Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Wed, 19 Jun 2024 20:13:27 +0800 Subject: [PATCH 11/18] feat: support customized plot --- examples/plot.py | 25 ++++++++++- omnisafe/common/experiment_grid.py | 8 +++- omnisafe/common/statistics_tools.py | 9 +++- omnisafe/utils/plotter.py | 64 +++++++++++++++++++++-------- 4 files changed, 87 insertions(+), 19 deletions(-) diff --git a/examples/plot.py b/examples/plot.py index c16974cce..a425587a7 100644 --- a/examples/plot.py +++ b/examples/plot.py @@ -1,4 +1,4 @@ 
-# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -35,6 +35,27 @@ parser.add_argument('--select', nargs='*') parser.add_argument('--exclude', nargs='*') parser.add_argument('--estimator', default='mean') + parser.add_argument( + '--reward-metrics', + type=str, + choices=[ + 'Metrics/TestEpRet', + 'Metrics/EpRet', + ], + default='Metrics/EpRet', + help='Specify the reward metric to be used.', + ) + parser.add_argument( + '--cost-metrics', + type=str, + choices=[ + 'Metrics/Max_angle_violation', + 'Metrics/TestEpCost', + 'Metrics/EpCost', + ], + default='Metrics/EpCost', + help='Specify the cost metric to be used.', + ) args = parser.parse_args() plotter = Plotter() @@ -48,4 +69,6 @@ select=args.select, exclude=args.exclude, estimator=args.estimator, + cost_metrics=args.cost_metrics, + reward_metrics=args.reward_metrics, ) diff --git a/omnisafe/common/experiment_grid.py b/omnisafe/common/experiment_grid.py index f93cef8d3..c47e5312a 100644 --- a/omnisafe/common/experiment_grid.py +++ b/omnisafe/common/experiment_grid.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -544,6 +544,8 @@ def analyze( compare_num: int | None = None, cost_limit: float | None = None, show_image: bool = False, + reward_metrics: str = 'Metrics/EpRet', + cost_metrics: str = 'Metrics/EpCost', ) -> None: """Analyze the experiment results. @@ -559,6 +561,8 @@ def analyze( cost_limit (float or None, optional): Value for one line showed on graph to indicate cost. Defaults to None. show_image (bool): Whether to show graph image in GUI windows. 
+ reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. """ assert self._statistical_tools is not None, 'Please run run() first!' self._statistical_tools.load_source(self.log_dir) @@ -568,6 +572,8 @@ compare_num, cost_limit, show_image=show_image, + reward_metrics=reward_metrics, + cost_metrics=cost_metrics, ) def evaluate(self, num_episodes: int = 10, cost_criteria: float = 1.0) -> None: diff --git a/omnisafe/common/statistics_tools.py b/omnisafe/common/statistics_tools.py index 3856b81a7..d2082918f 100644 --- a/omnisafe/common/statistics_tools.py +++ b/omnisafe/common/statistics_tools.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -83,6 +83,7 @@ def load_source(self, path: str) -> None: 'The config file is not found in the save directory.', ) from error + # pylint: disable-next=too-many-arguments, too-many-locals def draw_graph( self, parameter: str, @@ -91,6 +92,8 @@ def draw_graph( cost_limit: float | None = None, smooth: int = 1, show_image: bool = False, + reward_metrics: str = 'Metrics/EpRet', + cost_metrics: str = 'Metrics/EpCost', ) -> None: """Draw graph. @@ -102,6 +105,8 @@ def draw_graph( cost_limit (float or None, optional): The cost limit of the experiment. Defaults to None. smooth (int, optional): The smooth window size. Defaults to 1. show_image (bool): Whether to show graph image in GUI windows. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. .. note:: `values` and `compare_num` cannot be set at the same time. 
@@ -161,6 +166,8 @@ def draw_graph( 'mean', save_name=save_name, show_image=show_image, + reward_metrics=reward_metrics, + cost_metrics=cost_metrics, ) except Exception: # noqa # pragma: no cover # pylint: disable=broad-except print( diff --git a/omnisafe/utils/plotter.py b/omnisafe/utils/plotter.py index f24a97bb4..29e22caa9 100644 --- a/omnisafe/utils/plotter.py +++ b/omnisafe/utils/plotter.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -164,7 +164,13 @@ def plot_data( plt.tight_layout(pad=0.5) - def get_datasets(self, logdir: str, condition: str | None = None) -> list[DataFrame]: + def get_datasets( + self, + logdir: str, + condition: str | None = None, + reward_metrics: str = 'Metrics/EpRet', + cost_metrics: str = 'Metrics/EpCost', + ) -> list[DataFrame]: """Recursively look through logdir for files named "progress.txt". Assumes that any file "progress.txt" is a valid hit. @@ -172,9 +178,11 @@ def get_datasets(self, logdir: str, condition: str | None = None) -> list[DataFr Args: logdir (str): The directory to search for progress.txt files condition (str or None, optional): The condition label. Defaults to None. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. Returns: - The datasets. + list[DataFrame]: A list of DataFrame objects containing the datasets. Raise: FileNotFoundError: If the config file is not found. 
@@ -204,21 +212,21 @@ def get_datasets(self, logdir: str, condition: str | None = None) -> list[DataFr self.units[condition1] += 1 try: exp_data = pd.read_csv(os.path.join(root, 'progress.csv')) - except FileNotFoundError as error: progress_path = os.path.join(root, 'progress.csv') raise FileNotFoundError(f'Could not read from {progress_path}') from error - performance = ( - 'Metrics/TestEpRet' if 'Metrics/TestEpRet' in exp_data else 'Metrics/EpRet' - ) - cost_performance = ( - 'Metrics/TestEpCost' if 'Metrics/TestEpCost' in exp_data else 'Metrics/EpCost' - ) + + if reward_metrics not in exp_data: + raise KeyError(f'{reward_metrics} is not in data to plot!') + + if cost_metrics not in exp_data: + raise KeyError(f'{cost_metrics} is not in data to plot!') + exp_data.insert(len(exp_data.columns), 'Unit', unit) exp_data.insert(len(exp_data.columns), 'Condition1', condition1) exp_data.insert(len(exp_data.columns), 'Condition2', condition2) - exp_data.insert(len(exp_data.columns), 'Rewards', exp_data[performance]) - exp_data.insert(len(exp_data.columns), 'Costs', exp_data[cost_performance]) + exp_data.insert(len(exp_data.columns), 'Rewards', exp_data[reward_metrics]) + exp_data.insert(len(exp_data.columns), 'Costs', exp_data[cost_metrics]) epoch = exp_data.get('Train/Epoch') if epoch is None or steps_per_epoch is None: raise ValueError('No Train/Epoch column in progress.csv') @@ -236,6 +244,8 @@ def get_all_datasets( legend: list[str] | None = None, select: str | None = None, exclude: str | None = None, + reward_metrics: str = 'Metrics/EpCost', + cost_metrics: str = 'Metrics/EpCost', ) -> list[DataFrame]: """Get all the data from all the log directories. @@ -248,6 +258,8 @@ def get_all_datasets( legend (list of str or None, optional): List of legend names. Defaults to None. select (str or None, optional): Select logdirs that contain this string. Defaults to None. exclude (str or None, optional): Exclude logdirs that contain this string. Defaults to None. 
+ reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. Returns: All the data stored in a list of DataFrames. @@ -285,13 +297,22 @@ def get_all_datasets( data = [] if legend: for log, leg in zip(logdirs, legend): - data += self.get_datasets(log, leg) + data += self.get_datasets( + log, + leg, + cost_metrics=cost_metrics, + reward_metrics=reward_metrics, + ) else: for log in logdirs: - data += self.get_datasets(log) + data += self.get_datasets( + log, + cost_metrics=cost_metrics, + reward_metrics=reward_metrics, + ) return data - # pylint: disable-next=too-many-arguments + # pylint: disable-next=too-many-arguments, too-many-locals def make_plots( self, all_logdirs: list[str], @@ -308,6 +329,8 @@ def make_plots( save_name: str | None = None, save_format: str = 'png', show_image: bool = False, + reward_metrics: str = 'Metrics/EpCost', + cost_metrics: str = 'Metrics/EpCost', ) -> None: """Make plots from the data in the specified log directories. @@ -355,9 +378,18 @@ def make_plots( to ``png``. show_image (bool, optional): Optional flag. If set, the plot will be displayed on screen. Defaults to ``False``. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. """ assert xaxis is not None, 'Must specify xaxis' - data = self.get_all_datasets(all_logdirs, legend, select, exclude) + data = self.get_all_datasets( + all_logdirs, + legend, + select, + exclude, + cost_metrics=cost_metrics, + reward_metrics=reward_metrics, + ) condition = 'Condition2' if count else 'Condition1' # choose what to show on main curve: mean? max? min? 
estimator = getattr(np, estimator) From 7423bc16ad7dc2d225c79865a9506b04885f894c Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Wed, 19 Jun 2024 20:15:13 +0800 Subject: [PATCH 12/18] fix: fix cuda error --- omnisafe/adapter/beta_barrier_function_adapter.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py index 9364b5282..1ab488d88 100644 --- a/omnisafe/adapter/beta_barrier_function_adapter.py +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -96,6 +96,7 @@ def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: Returns: tuple: Two torch tensors representing the lower and upper bounds for each observation in the batch. """ + device = obs.device obs = obs.cpu().detach().numpy() batch_size = obs.shape[0] @@ -104,8 +105,8 @@ def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: for i in range(batch_size): lbs[i], ubs[i] = f(obs[i]) - lbs = torch.FloatTensor(lbs).reshape(batch_size, 1) - ubs = torch.FloatTensor(ubs).reshape(batch_size, 1) + lbs = torch.FloatTensor(lbs).reshape(batch_size, 1).to(device) + ubs = torch.FloatTensor(ubs).reshape(batch_size, 1).to(device) return lbs, ubs From 39341928a818ea2af67975590a97df24be256558 Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Wed, 19 Jun 2024 20:16:35 +0800 Subject: [PATCH 13/18] fix: fix compensator saving --- omnisafe/algorithms/off_policy/ddpg_cbf.py | 13 +++++++++--- .../on_policy/barrier_function/trpo_cbf.py | 21 +++++++++++++++++-- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py index 17dcacda3..f69310fff 100644 --- a/omnisafe/algorithms/off_policy/ddpg_cbf.py +++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py @@ -51,7 +51,7 @@ def _init_env(self) -> None: self._seed, self._cfgs, ) - solver = PendulumSolver(device=self._cfgs.train_cfgs.device) + solver = 
PendulumSolver(device=self._device) compensator = BarrierCompensator( obs_dim=self._env.observation_space.shape[0], act_dim=self._env.action_space.shape[0], @@ -120,11 +120,18 @@ def _specific_save(self) -> None: os.makedirs(os.path.dirname(path), exist_ok=True) joblib.dump(self._env.gp_models, path) - def _log_what_to_save(self) -> dict[str, Any]: - """Define what need to be saved below.""" + def _setup_torch_saver(self) -> None: + """Define what need to be saved below. + + OmniSafe's main storage interface is based on PyTorch. If you need to save models in other + formats, please use :meth:`_specific_save`. + """ what_to_save: dict[str, Any] = {} what_to_save['pi'] = self._actor_critic.actor what_to_save['compensator'] = self._env.compensator + if self._cfgs.algo_cfgs.obs_normalize: + obs_normalizer = self._env.save()['obs_normalizer'] + what_to_save['obs_normalizer'] = obs_normalizer self._logger.setup_torch_saver(what_to_save) diff --git a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py index 8125151d6..b0b64f892 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py @@ -28,6 +28,7 @@ from omnisafe.algorithms.on_policy.base.trpo import TRPO from omnisafe.common.barrier_comp import BarrierCompensator from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.typing import Any from omnisafe.utils import distributed from omnisafe.utils.distributed import get_rank @@ -70,12 +71,12 @@ def _init_env(self) -> None: // distributed.world_size() // self._cfgs.train_cfgs.vector_env_nums ) - self.solver = PendulumSolver(device=self._cfgs.train_cfgs.device) + self.solver = PendulumSolver(device=self._device) self.compensator = BarrierCompensator( obs_dim=self._env.observation_space.shape[0], act_dim=self._env.action_space.shape[0], cfgs=self._cfgs.compensator_cfgs, - ) + ).to(self._device) 
self._env.set_solver(solver=self.solver) self._env.set_compensator(compensator=self.compensator) @@ -165,3 +166,19 @@ def _specific_save(self) -> None: ) os.makedirs(os.path.dirname(path), exist_ok=True) joblib.dump(self._env.gp_models, path) + + def _setup_torch_saver(self) -> None: + """Define what need to be saved below. + + OmniSafe's main storage interface is based on PyTorch. If you need to save models in other + formats, please use :meth:`_specific_save`. + """ + what_to_save: dict[str, Any] = {} + + what_to_save['pi'] = self._actor_critic.actor + what_to_save['compensator'] = self._env.compensator + if self._cfgs.algo_cfgs.obs_normalize: + obs_normalizer = self._env.save()['obs_normalizer'] + what_to_save['obs_normalizer'] = obs_normalizer + + self._logger.setup_torch_saver(what_to_save) From 9809987454f81c188af301de880cd993d47f40ed Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Wed, 19 Jun 2024 20:28:35 +0800 Subject: [PATCH 14/18] style: fix spelling --- omnisafe/common/experiment_grid.py | 2 +- omnisafe/common/statistics_tools.py | 2 +- omnisafe/utils/plotter.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/omnisafe/common/experiment_grid.py b/omnisafe/common/experiment_grid.py index c47e5312a..787f4592f 100644 --- a/omnisafe/common/experiment_grid.py +++ b/omnisafe/common/experiment_grid.py @@ -561,7 +561,7 @@ def analyze( cost_limit (float or None, optional): Value for one line showed on graph to indicate cost. Defaults to None. show_image (bool): Whether to show graph image in GUI windows. - reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. """ assert self._statistical_tools is not None, 'Please run run() first!' 
diff --git a/omnisafe/common/statistics_tools.py b/omnisafe/common/statistics_tools.py index d2082918f..72e661c33 100644 --- a/omnisafe/common/statistics_tools.py +++ b/omnisafe/common/statistics_tools.py @@ -105,7 +105,7 @@ def draw_graph( cost_limit (float or None, optional): The cost limit of the experiment. Defaults to None. smooth (int, optional): The smooth window size. Defaults to 1. show_image (bool): Whether to show graph image in GUI windows. - reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. .. note:: diff --git a/omnisafe/utils/plotter.py b/omnisafe/utils/plotter.py index 29e22caa9..e592240be 100644 --- a/omnisafe/utils/plotter.py +++ b/omnisafe/utils/plotter.py @@ -168,7 +168,7 @@ def get_datasets( self, logdir: str, condition: str | None = None, - reward_metrics: str = 'Metrics/EpReward', + reward_metrics: str = 'Metrics/EpRet', cost_metrics: str = 'Metrics/EpCost', ) -> list[DataFrame]: """Recursively look through logdir for files named "progress.txt". @@ -178,7 +178,7 @@ def get_datasets( Args: logdir (str): The directory to search for progress.txt files condition (str or None, optional): The condition label. Defaults to None. - reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. Returns: @@ -258,7 +258,7 @@ def get_all_datasets( legend (list of str or None, optional): List of legend names. Defaults to None. select (str or None, optional): Select logdirs that contain this string. Defaults to None. exclude (str or None, optional): Exclude logdirs that contain this string. 
Defaults to None. - reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. Returns: @@ -378,7 +378,7 @@ def make_plots( to ``png``. show_image (bool, optional): Optional flag. If set, the plot will be displayed on screen. Defaults to ``False``. - reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpReward'. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. """ assert xaxis is not None, 'Must specify xaxis' From 354dfa1878551e938f32e3010a051a1a8a66b5f6 Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Thu, 4 Jul 2024 19:38:08 +0800 Subject: [PATCH 15/18] refactor(cbf): refactor CBF methods --- .pre-commit-config.yaml | 4 +- omnisafe/adapter/barrier_function_adapter.py | 155 ++++++++----- .../offpolicy_barrier_function_adapter.py | 85 ++++--- .../robust_barrier_function_adapter.py | 6 +- omnisafe/algorithms/off_policy/ddpg_cbf.py | 48 +--- omnisafe/algorithms/off_policy/sac_rcbf.py | 17 +- .../on_policy/barrier_function/trpo_cbf.py | 99 +------- omnisafe/common/__init__.py | 3 + omnisafe/common/barrier_comp.py | 6 +- omnisafe/common/barrier_solver.py | 192 +++------------ omnisafe/common/gp_model.py | 218 ++++++++++++++++++ omnisafe/common/robust_gp_model.py | 12 +- omnisafe/evaluator.py | 75 +++--- pyproject.toml | 1 - 14 files changed, 482 insertions(+), 439 deletions(-) create mode 100644 omnisafe/common/gp_model.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 51cbb81c1..2f04378f6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: - id: debug-statements - id: double-quote-string-fixer - repo: 
https://github.com/charliermarsh/ruff-pre-commit - rev: v0.4.4 + rev: v0.5.0 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] @@ -54,7 +54,7 @@ repos: - id: pyupgrade args: [--py38-plus] # sync with requires-python - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 + rev: 7.1.0 hooks: - id: flake8 additional_dependencies: diff --git a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py index c5581400a..c247f7705 100644 --- a/omnisafe/adapter/barrier_function_adapter.py +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -16,6 +16,8 @@ from __future__ import annotations +from typing import Any + import torch from rich.progress import track from sklearn.gaussian_process import GaussianProcessRegressor @@ -24,6 +26,7 @@ from omnisafe.common.barrier_comp import BarrierCompensator from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.gp_model import DynamicsModel from omnisafe.common.logger import Logger from omnisafe.envs.wrapper import AutoReset, CostNormalize, RewardNormalize, TimeLimit, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic @@ -47,9 +50,29 @@ class BarrierFunctionAdapter(OnPolicyAdapter): def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: """Initialize an instance of :class:`BarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) - self.solver: PendulumSolver - self.compensator: BarrierCompensator - self.first_iter = 1 + + if env_id == 'Pendulum-v1': + self.solver: PendulumSolver = PendulumSolver( + action_size=self.action_space.shape[0], # type: ignore + device=self._device, + ) + self.dynamics_model: DynamicsModel = DynamicsModel( + observation_size=self.observation_space.shape[0], # type: ignore + ) + else: + raise NotImplementedError(f'Please implement solver for {env_id} !') + self.compensator: BarrierCompensator = 
BarrierCompensator( + obs_dim=self.observation_space.shape[0], # type: ignore + act_dim=self.action_space.shape[0], # type: ignore + cfgs=cfgs.compensator_cfgs, + ).to(self._device) + self.first_iter: bool = True + + self.episode_rollout: dict[str, Any] = {} + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + self.episode_rollout['approx_compensating_act'] = [] + self.episode_rollout['compensating_act'] = [] def _wrapper( self, @@ -89,17 +112,9 @@ def _wrapper( if self._env.num_envs == 1: self._env = Unsqueeze(self._env, device=self._device) - def set_solver(self, solver: PendulumSolver) -> None: - """Set the barrier function solver for Pendulum environment.""" - self.solver = solver - - def set_compensator(self, compensator: BarrierCompensator) -> None: - """Set the action compensator.""" - self.compensator = compensator - def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" - self.solver.reset_gp_model() + self.dynamics_model.reset_gp_model() def rollout( # pylint: disable=too-many-locals,too-many-branches self, @@ -118,12 +133,10 @@ def rollout( # pylint: disable=too-many-locals,too-many-branches logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. 
""" self._reset_log() - if not self.first_iter: - self.reset_gp_model() obs, _ = self.reset() - path_obs = [] - path_act = [] + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] for step in track( range(steps_per_epoch), description=f'Processing rollout for epoch: {logger.current_epoch}...', @@ -134,46 +147,29 @@ def rollout( # pylint: disable=too-many-locals,too-many-branches act_dist = agent.actor(obs) act_mean, act_std = act_dist.mean, agent.actor.std - approx_compensating_act = self.compensator(obs=obs) - compensated_act_mean_raw = act_mean + approx_compensating_act - - if self.first_iter: - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) - else: - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=True) - - compensating_act = self.solver.control_barrier( - compensated_act_mean_raw, - f, - g, - x, - std, + safe_act = self.get_safe_action( + obs, + act_mean, + act_std, ) + logp = agent.actor.log_prob(safe_act) - compensated_act_mean = compensated_act_mean_raw + compensating_act - final_act = torch.normal(compensated_act_mean, act_std) - - logp = agent.actor.log_prob(final_act) - - path_obs.append(obs) - path_act.append(final_act) - - next_obs, reward, cost, terminated, truncated, info = self.step(final_act) + self.episode_rollout['obs'].append(obs) + self.episode_rollout['final_act'].append(safe_act) + next_obs, reward, cost, terminated, truncated, info = self.step(safe_act) self._log_value(reward=reward, cost=cost, info=info) logger.store({'Value/reward': value_r}) buffer.store( obs=obs, - act=final_act, + act=safe_act, reward=reward, cost=cost, value_r=value_r, value_c=value_c, logp=logp, - approx_compensating_act=approx_compensating_act.detach(), - compensating_act=compensating_act.detach(), ) obs = next_obs @@ -203,25 +199,72 @@ def rollout( # pylint: disable=too-many-locals,too-many-branches if done or time_out: self._log_metrics(logger, idx) + compensator_loss = self.compensator.update( + 
torch.cat(self.episode_rollout['obs']), + torch.cat(self.episode_rollout['approx_compensating_act']), + torch.cat(self.episode_rollout['compensating_act']), + ) + logger.store({'Value/Loss_compensator': compensator_loss.item()}) + self.dynamics_model.update_gp_dynamics( + obs=torch.cat(self.episode_rollout['obs']), # type: ignore + act=torch.cat(self.episode_rollout['final_act']), # type: ignore + ) + + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + self.episode_rollout['approx_compensating_act'] = [] + self.episode_rollout['compensating_act'] = [] + self._reset_log(idx) + obs, _ = self.reset() + buffer.finish_path(last_value_r, last_value_c, idx) + self.first_iter = False + self.reset_gp_model() - self._ep_ret[idx] = 0.0 - self._ep_cost[idx] = 0.0 - self._ep_len[idx] = 0.0 + def get_safe_action( + self, + obs: torch.Tensor, + act_mean: torch.Tensor, + act_std: torch.Tensor, + ) -> torch.Tensor: + """Computes a safe action by applying compensatory actions. - if step < self._cfgs.algo_cfgs.update_dynamics_steps: - self.solver.update_gp_dynamics( - obs=torch.cat(path_obs), # type: ignore - act=torch.cat(path_act), # type: ignore - ) + .. note:: + This is the core method of the CBF method. Users can modify this function to implement + customized action mapping. - path_obs = [] - path_act = [] - obs, _ = self.reset() - buffer.finish_path(last_value_r, last_value_c, idx) - self.first_iter = 0 + Args: + obs (torch.Tensor): The current observation from the environment. + act_mean (torch.Tensor): The mean of proposed action to be controlled for safety. + act_std (torch.Tensor): The standard deviation of proposed action to be controlled for safety. + + Returns: + list(torch.Tensor): The safe actions for interaction and compensating actions for compensator training. 
+ """ + with torch.no_grad(): + approx_compensating_act = self.compensator(obs=obs) + compensated_act_mean_raw = act_mean + approx_compensating_act + + [f, g, x, std] = self.dynamics_model.get_gp_dynamics( + obs, + use_prev_model=not self.first_iter, + ) + compensating_act = self.solver.control_barrier( + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, + ) + + compensated_act_mean = compensated_act_mean_raw + compensating_act + safe_act = torch.normal(compensated_act_mean, act_std) + self.episode_rollout['compensating_act'].append(compensating_act) + self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) + + return safe_act @property def gp_models(self) -> list[GaussianProcessRegressor]: """Return the gp models to be saved.""" - return self.solver.gp_models + return self.dynamics_model.gp_models diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py index e6bff40d6..20b4abdb8 100644 --- a/omnisafe/adapter/offpolicy_barrier_function_adapter.py +++ b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ from omnisafe.common.barrier_comp import BarrierCompensator from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.buffer import VectorOffPolicyBuffer +from omnisafe.common.gp_model import DynamicsModel from omnisafe.common.logger import Logger from omnisafe.envs.wrapper import CostNormalize, RewardNormalize, Unsqueeze from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic @@ -34,21 +35,47 @@ class OffPolicyBarrierFunctionAdapter(OffPolicyAdapter): """OffPolicy Barrier Function Adapter for OmniSafe. 
- :class:`OffPolicyBarrierFunctionAdapter` is used to adapt the environment with CBF controller. + :class:`OffPolicyBarrierFunctionAdapter` is used to adapt the environment with a CBF controller, + mapping the agent actions from unsafe ones to safe ones. Args: env_id (str): The environment id. num_envs (int): The number of environments. seed (int): The random seed. cfgs (Config): The configuration. + + Attributes: + solver (PendulumSolver): The solver used for the environment, currently supporting + ``Pendulum-v1``. + dynamics_model (DynamicsModel): The dynamics model used to predict the environment's behavior. + compensator (BarrierCompensator): The compensator used to approximate previous actions. + first_iter (bool): A flag indicating if it is the first iteration. + episode_rollout (dict[str, Any]): A dictionary to store the episode rollout information, + including observations and various actions, + useful for updating compensator. """ def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: - """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + """Initialize an instance of :class:`OffPolicyBarrierFunctionAdapter`.""" super().__init__(env_id, num_envs, seed, cfgs) - self.solver: PendulumSolver - self.compensator: BarrierCompensator - self.first_iter: int = 1 + + if env_id == 'Pendulum-v1': + self.solver: PendulumSolver = PendulumSolver( + action_size=self.action_space.shape[0], # type: ignore + device=self._device, + ) + self.dynamics_model: DynamicsModel = DynamicsModel( + observation_size=self.observation_space.shape[0], # type: ignore + ) + else: + raise NotImplementedError(f'Please implement solver for {env_id} !') + self.compensator: BarrierCompensator = BarrierCompensator( + obs_dim=self.observation_space.shape[0], # type: ignore + act_dim=self.action_space.shape[0], # type: ignore + cfgs=cfgs.compensator_cfgs, + ).to(self._device) + + self.first_iter: bool = True self.episode_rollout: dict[str, Any] = {} 
self.episode_rollout['obs'] = [] self.episode_rollout['final_act'] = [] @@ -110,17 +137,9 @@ def eval_policy( # pylint: disable=too-many-locals }, ) - def set_solver(self, solver: PendulumSolver) -> None: - """Set the barrier function solver for Pendulum environment.""" - self.solver = solver - - def set_compensator(self, compensator: BarrierCompensator) -> None: - """Set the action compensator.""" - self.compensator = compensator - def reset_gp_model(self) -> None: """Reset the gaussian processing model of barrier function solver.""" - self.solver.reset_gp_model() + self.dynamics_model.reset_gp_model() def rollout( # pylint: disable=too-many-locals self, @@ -130,7 +149,7 @@ def rollout( # pylint: disable=too-many-locals logger: Logger, use_rand_action: bool, ) -> None: - """Rollout in off-policy manner with barrier function controller. + """Rollout in off-policy manner with the ``dynamics_model``, ``solver`` and ``compensator``. Args: rollout_step (int): Number of rollout steps. @@ -173,7 +192,7 @@ def rollout( # pylint: disable=too-many-locals torch.cat(self.episode_rollout['compensating_act']), ) logger.store({'Value/Loss_compensator': compensator_loss.item()}) - self.solver.update_gp_dynamics( + self.dynamics_model.update_gp_dynamics( obs=torch.cat(self.episode_rollout['obs']), # type: ignore act=torch.cat(self.episode_rollout['final_act']), # type: ignore ) @@ -185,9 +204,8 @@ def rollout( # pylint: disable=too-many-locals self._reset_log(idx) self._current_obs, _ = self._env.reset() - self.first_iter = 0 - if not self.first_iter: - self.reset_gp_model() + self.first_iter = False + self.reset_gp_model() def get_safe_action( self, @@ -197,24 +215,33 @@ def get_safe_action( ) -> torch.Tensor: """Computes a safe action by applying compensatory actions. + .. note:: + This is the core method of the CBF method. Users can modify this function to implement + customized action mapping. + Args: obs (torch.Tensor): The current observation from the environment. 
- act (torch.Tensor): The proposed action to be evaluated for safety. + act (torch.Tensor): The proposed action to be controlled for safety. is_eval (bool, optional): A flag to indicate whether this is an evaluation phase, defaulting to False. Returns: torch.Tensor: The safe action to be executed in the environment. """ with torch.no_grad(): - approx_compensating_act = self.compensator(obs=self._current_obs) + approx_compensating_act = self.compensator(obs=obs) compensated_act_mean_raw = act + approx_compensating_act - if self.first_iter: - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=False) - else: - [f, g, x, std] = self.solver.get_gp_dynamics(obs, use_prev_model=True) - - compensating_act = self.solver.control_barrier(compensated_act_mean_raw, f, g, x, std) + [f, g, x, std] = self.dynamics_model.get_gp_dynamics( + obs, + use_prev_model=not self.first_iter, + ) + compensating_act = self.solver.control_barrier( + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, + ) safe_act = compensated_act_mean_raw + compensating_act if not is_eval: @@ -226,4 +253,4 @@ def get_safe_action( @property def gp_models(self) -> list[GaussianProcessRegressor]: """Return the gp models to be saved.""" - return self.solver.gp_models + return self.dynamics_model.gp_models diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py index ade39d12f..cc5a22e02 100644 --- a/omnisafe/adapter/robust_barrier_function_adapter.py +++ b/omnisafe/adapter/robust_barrier_function_adapter.py @@ -60,9 +60,9 @@ def _wrapper( """Wrapper the environment. .. warning:: - Since solving the optimization problem requires obtaining physical quantities with practical - significance from state observations, the Barrier Function Adapter does not support - normalization of observations. 
+ Since solving the optimization problem requires obtaining physical quantities with + practical significance from state observations, the Barrier Function Adapter does not + support normalization of observations. Args: obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py index f69310fff..6df1fcbb3 100644 --- a/omnisafe/algorithms/off_policy/ddpg_cbf.py +++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py @@ -21,13 +21,10 @@ import os import joblib -import torch from omnisafe.adapter.offpolicy_barrier_function_adapter import OffPolicyBarrierFunctionAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.off_policy.ddpg import DDPG -from omnisafe.common.barrier_comp import BarrierCompensator -from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.typing import Any from omnisafe.utils.distributed import get_rank @@ -45,56 +42,13 @@ class DDPGCBF(DDPG): """ def _init_env(self) -> None: + super()._init_env() self._env: OffPolicyBarrierFunctionAdapter = OffPolicyBarrierFunctionAdapter( self._env_id, self._cfgs.train_cfgs.vector_env_nums, self._seed, self._cfgs, ) - solver = PendulumSolver(device=self._device) - compensator = BarrierCompensator( - obs_dim=self._env.observation_space.shape[0], - act_dim=self._env.action_space.shape[0], - cfgs=self._cfgs.compensator_cfgs, - ).to(self._device) - - self._env.set_compensator(compensator=compensator) - self._env.set_solver(solver=solver) - - assert ( - self._cfgs.algo_cfgs.steps_per_epoch % self._cfgs.train_cfgs.vector_env_nums == 0 - ), 'The number of steps per epoch is not divisible by the number of environments.' - - assert ( - int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0 - ), 'The total number of steps is not divisible by the number of steps per epoch.' 
- self._epochs: int = int( - self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch, - ) - self._epoch: int = 0 - self._steps_per_epoch: int = ( - self._cfgs.algo_cfgs.steps_per_epoch // self._cfgs.train_cfgs.vector_env_nums - ) - - self._update_cycle: int = self._cfgs.algo_cfgs.update_cycle - assert ( - self._steps_per_epoch % self._update_cycle == 0 - ), 'The number of steps per epoch is not divisible by the number of steps per sample.' - self._samples_per_epoch: int = self._steps_per_epoch // self._update_cycle - self._update_count: int = 0 - - def _init(self) -> None: - super()._init() - self._buf.add_field( - name='approx_compensating_act', - shape=self._env.action_space.shape, - dtype=torch.float32, - ) - self._buf.add_field( - name='compensating_act', - shape=self._env.action_space.shape, - dtype=torch.float32, - ) def _init_log(self) -> None: """Log the DDPGCBF specific information. diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py index e6c020770..598c4a14c 100644 --- a/omnisafe/algorithms/off_policy/sac_rcbf.py +++ b/omnisafe/algorithms/off_policy/sac_rcbf.py @@ -49,13 +49,16 @@ def _init_env(self) -> None: self._seed, self._cfgs, ) - solver = CBFQPLayer( - env=self._env, - device=self._cfgs.train_cfgs.device, - gamma_b=self._cfgs.cbf_cfgs.gamma_b, - l_p=self._cfgs.cbf_cfgs.l_p, - ) - dynamics_model = DynamicsModel(env=self._env) + if self._env_id == 'Unicycle': + solver = CBFQPLayer( + env=self._env, + device=self._cfgs.train_cfgs.device, + gamma_b=self._cfgs.cbf_cfgs.gamma_b, + l_p=self._cfgs.cbf_cfgs.l_p, + ) + dynamics_model = DynamicsModel(env=self._env) + else: + raise NotImplementedError(f'Please implement solver for {self._env_id} !') self._env.set_dynamics_model(dynamics_model=dynamics_model) self._env.set_solver(solver=solver) diff --git a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py index 
b0b64f892..0324170c4 100644 --- a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py +++ b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,16 +20,11 @@ import os import joblib -import torch -from torch.utils.data import DataLoader, TensorDataset from omnisafe.adapter.barrier_function_adapter import BarrierFunctionAdapter from omnisafe.algorithms import registry from omnisafe.algorithms.on_policy.base.trpo import TRPO -from omnisafe.common.barrier_comp import BarrierCompensator -from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.typing import Any -from omnisafe.utils import distributed from omnisafe.utils.distributed import get_rank @@ -57,103 +52,13 @@ def _init_log(self) -> None: self._logger.register_key('Value/Loss_compensator') def _init_env(self) -> None: + super()._init_env() self._env: BarrierFunctionAdapter = BarrierFunctionAdapter( self._env_id, self._cfgs.train_cfgs.vector_env_nums, self._seed, self._cfgs, ) - assert (self._cfgs.algo_cfgs.steps_per_epoch) % ( - distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums - ) == 0, 'The number of steps per epoch is not divisible by the number of environments.' 
- self._steps_per_epoch: int = ( - self._cfgs.algo_cfgs.steps_per_epoch - // distributed.world_size() - // self._cfgs.train_cfgs.vector_env_nums - ) - self.solver = PendulumSolver(device=self._device) - self.compensator = BarrierCompensator( - obs_dim=self._env.observation_space.shape[0], - act_dim=self._env.action_space.shape[0], - cfgs=self._cfgs.compensator_cfgs, - ).to(self._device) - self._env.set_solver(solver=self.solver) - self._env.set_compensator(compensator=self.compensator) - - def _init(self) -> None: - super()._init() - self._buf.add_field( - name='approx_compensating_act', - shape=self._env.action_space.shape, - dtype=torch.float32, - ) - self._buf.add_field( - name='compensating_act', - shape=self._env.action_space.shape, - dtype=torch.float32, - ) - - def _update(self) -> None: - """Update actor, critic. - - .. hint:: - Here are some differences between NPG and Policy Gradient (PG): In PG, the actor network - and the critic network are updated together. When the KL divergence between the old - policy, and the new policy is larger than a threshold, the update is rejected together. - - In NPG, the actor network and the critic network are updated separately. When the KL - divergence between the old policy, and the new policy is larger than a threshold, the - update of the actor network is rejected, but the update of the critic network is still - accepted. 
- """ - data = self._buf.get() - - ( - obs, - act, - logp, - target_value_r, - adv_r, - adv_c, - approx_compensating_act, - compensating_act, - ) = ( - data['obs'], - data['act'], - data['logp'], - data['target_value_r'], - data['adv_r'], - data['adv_c'], - data['approx_compensating_act'], - data['compensating_act'], - ) - - self._update_actor(obs, act, logp, adv_r, adv_c) - compensator_loss = self._env.compensator.update( - observation=obs, - approx_compensating_act=approx_compensating_act, - compensating_act=compensating_act, - ) - dataloader = DataLoader( - dataset=TensorDataset(obs, target_value_r), - batch_size=self._cfgs.algo_cfgs.batch_size, - shuffle=True, - ) - - for _ in range(self._cfgs.algo_cfgs.update_iters): - for ( - obs, - target_value_r, - ) in dataloader: - self._update_reward_critic(obs, target_value_r) - - self._logger.store( - { - 'Train/StopIter': self._cfgs.algo_cfgs.update_iters, - 'Value/Adv': adv_r.mean().item(), - 'Value/Loss_compensator': compensator_loss.item(), - }, - ) def _specific_save(self) -> None: """Save some algorithms specific models per epoch.""" diff --git a/omnisafe/common/__init__.py b/omnisafe/common/__init__.py index 9e4fc1bf1..c1311f150 100644 --- a/omnisafe/common/__init__.py +++ b/omnisafe/common/__init__.py @@ -14,6 +14,9 @@ # ============================================================================== """Common Common utilities for OmniSafe.""" +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.gp_model import DynamicsModel from omnisafe.common.lagrange import Lagrange from omnisafe.common.logger import Logger from omnisafe.common.normalizer import Normalizer diff --git a/omnisafe/common/barrier_comp.py b/omnisafe/common/barrier_comp.py index 891932188..64d1af104 100644 --- a/omnisafe/common/barrier_comp.py +++ b/omnisafe/common/barrier_comp.py @@ -27,9 +27,9 @@ class BarrierCompensator(torch.nn.Module): """A module that 
represents a barrier compensator using a multi-layer perceptron (MLP) network. - This module is designed to compute actions based on observations, with the intention of compensating for - potential barriers in a control system or a similar application. It is built upon a configurable MLP network - and trained using an optimization routine. + This module is designed to compute actions based on observations, with the intention of + compensating for potential barriers in a control system or a similar application. It is built + upon a configurable MLP network and trained using an optimization routine. Attributes: obs_dim (int): Dimension of the observation space. diff --git a/omnisafe/common/barrier_solver.py b/omnisafe/common/barrier_solver.py index 35221281e..f281fd0e6 100644 --- a/omnisafe/common/barrier_solver.py +++ b/omnisafe/common/barrier_solver.py @@ -22,73 +22,70 @@ import warnings -import joblib import numpy as np import torch from cvxopt import matrix, solvers -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import RBF -from sklearn.gaussian_process.kernels import ConstantKernel as C + +from omnisafe.typing import DEVICE_CPU # pylint: disable-next=too-many-instance-attributes class PendulumSolver: - """Solver for the pendulum problem using Gaussian Process models. + """The CBF solver for the pendulum problem using Gaussian Process models. + + This class implements a solver for the pendulum control problem using Control Barrier Functions + (CBFs). The primary goal is to ensure safe reinforcement learning by maintaining + safety constraints during the control process. + + For more details, please refer to: + + *End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous + Control Tasks* Attributes: - action_size (int): Size of the action space. - observation_size (int): Size of the observation space. - torque_bound (float): Maximum torque bound. 
- max_speed (float): Maximum speed of the pendulum. - device (str): Device to run the computations on. + action_size (int): Size of the action space, typically 1 for the pendulum. + torque_bound (float): Maximum torque bound that can be applied to the pendulum. + max_speed (float): Maximum speed (angular velocity) of the pendulum. + device (torch.device): Device to run the computations on. """ # pylint: disable-next=invalid-name def __init__( self, action_size: int = 1, - observation_size: int = 3, torque_bound: float = 15.0, max_speed: float = 60.0, - device: str = 'cpu', + device: torch.device = DEVICE_CPU, ) -> None: - """Initialize the PendulumSolver with specified parameters.""" + """Initialize the PendulumSolver with specified parameters. + + Args: + action_size (int): Size of the action space, typically 1 for the pendulum. + torque_bound (float): Maximum torque bound that can be applied to the pendulum. + max_speed (float): Maximum speed (angular velocity) of the pendulum. + device (torch.device): Device to run the computations on. + + Attributes: + F (float): A control gain factor used in the CBF computation. + _gamma_b (float): Parameter for the barrier function. + _kd (float): Damping coefficient used in the barrier function. 
+ """ self.action_size = action_size - self.observation_size = observation_size self.torque_bound = torque_bound self.max_speed = max_speed self.F = 1.0 self._device = device self._gamma_b = 0.5 self._kd = 1.5 - self.gp_model_prev: list[GaussianProcessRegressor, GaussianProcessRegressor] - self.gp_model: list[GaussianProcessRegressor, GaussianProcessRegressor] - self._build_barrier() - self.build_gp_model() warnings.filterwarnings('ignore') - def build_gp_model(self, save_dir: str | None = None) -> None: - """Build the Gaussian Process model.""" - gp_list = [] - noise = 0.01 - for _ in range(self.observation_size - 1): - if not save_dir: - kern = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) - gp = GaussianProcessRegressor(kernel=kern, alpha=noise, n_restarts_optimizer=10) - gp_list.append(gp) - else: - gp_list = joblib.load(save_dir) - self.gp_model = gp_list - self.gp_model_prev = gp_list.copy() - - @property - def gp_models(self) -> list[GaussianProcessRegressor]: - """Return all gaussian process regressor for saving.""" - return self.gp_model - def _build_barrier(self) -> None: - """Build the barrier for the pendulum solver.""" + """Construct the Control Barrier Function (CBF) for safe control of the pendulum. + + This method initializes and sets up the necessary components for the CBF, which is used to + ensure that the control actions taken do not violate safety constraints. + """ self.P = matrix(np.diag([1.0, 1e16]), tc='d') self.q = matrix(np.zeros(self.action_size + 1)) self.h1 = np.array([1, 0.01]) @@ -193,122 +190,3 @@ def control_barrier( # pylint: disable=invalid-name print('Error in QP') return torch.as_tensor(u_bar[0], dtype=torch.float32, device=self._device).unsqueeze(dim=0) - - # pylint: disable-next=attribute-defined-outside-init,import-outside-toplevel,invalid-name - def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: - """Calculate the dynamics of the system. 
- - Args: - obs (list[float]): The current observation of the system state. - original_action (float): The original action proposed by the RL algorithm. - - Returns: - np.ndarray: The calculated dynamics of the system. - """ - # time step - dt = 0.05 - # gravitational constant - G = 10 - # mass - m = 2 - # length - length = 2 - - # calculate the angle - theta = np.arctan2(obs[1], obs[0]) - # angular velocity - theta_dot = obs[2] - - # dynamics equations - f = np.array( - [ - -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 - + theta_dot * dt - + theta - + 3 / (m * length**2) * original_action * dt**2, - theta_dot - - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt - + 3 / (m * length**2) * original_action * dt, - ], - ) - - return np.squeeze(f) - - def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: - """Update the Gaussian Process (GP) dynamics model based on observed states and actions. - - Args: - obs (np.ndarray): Observed states. - act (np.ndarray): Actions taken. - """ - obs = obs.detach().cpu().squeeze().numpy() - act = act.detach().cpu().squeeze().numpy() - N = self.observation_size - X = obs - U = act - L = len(X) - err = np.zeros((L - 1, N - 1)) - S = np.zeros((L - 1, 2)) - for i in range(L - 1): - f = self.get_dynamics(X[i], U[i]) - theta_p = np.arctan2(X[i][1], X[i][0]) - theta_dot_p = X[i][2] - theta = np.arctan2(X[i + 1][1], X[i + 1][0]) - theta_dot = X[i + 1][2] - S[i, :] = np.array([theta_p, theta_dot_p]) - err[i, :] = np.array([theta, theta_dot]) - f - self.gp_model[0].fit(S, err[:, 0]) - self.gp_model[1].fit(S, err[:, 1]) - - def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: - """Retrieve the GP dynamics based on the current observation. - - Args: - obs (torch.Tensor): Current state observation. - - Returns: - list[np.ndarray]: list containing the gp dynamics [f, g, x, std]. 
- """ - obs = obs.cpu().detach().numpy() - u_rl = 0 - dt = 0.05 - G = 10 - m = 1 - length = 1 - obs = np.squeeze(obs) - theta = np.arctan2(obs[1], obs[0]) - theta_dot = obs[2] - x = np.array([theta, theta_dot]) - f_nom = np.array( - [ - -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 - + theta_dot * dt - + theta - + 3 / (m * length**2) * u_rl * dt**2, - theta_dot - - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt - + 3 / (m * length**2) * u_rl * dt, - ], - ) - g = np.array([3 / (m * length**2) * dt**2, 3 / (m * length**2) * dt]) - f_nom = np.squeeze(f_nom) - f = np.zeros(2) - if use_prev_model: - [m1, std1] = self.gp_model_prev[0].predict(x.reshape(1, -1), return_std=True) - [m2, std2] = self.gp_model_prev[1].predict(x.reshape(1, -1), return_std=True) - else: - [m1, std1] = self.gp_model[0].predict(x.reshape(1, -1), return_std=True) - [m2, std2] = self.gp_model[1].predict(x.reshape(1, -1), return_std=True) - f[0] = f_nom[0] + m1 - f[1] = f_nom[1] + m2 - return [ - np.squeeze(f), - np.squeeze(g), - np.squeeze(x), - np.array([np.squeeze(std1), np.squeeze(std2)]), - ] - - def reset_gp_model(self) -> None: - """Reset the gaussian process model of barrier function solver.""" - self.gp_model_prev = self.gp_model.copy() - self.build_gp_model() diff --git a/omnisafe/common/gp_model.py b/omnisafe/common/gp_model.py new file mode 100644 index 000000000..771b29731 --- /dev/null +++ b/omnisafe/common/gp_model.py @@ -0,0 +1,218 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Control Barrier Function Solver.""" + +# pylint: disable=invalid-name,wrong-spelling-in-docstring +# mypy: ignore-errors + + +from __future__ import annotations + +import joblib +import numpy as np +import torch +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF +from sklearn.gaussian_process.kernels import ConstantKernel as C + + +# pylint: disable-next=too-many-instance-attributes +class DynamicsModel: + """This class handles the creation and management of Gaussian Process (GP) models. + + These GP models predict the next state of the environment based on the current state. + + .. warning:: + This class provides an implementation for the ``Pendulum-v1`` environment. It needs to be + customized to extend it to more environments. + + Args: + observation_size (int): The size of the observation space. This determines + the number of GP models to create. + load_dir (Optional[str]): The directory to load the GP models from. If None, new models + are initialized. Default is None. + + Attributes: + observation_size (int): The size of the observation space. + gp_model_prev (List[GaussianProcessRegressor]): The GP models from the previous iteration. + gp_model (List[GaussianProcessRegressor]): The current GP models used for predictions. + """ + + def __init__(self, observation_size: int, load_dir: str | None = None) -> None: + """Initialize the DynamicsModel with a specified observation size and optional model loading. + + Args: + observation_size (int): Size of the observation space. + load_dir (Optional[str]): Directory to load the GP models from. If not provided, + new models will be created. 
+ """ + self.observation_size: int = observation_size + self.gp_model_prev: list[GaussianProcessRegressor] + self.gp_model: list[GaussianProcessRegressor] + self._build_gp_model(load_dir=load_dir) + + def _build_gp_model(self, load_dir: str | None = None) -> None: + """Build or load the Gaussian Process models. + + If a load directory is provided, the models are loaded from the specified directory. + Otherwise, new models are created with default parameters. + + Args: + load_dir (Optional[str]): Directory to load the GP models from. If None, new models + will be created. + """ + gp_list = [] + noise = 0.01 # Small noise term to stabilize the GP model + for _ in range(self.observation_size - 1): + if not load_dir: + # Define the kernel as a product of a constant kernel and an RBF kernel + kern = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) + # Initialize the GaussianProcessRegressor with the specified kernel and noise + gp = GaussianProcessRegressor(kernel=kern, alpha=noise, n_restarts_optimizer=10) + gp_list.append(gp) + else: + # Load the GP models from the specified directory + gp_list = joblib.load(load_dir) + self.gp_model = gp_list + self.gp_model_prev = gp_list.copy() + + @property + def gp_models(self) -> list[GaussianProcessRegressor]: + """Return all gaussian process regressor for saving.""" + return self.gp_model + + def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: + """Calculate the dynamics of the system based on the current observation and the original action. + + This method computes the next state of a pendulum system using the provided state and + action. The equations of motion for the pendulum are discretized using the Euler method. + + Args: + obs (list[float]): The current observation of the system state. + For the ``Pendulum-v1``, It should contain at least three elements: + [x, y, theta_dot], where x and y are the Cartesian coordinates of + the pendulum, and theta_dot is the angular velocity. 
+ original_action (float): The original action proposed by the RL agent. + + Returns: + np.ndarray: The calculated dynamics of the system, representing the next state. + """ + # Time step + dt = 0.05 + # Gravitational constant + G = 10 + # Mass of the pendulum + m = 2 + # Length of the pendulum + length = 2 + + # Calculate the angle theta from the Cartesian coordinates + theta = np.arctan2(obs[1], obs[0]) + # Angular velocity + theta_dot = obs[2] + + f = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * original_action * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * original_action * dt, + ], + ) + + return np.squeeze(f) + + def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: + """Update the Gaussian Process (GP) dynamics model based on observed states and actions. + + Args: + obs (np.ndarray): Agent's observation of the current environment. + act (np.ndarray): Actions taken. + """ + obs = obs.detach().cpu().squeeze().numpy() + act = act.detach().cpu().squeeze().numpy() + N = self.observation_size + X = obs + U = act + L = len(X) + err = np.zeros((L - 1, N - 1)) + S = np.zeros((L - 1, 2)) + for i in range(L - 1): + f = self.get_dynamics(X[i], U[i]) + theta_p = np.arctan2(X[i][1], X[i][0]) + theta_dot_p = X[i][2] + theta = np.arctan2(X[i + 1][1], X[i + 1][0]) + theta_dot = X[i + 1][2] + S[i, :] = np.array([theta_p, theta_dot_p]) + err[i, :] = np.array([theta, theta_dot]) - f + self.gp_model[0].fit(S, err[:, 0]) + self.gp_model[1].fit(S, err[:, 1]) + + def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: + """Retrieve the GP dynamics based on the current observation. + + Args: + obs (torch.Tensor): Agent's observation of the current environment. + use_prev_model (bool): Whether to use previous gaussian model. 
+ + Returns: + list[np.ndarray]: list containing the gp dynamics [f, g, x, std]. + """ + obs = obs.cpu().detach().numpy() + u_rl = 0 + dt = 0.05 + G = 10 + m = 1 + length = 1 + obs = np.squeeze(obs) + theta = np.arctan2(obs[1], obs[0]) + theta_dot = obs[2] + x = np.array([theta, theta_dot]) + f_nom = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * u_rl * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * u_rl * dt, + ], + ) + g = np.array([3 / (m * length**2) * dt**2, 3 / (m * length**2) * dt]) + f_nom = np.squeeze(f_nom) + f = np.zeros(2) + if use_prev_model: + [m1, std1] = self.gp_model_prev[0].predict(x.reshape(1, -1), return_std=True) + [m2, std2] = self.gp_model_prev[1].predict(x.reshape(1, -1), return_std=True) + else: + [m1, std1] = self.gp_model[0].predict(x.reshape(1, -1), return_std=True) + [m2, std2] = self.gp_model[1].predict(x.reshape(1, -1), return_std=True) + f[0] = f_nom[0] + m1 + f[1] = f_nom[1] + m2 + return [ + np.squeeze(f), + np.squeeze(g), + np.squeeze(x), + np.array([np.squeeze(std1), np.squeeze(std2)]), + ] + + def reset_gp_model(self) -> None: + """Reset the gaussian process model of barrier function solver.""" + self.gp_model_prev = self.gp_model.copy() + self._build_gp_model() diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py index 5a305140d..9361c833b 100644 --- a/omnisafe/common/robust_gp_model.py +++ b/omnisafe/common/robust_gp_model.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -393,20 +393,20 @@ def predict_disturbance(self, test_x: torch.Tensor) -> tuple[torch.Tensor, torch return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) - def load_disturbance_models(self, save_dir: str, epoch: str) -> None: + def load_disturbance_models(self, load_dir: str, epoch: str) -> None: """Load the disturbance models and their training data. Args: - save_dir (str): The directory where the model files are saved. + load_dir (str): The directory where the model files are saved. epoch (str): The epoch identifier used in the filenames to load the specific model checkpoint. """ self._disturb_estimators = [] weights = torch.load( - os.path.join(save_dir, f'gp_models_{epoch}.pkl'), + os.path.join(load_dir, f'gp_models_{epoch}.pkl'), map_location=self.device, ) - self._train_x = torch.load(os.path.join(save_dir, f'gp_models_train_x_{epoch}.pkl')) - self._train_y = torch.load(os.path.join(save_dir, f'gp_models_train_y_{epoch}.pkl')) + self._train_x = torch.load(os.path.join(load_dir, f'gp_models_train_x_{epoch}.pkl')) + self._train_y = torch.load(os.path.join(load_dir, f'gp_models_train_y_{epoch}.pkl')) for i in range(self.n_s): self._disturb_estimators.append( GPyDisturbanceEstimator( diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index 691d6aa86..088c8b4af 100644 --- a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -51,8 +51,9 @@ from omnisafe.common.control_barrier_function.crabs.optimizers import Barrier from omnisafe.common.control_barrier_function.crabs.utils import Normalizer as CRABSNormalizer from omnisafe.common.control_barrier_function.crabs.utils import create_model_and_trainer +from omnisafe.common.gp_model import DynamicsModel from omnisafe.common.robust_barrier_solver import CBFQPLayer -from omnisafe.common.robust_gp_model import DynamicsModel +from omnisafe.common.robust_gp_model import DynamicsModel as RobustDynamicsModel from omnisafe.envs.core import CMDP, make from omnisafe.envs.wrapper import ActionRepeat, ActionScale, ObsNormalize, TimeLimit from omnisafe.models.actor import ActorBuilder @@ -100,7 +101,7 @@ def __init__( self._safety_obs = torch.ones(1) self._cost_count = torch.zeros(1) self.__set_render_mode(render_mode) - self._dynamics_model: DynamicsModel | None = None + self._dynamics_model: DynamicsModel | RobustDynamicsModel | None = None self._solver: PendulumSolver | CBFQPLayer | None = None self._compensator = None @@ -311,6 +312,18 @@ def __load_model_and_env( self._actor = actor_builder.build_actor(actor_type) self._actor.load_state_dict(model_params['pi']) if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + epoch = model_name.split('.pt')[0].split('-')[-1] + self._solver = PendulumSolver(action_size=self._env.action_space.shape[0]) + path = os.path.join( + save_dir, + 'gp_model_save', + f'gaussian_process_regressor_{epoch}.pkl', + ) + self._dynamics_model = DynamicsModel( + observation_size=observation_space.shape[0], + load_dir=path, + ) + self._compensator = BarrierCompensator( obs_dim=observation_space.shape[0], act_dim=action_space.shape[0], @@ -332,9 +345,9 @@ def __load_model_and_env( gamma_b=self._cfgs['cbf_cfgs']['gamma_b'], l_p=self._cfgs['cbf_cfgs']['l_p'], ) - self._dynamics_model = DynamicsModel(env=self._env) + self._dynamics_model = RobustDynamicsModel(env=self._env)
self._dynamics_model.load_disturbance_models( - save_dir=os.path.join(self._save_dir, 'gp_model_save'), + load_dir=os.path.join(self._save_dir, 'gp_model_save'), epoch=epoch, ) @@ -413,22 +426,11 @@ def load_saved( # load the config self._save_dir = save_dir self._model_name = model_name - epoch = model_name.split('.pt')[0].split('-')[-1] self.__load_cfgs(save_dir) self.__set_render_mode(render_mode) - if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': - - self._solver = PendulumSolver() - path = os.path.join( - save_dir, - 'gp_model_save', - f'gaussian_process_regressor_{epoch}.pkl', - ) - self._solver.build_gp_model(save_dir=path) - env_kwargs = { 'env_id': self._cfgs['env_id'], 'num_envs': 1, @@ -443,7 +445,7 @@ def load_saved( self.__load_model_and_env(save_dir, model_name, env_kwargs) - # pylint: disable-next=too-many-locals + # pylint: disable-next=too-many-locals,too-many-branches def evaluate( self, num_episodes: int = 10, @@ -503,13 +505,13 @@ def evaluate( if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': approx_compensating_act = self._compensator(obs=obs) compensated_act_mean_raw = act + approx_compensating_act - [f, g, x, std] = self._solver.get_gp_dynamics(obs, use_prev_model=False) + [f, g, x, std] = self._dynamics_model.get_gp_dynamics(obs, use_prev_model=False) compensating_act = self._solver.control_barrier( - compensated_act_mean_raw, - f, - g, - x, - std, + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, ) act = compensated_act_mean_raw + compensating_act @@ -532,7 +534,12 @@ def evaluate( self._safety_obs /= self._cfgs.algo_cfgs.saute_gamma ep_ret += rew.item() - ep_cost += (cost_criteria**length) * cost.item() + + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + ep_cost = ep_cost if ep_cost > cost.item() else cost.item() + else: + ep_cost += (cost_criteria**length) * cost.item() + if ( 'EarlyTerminated' in self._cfgs['algo'] and ep_cost >= 
self._cfgs.algo_cfgs.cost_limit @@ -647,13 +654,16 @@ def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branc if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': approx_compensating_act = self._compensator(obs=obs) compensated_act_mean_raw = act + approx_compensating_act - [f, g, x, std] = self._solver.get_gp_dynamics(obs, use_prev_model=False) + [f, g, x, std] = self._dynamics_model.get_gp_dynamics( + obs, + use_prev_model=False, + ) compensating_act = self._solver.control_barrier( - compensated_act_mean_raw, - f, - g, - x, - std, + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, ) act = compensated_act_mean_raw + compensating_act @@ -688,7 +698,10 @@ def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branc step += 1 done = bool(terminated or truncated) ep_ret += rew.item() - ep_cost += (cost_criteria**length) * cost.item() + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + ep_cost = ep_cost if ep_cost > cost.item() else cost.item() + else: + ep_cost += (cost_criteria**length) * cost.item() if ( 'EarlyTerminated' in self._cfgs['algo'] and ep_cost >= self._cfgs.algo_cfgs.cost_limit diff --git a/pyproject.toml b/pyproject.toml index 350414746..5b4a33e95 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,7 +130,6 @@ ignore-words = "docs/source/spelling_wordlist.txt" # Sync with requires-python target-version = "py38" line-length = 100 -show-source = true src = ["omnisafe", "tests", "examples"] select = [ "E", "W", # pycodestyle From 436dbddc0978d97920769ad3116381f1f1a1ed8d Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Thu, 4 Jul 2024 19:39:39 +0800 Subject: [PATCH 16/18] docs(cbf): update CBF methods docs --- docs/source/index.rst | 1 + docs/source/saferl/cbf.rst | 136 +++++++++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 docs/source/saferl/cbf.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 
792f62052..402ed6203 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -394,6 +394,7 @@ this project, don't hesitate to ask your question on `the GitHub issue page `_ as an example. + +The CBF method implementation in OmniSafe revolves around the ``Adapter``, which decouples and integrates the two core components: ``dynamics model`` and ``solver``. The former predicts the dynamic changes of the environment, while the latter maps the current action to a safe space based on the given environment dynamics. + +CBF Adapter +----------- + +.. currentmodule:: omnisafe.adapter + +.. card:: + :class-header: sd-bg-success sd-text-white + :class-card: sd-outline-success sd-rounded-1 + + Documentation + ^^^ + + .. autoclass:: OffPolicyBarrierFunctionAdapter + :members: + +Core Components +--------------- + +Dynamics Model +"""""""""""""" + +The environmental dynamic model of the CBF method needs to be designed for a specific environment. For example, in the case of the ``Pendulum-v1`` environment, the environmental dynamics will be calculated together with variables such as mass and gravitational acceleration. + +.. code-block:: python + :linenos: + + def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: + dt = 0.05 + # gravitational constant + G = 10 + # mass + m = 2 + # length + length = 2 + # calculate the angle + theta = np.arctan2(obs[1], obs[0]) + # angular velocity + theta_dot = obs[2] + # dynamics equations + f = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * original_action * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * original_action * dt, + ], + ) + return np.squeeze(f) + +The current mainstream implementation often uses a combination of several Gaussian Process (GP) models to fit the environmental dynamics. The specific code documentation is as follows: + +.. 
currentmodule:: omnisafe.common + +.. card:: + :class-header: sd-bg-success sd-text-white + :class-card: sd-outline-success sd-rounded-1 + + Documentation + ^^^ + + .. autoclass:: DynamicsModel + :members: + :private-members: + +The ``solver`` is responsible for taking the feedback information from the ``dynamics model`` and mapping the often unsafe actions generated by the agent into a safe one. + +CBF Solver +"""""""""" + +.. currentmodule:: omnisafe.common + +.. card:: + :class-header: sd-bg-success sd-text-white + :class-card: sd-outline-success sd-rounded-1 + + Documentation + ^^^ + + .. autoclass:: PendulumSolver + :members: + :private-members: + +Architecture of methods +""""""""""""""""""""""" + +- ``DDPGCBF.learn()`` + + - ``DDPGCBF._env.rollout()`` + + - ``DDPGCBF._env.get_safe_action()`` + + - ``DDPGCBF._env.dynamics_model.get_gp_dynamics()`` + - ``DDPGCBF._env.solver.control_barrier()`` + + - ``DDPGCBF._env.dynamics_model.update_gp_dynamics()`` + + - ``DDPGCBF._update()`` + + +Further Discussion +"""""""""""""""""" + +For details on the implementation, performance, reproducible scripts, and related discussions of algorithms including DDPGCBF, please refer to: https://github.com/PKU-Alignment/omnisafe/pull/323 + + +References +---------- + +- `End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks `__ +- `Safe Reinforcement Learning Using Robust Control Barrier Functions `__ +- `Learning Barrier Certificates: Towards Safe Reinforcement Learning with Zero Training-time Violations `__ From f56875b48782a32fe92729b93707bc3b6284ea0a Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Thu, 4 Jul 2024 19:58:30 +0800 Subject: [PATCH 17/18] style: polish code style --- conftest.py | 4 +--- omnisafe/version.py | 6 +++--- pyproject.toml | 14 +++++++------- tests/test_buffer.py | 2 +- 4 files changed, 12 insertions(+), 14 deletions(-) diff --git a/conftest.py b/conftest.py index f3a1e8b06..266ac7a7e 100644 --- 
a/conftest.py +++ b/conftest.py @@ -10,6 +10,4 @@ def pytest_ignore_collect(path, config): - if os.path.basename(path) == 'meta_drive_env.py' and not meta_drive_env_available: - return True - return False + return os.path.basename(path) == 'meta_drive_env.py' and not meta_drive_env_available diff --git a/omnisafe/version.py b/omnisafe/version.py index 0295dccbf..bb545ba26 100644 --- a/omnisafe/version.py +++ b/omnisafe/version.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,8 +25,8 @@ try: prefix, sep, suffix = ( - subprocess.check_output( - ['git', 'describe', '--abbrev=7'], # noqa: S603,S607 + subprocess.check_output( # noqa: S603 + ['git', 'describe', '--abbrev=7'], # noqa: S607 cwd=os.path.dirname(os.path.abspath(__file__)), stderr=subprocess.DEVNULL, text=True, diff --git a/pyproject.toml b/pyproject.toml index 5b4a33e95..d7351aeb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -131,7 +131,7 @@ ignore-words = "docs/source/spelling_wordlist.txt" target-version = "py38" line-length = 100 src = ["omnisafe", "tests", "examples"] -select = [ +lint.select = [ "E", "W", # pycodestyle "F", # pyflakes "UP", # pyupgrade @@ -152,7 +152,7 @@ select = [ "TID", # flake8-tidy-imports "RUF", # ruff ] -ignore = [ +lint.ignore = [ # E501: line too long # W505: doc line too long # too long docstring due to long example blocks @@ -171,9 +171,9 @@ ignore = [ # use alias for import convention (e.g., `import torch.nn as nn`) "PLR0402", ] -typing-modules = ["omnisafe.typing"] +lint.typing-modules = ["omnisafe.typing"] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", # unused-import ] @@ -235,15 +235,15 @@ typing-modules = ["omnisafe.typing"] "ANN003", # Missing type annotation ] -[tool.ruff.flake8-annotations] 
+[tool.ruff.lint.flake8-annotations] allow-star-arg-any = true -[tool.ruff.flake8-quotes] +[tool.ruff.lint.flake8-quotes] docstring-quotes = "double" multiline-quotes = "double" inline-quotes = "single" -[tool.ruff.flake8-tidy-imports] +[tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" [tool.pytest.ini_options] diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 0fee90a46..b284b9e10 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -79,7 +79,7 @@ def test_vector_onpolicy_buffer( assert ( vector_buffer.standardized_adv_r == standardized_adv_r ), f'vector_buffer.sstandardized_adv_r is {vector_buffer.sstandardized_adv_r}' - assert vector_buffer.buffers is not [], f'vector_buffer.buffers is {vector_buffer.buffers}' + assert vector_buffer.buffers != [], f'vector_buffer.buffers is {vector_buffer.buffers}' # checking the store function obs_dim = obs_space.shape[0] From 34104cdf3b5c9797199d732acb9344ce6b2a563b Mon Sep 17 00:00:00 2001 From: Gaiejj Date: Thu, 4 Jul 2024 20:12:35 +0800 Subject: [PATCH 18/18] style: polish docs style --- docs/source/spelling_wordlist.txt | 1 + omnisafe/common/gp_model.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 46e297388..958277550 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -513,3 +513,4 @@ Vipul Sivaranjani Vijay suttle +regressor diff --git a/omnisafe/common/gp_model.py b/omnisafe/common/gp_model.py index 771b29731..dac93ea13 100644 --- a/omnisafe/common/gp_model.py +++ b/omnisafe/common/gp_model.py @@ -97,7 +97,7 @@ def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: """Calculate the dynamics of the system based on the current observation and the original action. This method computes the next state of a pendulum system using the provided state and - action. 
The equations of motion for the pendulum are discretized using the Euler method. + action. Args: obs (list[float]): The current observation of the system state.