diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 99b01f43f..2f04378f6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: - id: debug-statements - id: double-quote-string-fixer - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.4.2 + rev: v0.5.0 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] @@ -54,7 +54,7 @@ repos: - id: pyupgrade args: [--py38-plus] # sync with requires-python - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 + rev: 7.1.0 hooks: - id: flake8 additional_dependencies: @@ -114,6 +114,7 @@ repos: ^tests/| ^setup.py$| ^omnisafe/envs/classic_control/envs_from_crabs.py$| + ^omnisafe/envs/classic_control/envs_from_rcbf.py$| ^omnisafe/common/control_barrier_function/crabs/models.py$| ^omnisafe/common/control_barrier_function/crabs/optimizers.py$| ^omnisafe/common/control_barrier_function/crabs/utils.py$| diff --git a/conftest.py b/conftest.py index f3a1e8b06..266ac7a7e 100644 --- a/conftest.py +++ b/conftest.py @@ -10,6 +10,4 @@ def pytest_ignore_collect(path, config): - if os.path.basename(path) == 'meta_drive_env.py' and not meta_drive_env_available: - return True - return False + return os.path.basename(path) == 'meta_drive_env.py' and not meta_drive_env_available diff --git a/docs/source/index.rst b/docs/source/index.rst index 792f62052..402ed6203 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -394,6 +394,7 @@ this project, don't hesitate to ask your question on `the GitHub issue page `_ as an example. + +The CBF method implementation in OmniSafe revolves around the ``Adapter``, which decouples and integrates the two core components: ``dynamics model`` and ``solver``. The former predicts the dynamic changes of the environment, while the latter maps the current action to a safe space based on the given environment dynamics. + +CBF Adapter +----------- + +.. currentmodule:: omnisafe.adapter + +.. 
card:: + :class-header: sd-bg-success sd-text-white + :class-card: sd-outline-success sd-rounded-1 + + Documentation + ^^^ + + .. autoclass:: OffPolicyBarrierFunctionAdapter + :members: + +Core Components +--------------- + +Dynamics Model +"""""""""""""" + +The environmental dynamic model of the CBF method needs to be designed for a specific environment. For example, in the case of the ``Pendulum-v1`` environment, the environmental dynamics will be calculated together with variables such as mass and gravitational acceleration. + +.. code-block:: python + :linenos: + + def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: + dt = 0.05 + # gravitational constant + G = 10 + # mass + m = 2 + # length + length = 2 + # calculate the angle + theta = np.arctan2(obs[1], obs[0]) + # angular velocity + theta_dot = obs[2] + # dynamics equations + f = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * original_action * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * original_action * dt, + ], + ) + return np.squeeze(f) + +The current mainstream implementation often uses a combination of several Gaussian Process (GP) models to fit the environmental dynamics. The specific code documentation is as follows: + +.. currentmodule:: omnisafe.common + +.. card:: + :class-header: sd-bg-success sd-text-white + :class-card: sd-outline-success sd-rounded-1 + + Documentation + ^^^ + + .. autoclass:: DynamicsModel + :members: + :private-members: + +The ``solver`` is responsible for taking the feedback information from the ``dynamics model`` and mapping the often unsafe actions generated by the agent into a safe one. + +CBF Solver +"""""""""" + +.. currentmodule:: omnisafe.common + +.. card:: + :class-header: sd-bg-success sd-text-white + :class-card: sd-outline-success sd-rounded-1 + + Documentation + ^^^ + + .. 
autoclass:: PendulumSolver + :members: + :private-members: + +Architecture of methods +""""""""""""""""""""""" + +- ``DDPGCBF.learn()`` + + - ``DDPGCBF._env.rollout()`` + + - ``DDPGCBF._env.get_safe_action()`` + + - ``DDPGCBF._env.dynamics_model.get_gp_dynamics()`` + - ``DDPGCBF._env.solver.control_barrier()`` + + - ``DDPGCBF._env.dynamics_model.update_gp_dynamics()`` + + - ``DDPGCBF._update()`` + + +Further Discussion +"""""""""""""""""" + +For details on the implementation, performance, reproducible scripts, and related discussions of algorithms including DDPGCBF, please refer to: https://github.com/PKU-Alignment/omnisafe/pull/323 + + +References +---------- + +- `End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous Control Tasks `__ +- `Safe Reinforcement Learning Using Robust Control Barrier Functions `__ +- `Learning Barrier Certificates: Towards Safe Reinforcement Learning with Zero Training-time Violations `__ diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt index 460cabd1a..958277550 100644 --- a/docs/source/spelling_wordlist.txt +++ b/docs/source/spelling_wordlist.txt @@ -486,3 +486,31 @@ UpdateDynamics mathbb meger Jupyter +compensator +CBF +Vectorize +gp +optim +cvx +QP +gpytorch +ExactGP +RBF +parallelization +compensators +thetadot +VK +Sharma +Kosaraju +Seetharaman +Sadler +Suttle +Cheng +Orosz +JW +Burdick +Vipul +Sivaranjani +Vijay +suttle +regressor diff --git a/examples/plot.py b/examples/plot.py index c16974cce..a425587a7 100644 --- a/examples/plot.py +++ b/examples/plot.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -35,6 +35,27 @@ parser.add_argument('--select', nargs='*') parser.add_argument('--exclude', nargs='*') parser.add_argument('--estimator', default='mean') + parser.add_argument( + '--reward-metrics', + type=str, + choices=[ + 'Metrics/TestEpRet', + 'Metrics/EpRet', + ], + default='Metrics/EpRet', + help='Specify the reward metric to be used.', + ) + parser.add_argument( + '--cost-metrics', + type=str, + choices=[ + 'Metrics/Max_angle_violation', + 'Metrics/TestEpCost', + 'Metrics/EpCost', + ], + default='Metrics/EpCost', + help='Specify the cost metric to be used.', + ) args = parser.parse_args() plotter = Plotter() @@ -48,4 +69,6 @@ select=args.select, exclude=args.exclude, estimator=args.estimator, + cost_metrics=args.cost_metrics, + reward_metrics=args.reward_metrics, ) diff --git a/omnisafe/adapter/__init__.py b/omnisafe/adapter/__init__.py index ba768a7eb..873eccc33 100644 --- a/omnisafe/adapter/__init__.py +++ b/omnisafe/adapter/__init__.py @@ -14,11 +14,15 @@ # ============================================================================== """Adapter for the environment and the algorithm.""" +from omnisafe.adapter.barrier_function_adapter import BarrierFunctionAdapter +from omnisafe.adapter.beta_barrier_function_adapter import BetaBarrierFunctionAdapter from omnisafe.adapter.early_terminated_adapter import EarlyTerminatedAdapter from omnisafe.adapter.modelbased_adapter import ModelBasedAdapter from omnisafe.adapter.offline_adapter import OfflineAdapter from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter +from omnisafe.adapter.offpolicy_barrier_function_adapter import OffPolicyBarrierFunctionAdapter from omnisafe.adapter.online_adapter import OnlineAdapter from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.adapter.robust_barrier_function_adapter import RobustBarrierFunctionAdapter from omnisafe.adapter.saute_adapter import SauteAdapter from omnisafe.adapter.simmer_adapter import SimmerAdapter diff --git 
a/omnisafe/adapter/barrier_function_adapter.py b/omnisafe/adapter/barrier_function_adapter.py new file mode 100644 index 000000000..c247f7705 --- /dev/null +++ b/omnisafe/adapter/barrier_function_adapter.py @@ -0,0 +1,270 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Barrier Function Adapter for OmniSafe.""" + +from __future__ import annotations + +from typing import Any + +import torch +from rich.progress import track +from sklearn.gaussian_process import GaussianProcessRegressor + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.gp_model import DynamicsModel +from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import AutoReset, CostNormalize, RewardNormalize, TimeLimit, Unsqueeze +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils.config import Config + + +class BarrierFunctionAdapter(OnPolicyAdapter): + """Barrier Function Adapter for OmniSafe. + + The Barrier Function Adapter is used to establish the logic of interaction between agents and + the environment based on control barrier functions. 
Its key feature is the introduction of + action compensators and barrier function solvers. + + Args: + env_id (str): The environment id. + num_envs (int): The number of parallel environments. + seed (int): The random seed. + cfgs (Config): The configuration passed from yaml file. + """ + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize an instance of :class:`BarrierFunctionAdapter`.""" + super().__init__(env_id, num_envs, seed, cfgs) + + if env_id == 'Pendulum-v1': + self.solver: PendulumSolver = PendulumSolver( + action_size=self.action_space.shape[0], # type: ignore + device=self._device, + ) + self.dynamics_model: DynamicsModel = DynamicsModel( + observation_size=self.observation_space.shape[0], # type: ignore + ) + else: + raise NotImplementedError(f'Please implement solver for {env_id} !') + self.compensator: BarrierCompensator = BarrierCompensator( + obs_dim=self.observation_space.shape[0], # type: ignore + act_dim=self.action_space.shape[0], # type: ignore + cfgs=cfgs.compensator_cfgs, + ).to(self._device) + self.first_iter: bool = True + + self.episode_rollout: dict[str, Any] = {} + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + self.episode_rollout['approx_compensating_act'] = [] + self.episode_rollout['compensating_act'] = [] + + def _wrapper( + self, + obs_normalize: bool = False, + reward_normalize: bool = True, + cost_normalize: bool = True, + ) -> None: + """Wrapper the environment. + + .. warning:: + Since solving the optimization problem requires obtaining physical quantities with + practical significance from state observations, the Barrier Function Adapter does not + support normalization of observations. + + Args: + obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. + reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True. + cost_normalize (bool, optional): Whether to normalize the cost. 
Defaults to True. + """ + assert not obs_normalize, 'Barrier function does not support observation normalization!' + if self._env.need_time_limit_wrapper: + assert ( + self._env.max_episode_steps + ), 'You must define max_episode_steps as an integer\ + \nor cancel the use of the time_limit wrapper.' + self._env = TimeLimit( + self._env, + time_limit=self._env.max_episode_steps, + device=self._device, + ) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env, device=self._device) + if reward_normalize: + self._env = RewardNormalize(self._env, device=self._device) + if cost_normalize: + self._env = CostNormalize(self._env, device=self._device) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env, device=self._device) + + def reset_gp_model(self) -> None: + """Reset the gaussian processing model of barrier function solver.""" + self.dynamics_model.reset_gp_model() + + def rollout( # pylint: disable=too-many-locals,too-many-branches + self, + steps_per_epoch: int, + agent: ConstraintActorCritic, + buffer: VectorOnPolicyBuffer, + logger: Logger, + ) -> None: + """Rollout the environment with barrier function controller. + + Args: + steps_per_epoch (int): Number of steps per epoch. + agent (ConstraintActorCritic): Constraint actor-critic, including actor , reward critic + and cost critic. + buffer (VectorOnPolicyBuffer): Vector on-policy buffer. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. 
+ """ + self._reset_log() + + obs, _ = self.reset() + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + for step in track( + range(steps_per_epoch), + description=f'Processing rollout for epoch: {logger.current_epoch}...', + ): + with torch.no_grad(): + value_r = agent.reward_critic(obs)[0] + value_c = agent.cost_critic(obs)[0] + act_dist = agent.actor(obs) + act_mean, act_std = act_dist.mean, agent.actor.std + + safe_act = self.get_safe_action( + obs, + act_mean, + act_std, + ) + logp = agent.actor.log_prob(safe_act) + + self.episode_rollout['obs'].append(obs) + self.episode_rollout['final_act'].append(safe_act) + + next_obs, reward, cost, terminated, truncated, info = self.step(safe_act) + self._log_value(reward=reward, cost=cost, info=info) + + logger.store({'Value/reward': value_r}) + + buffer.store( + obs=obs, + act=safe_act, + reward=reward, + cost=cost, + value_r=value_r, + value_c=value_c, + logp=logp, + ) + + obs = next_obs + epoch_end = step >= steps_per_epoch + + if epoch_end: + num_dones = int(terminated.contiguous().sum()) + if self._env.num_envs - num_dones: + logger.log( + f'\nWarning: trajectory cut off when rollout by epoch\ + in {self._env.num_envs - num_dones} of {self._env.num_envs} environments.', + ) + + for idx, (done, time_out) in enumerate(zip(terminated, truncated)): + if epoch_end or done or time_out: + last_value_r = torch.zeros(1) + last_value_c = torch.zeros(1) + if not done: + if epoch_end: + _, last_value_r, last_value_c, _ = agent.step(obs[idx]) + if time_out: + _, last_value_r, last_value_c, _ = agent.step( + obs[idx], + ) + last_value_r = last_value_r.unsqueeze(0) + last_value_c = last_value_c.unsqueeze(0) + + if done or time_out: + self._log_metrics(logger, idx) + compensator_loss = self.compensator.update( + torch.cat(self.episode_rollout['obs']), + torch.cat(self.episode_rollout['approx_compensating_act']), + torch.cat(self.episode_rollout['compensating_act']), + ) + 
logger.store({'Value/Loss_compensator': compensator_loss.item()}) + self.dynamics_model.update_gp_dynamics( + obs=torch.cat(self.episode_rollout['obs']), # type: ignore + act=torch.cat(self.episode_rollout['final_act']), # type: ignore + ) + + self.episode_rollout['obs'] = [] + self.episode_rollout['final_act'] = [] + self.episode_rollout['approx_compensating_act'] = [] + self.episode_rollout['compensating_act'] = [] + + self._reset_log(idx) + obs, _ = self.reset() + buffer.finish_path(last_value_r, last_value_c, idx) + self.first_iter = False + self.reset_gp_model() + + def get_safe_action( + self, + obs: torch.Tensor, + act_mean: torch.Tensor, + act_std: torch.Tensor, + ) -> torch.Tensor: + """Computes a safe action by applying compensatory actions. + + .. note:: + This is the core method of the CBF method. Users can modify this function to implement + customized action mapping. + + Args: + obs (torch.Tensor): The current observation from the environment. + act_mean (torch.Tensor): The mean of proposed action to be controlled for safety. + act_std (torch.Tensor): The standard deviation of proposed action to be controlled for safety. + + Returns: + list(torch.Tensor): The safe actions for interaction and compensating actions for compensator training. 
+ """ + with torch.no_grad(): + approx_compensating_act = self.compensator(obs=obs) + compensated_act_mean_raw = act_mean + approx_compensating_act + + [f, g, x, std] = self.dynamics_model.get_gp_dynamics( + obs, + use_prev_model=not self.first_iter, + ) + compensating_act = self.solver.control_barrier( + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, + ) + + compensated_act_mean = compensated_act_mean_raw + compensating_act + safe_act = torch.normal(compensated_act_mean, act_std) + self.episode_rollout['compensating_act'].append(compensating_act) + self.episode_rollout['approx_compensating_act'].append(approx_compensating_act) + + return safe_act + + @property + def gp_models(self) -> list[GaussianProcessRegressor]: + """Return the gp models to be saved.""" + return self.dynamics_model.gp_models diff --git a/omnisafe/adapter/beta_barrier_function_adapter.py b/omnisafe/adapter/beta_barrier_function_adapter.py new file mode 100644 index 000000000..1ab488d88 --- /dev/null +++ b/omnisafe/adapter/beta_barrier_function_adapter.py @@ -0,0 +1,238 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Barrier Function Adapter with Beta Distribution for OmniSafe.""" + +from __future__ import annotations + +from typing import Callable + +import numpy as np +import torch +from rich.progress import track + +from omnisafe.adapter.onpolicy_adapter import OnPolicyAdapter +from omnisafe.common.buffer import VectorOnPolicyBuffer +from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import AutoReset, CostNormalize, RewardNormalize, TimeLimit, Unsqueeze +from omnisafe.models.actor_critic.constraint_actor_critic import ConstraintActorCritic +from omnisafe.utils.config import Config + + +# pylint: disable-next=too-many-locals +def cbf(state: np.ndarray, eta: float = 0.99) -> tuple[np.ndarray, np.ndarray]: + """Calculates the Control Barrier Function (CBF) constraints. + + Args: + state (np.ndarray | None): A numpy array containing the pendulum's current angular position + (theta) and angular velocity (thetadot). + eta (float): A scaling factor used to adjust the safety bounds. + + Returns: + tuple containing two elements: 1. The minimum control torque that keeps the pendulum within + the safety bounds. 2. The maximum control torque that keeps the pendulum within the safety + bounds. + + Raises: + ValueError: If the `eta` value is not within the open interval (0, 1). 
+ """ + g = 9.8 + m = 1 + length = 1 + tau = 5e-2 + theta_safety_bounds = [-1.0, 1.0] + torque_bounds = [-15.0, 15.0] + if (eta > 1 - 1e-3) or (eta < 1e-5): + raise ValueError('eta should be inside (0, 1)') + c1 = (3 * g) / (2 * length) + c2 = 3 / (m * (length**2)) + + theta, thetadot = state[0], state[1] + theta_min, theta_max = theta_safety_bounds[0], theta_safety_bounds[1] + thetadot_min, thetadot_max = -np.inf, np.inf + u_min1 = (1 / c2) * ( + ((1 / (tau**2)) * (-eta * (theta - theta_min) - tau * thetadot)) - c1 * np.sin(theta) + ) + u_max1 = (1 / c2) * ( + ((1 / (tau**2)) * (eta * (theta_max - theta) - tau * thetadot)) - c1 * np.sin(theta) + ) + + u_min2 = (1 / c2) * (((1 / (tau)) * (-eta * (thetadot - thetadot_min))) - c1 * np.sin(theta)) + u_max2 = (1 / c2) * (((1 / (tau)) * (eta * (thetadot_max - thetadot))) - c1 * np.sin(theta)) + + u_min = max(u_min1, u_min2, torque_bounds[0]) + u_max = min(u_max1, u_max2, torque_bounds[1]) + + return (u_min, u_max) + + +def vectorize_f(f: Callable) -> Callable: + """Vectorize the function. + + Args: + f (callable): A function that accepts 1D numpy arrays and returns a tuple (lower_bound, upper_bound). + + Returns: + callable: A vectorized function that can process batches of torch tensors and return pairs of torch tensors. + """ + + def vectorized_f_(obs: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Inner function to process the torch tensor batch. + + Args: + obs (torch.Tensor): A batch of observations as torch tensors. + + Returns: + tuple: Two torch tensors representing the lower and upper bounds for each observation in the batch. 
+ """ + device = obs.device + obs = obs.cpu().detach().numpy() + + batch_size = obs.shape[0] + lbs = torch.zeros([batch_size, 1]) + ubs = torch.zeros([batch_size, 1]) + for i in range(batch_size): + lbs[i], ubs[i] = f(obs[i]) + + lbs = torch.FloatTensor(lbs).reshape(batch_size, 1).to(device) + ubs = torch.FloatTensor(ubs).reshape(batch_size, 1).to(device) + + return lbs, ubs + + return vectorized_f_ + + +class BetaBarrierFunctionAdapter(OnPolicyAdapter): + """Barrier Function Adapter with Beta Distribution for OmniSafe. + + Args: + env_id (str): The environment id. + num_envs (int): The number of parallel environments. + seed (int): The random seed. + cfgs (Config): The configuration passed from yaml file. + """ + + def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None: + """Initialize an instance of :class:`BetaBarrierFunctionAdapte`.""" + super().__init__(env_id, num_envs, seed, cfgs) + self.constraint_fn: Callable = vectorize_f(cbf) + + def _wrapper( + self, + obs_normalize: bool = False, + reward_normalize: bool = True, + cost_normalize: bool = True, + ) -> None: + """Wrapper the environment. + + .. warning:: + Since solving the optimization problem requires obtaining physical quantities with + practical significance from state observations, the Beta Barrier Function Adapter does + not support normalization of observations. + + Args: + obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False. + reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True. + cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True. + """ + assert not obs_normalize, 'Barrier function does not support observation normalization!' + if self._env.need_time_limit_wrapper: + assert ( + self._env.max_episode_steps + ), 'You must define max_episode_steps as an integer\ + \nor cancel the use of the time_limit wrapper.' 
+ self._env = TimeLimit( + self._env, + time_limit=self._env.max_episode_steps, + device=self._device, + ) + if self._env.need_auto_reset_wrapper: + self._env = AutoReset(self._env, device=self._device) + if reward_normalize: + self._env = RewardNormalize(self._env, device=self._device) + if cost_normalize: + self._env = CostNormalize(self._env, device=self._device) + if self._env.num_envs == 1: + self._env = Unsqueeze(self._env, device=self._device) + + def rollout( # pylint: disable=too-many-locals + self, + steps_per_epoch: int, + agent: ConstraintActorCritic, + buffer: VectorOnPolicyBuffer, + logger: Logger, + ) -> None: + """Rollout the environment and store the data in the buffer. + + Args: + steps_per_epoch (int): Number of steps per epoch. + agent (ConstraintActorCritic): Constraint actor-critic, including actor , reward critic + and cost critic. + buffer (VectorOnPolicyBuffer): Vector on-policy buffer. + logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``. + """ + self._reset_log() + obs, _ = self.reset() + for step in track( + range(steps_per_epoch), + description=f'Processing rollout for epoch: {logger.current_epoch}...', + ): + with torch.no_grad(): + act, value_r, value_c, logp = agent.step(obs) + lb, ub = self.constraint_fn(obs) + final_act = lb + (ub - lb) * act + + next_obs, reward, cost, terminated, truncated, info = self.step(final_act) + + self._log_value(reward=reward, cost=cost, info=info) + logger.store({'Value/reward': value_r}) + + buffer.store( + obs=obs, + act=act, + reward=reward, + cost=cost, + value_r=value_r, + value_c=value_c, + logp=logp, + ) + + obs = next_obs + epoch_end = step >= steps_per_epoch + for idx, (done, time_out) in enumerate(zip(terminated, truncated)): + if epoch_end or done or time_out: + last_value_r = torch.zeros(1) + last_value_c = torch.zeros(1) + if not done: + if epoch_end: + logger.log( + f'Warning: trajectory cut off when rollout by epoch at {self._ep_len[idx]} steps.', + ) + _, last_value_r, 
last_value_c, _ = agent.step(obs[idx]) + if time_out: + _, last_value_r, last_value_c, _ = agent.step( + obs[idx], + ) + last_value_r = last_value_r.unsqueeze(0) + last_value_c = last_value_c.unsqueeze(0) + + if done or time_out: + self._log_metrics(logger, idx) + self._reset_log(idx) + + self._ep_ret[idx] = 0.0 + self._ep_cost[idx] = 0.0 + self._ep_len[idx] = 0.0 + obs, _ = self.reset() + buffer.finish_path(last_value_r, last_value_c, idx) diff --git a/omnisafe/adapter/modelbased_adapter.py b/omnisafe/adapter/modelbased_adapter.py index 8abbd90d7..6e2154531 100644 --- a/omnisafe/adapter/modelbased_adapter.py +++ b/omnisafe/adapter/modelbased_adapter.py @@ -269,8 +269,8 @@ def rollout( # pylint: disable=too-many-arguments,too-many-locals update_actor_critic_time = 0.0 update_dynamics_time = 0.0 - if use_eval: - eval_time = 0.0 + + eval_time = 0.0 epoch_steps = 0 diff --git a/omnisafe/adapter/offpolicy_barrier_function_adapter.py b/omnisafe/adapter/offpolicy_barrier_function_adapter.py new file mode 100644 index 000000000..20b4abdb8 --- /dev/null +++ b/omnisafe/adapter/offpolicy_barrier_function_adapter.py @@ -0,0 +1,256 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""OffPolicy Barrier Function Adapter for OmniSafe.""" + +from __future__ import annotations + +from typing import Any + +import torch +from sklearn.gaussian_process import GaussianProcessRegressor + +from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.buffer import VectorOffPolicyBuffer +from omnisafe.common.gp_model import DynamicsModel +from omnisafe.common.logger import Logger +from omnisafe.envs.wrapper import CostNormalize, RewardNormalize, Unsqueeze +from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic +from omnisafe.utils.config import Config + + +class OffPolicyBarrierFunctionAdapter(OffPolicyAdapter): + """OffPolicy Barrier Function Adapter for OmniSafe. + + :class:`OffPolicyBarrierFunctionAdapter` is used to adapt the environment with a CBF controller, + mapping the agent actions from unsafe ones to safe ones. + + Args: + env_id (str): The environment id. + num_envs (int): The number of environments. + seed (int): The random seed. + cfgs (Config): The configuration. + + Attributes: + solver (PendulumSolver): The solver used for the environment, currently supporting + ``Pendulum-v1``. + dynamics_model (DynamicsModel): The dynamics model used to predict the environment's behavior. + compensator (BarrierCompensator): The compensator used to approximate previous actions. + first_iter (bool): A flag indicating if it is the first iteration. + episode_rollout (dict[str, Any]): A dictionary to store the episode rollout information, + including observations and various actions, + useful for updating compensator. 
+    """
+
+    def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None:
+        """Initialize an instance of :class:`OffPolicyBarrierFunctionAdapter`."""
+        super().__init__(env_id, num_envs, seed, cfgs)
+
+        if env_id == 'Pendulum-v1':
+            self.solver: PendulumSolver = PendulumSolver(
+                action_size=self.action_space.shape[0],  # type: ignore
+                device=self._device,
+            )
+            self.dynamics_model: DynamicsModel = DynamicsModel(
+                observation_size=self.observation_space.shape[0],  # type: ignore
+            )
+        else:
+            raise NotImplementedError(f'Please implement solver for {env_id} !')
+        self.compensator: BarrierCompensator = BarrierCompensator(
+            obs_dim=self.observation_space.shape[0],  # type: ignore
+            act_dim=self.action_space.shape[0],  # type: ignore
+            cfgs=cfgs.compensator_cfgs,
+        ).to(self._device)
+
+        self.first_iter: bool = True
+        self.episode_rollout: dict[str, Any] = {}
+        self.episode_rollout['obs'] = []
+        self.episode_rollout['final_act'] = []
+        self.episode_rollout['approx_compensating_act'] = []
+        self.episode_rollout['compensating_act'] = []
+
+    def _wrapper(
+        self,
+        obs_normalize: bool = False,
+        reward_normalize: bool = True,
+        cost_normalize: bool = True,
+    ) -> None:
+        assert not obs_normalize, 'Barrier function does not support observation normalization!'
+        if reward_normalize:
+            self._env = RewardNormalize(self._env, device=self._device)
+        if cost_normalize:
+            self._env = CostNormalize(self._env, device=self._device)
+        if self._env.num_envs == 1:
+            self._env = Unsqueeze(self._env, device=self._device)
+
+    def eval_policy(  # pylint: disable=too-many-locals
+        self,
+        episode: int,
+        agent: ConstraintActorQCritic,
+        logger: Logger,
+    ) -> None:
+        """Rollout the environment in an evaluation environment.
+
+        Args:
+            episode (int): Number of episodes.
+            agent (ConstraintActorQCritic): Agent.
+            logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
+        """
+        assert self._eval_env
+        for _ in range(episode):
+            ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
+            obs, _ = self._eval_env.reset()
+            obs = obs.to(self._device)
+
+            done = False
+            while not done:
+                act = agent.step(obs, deterministic=True)
+                final_act = self.get_safe_action(obs=obs, act=act, is_eval=True)
+                obs, reward, cost, terminated, truncated, info = self._eval_env.step(final_act)
+                obs, reward, cost, terminated, truncated = (
+                    torch.as_tensor(x, dtype=torch.float32, device=self._device)
+                    for x in (obs, reward, cost, terminated, truncated)
+                )
+                ep_ret += info.get('original_reward', reward).cpu()
+                ep_cost += info.get('original_cost', cost).cpu()
+                ep_len += 1
+                done = bool(terminated[0].item()) or bool(truncated[0].item())
+
+            logger.store(
+                {
+                    'Metrics/TestEpRet': ep_ret,
+                    'Metrics/TestEpCost': ep_cost,
+                    'Metrics/TestEpLen': ep_len,
+                },
+            )
+
+    def reset_gp_model(self) -> None:
+        """Reset the gaussian processing model of barrier function solver."""
+        self.dynamics_model.reset_gp_model()
+
+    def rollout(  # pylint: disable=too-many-locals
+        self,
+        rollout_step: int,
+        agent: ConstraintActorQCritic,
+        buffer: VectorOffPolicyBuffer,
+        logger: Logger,
+        use_rand_action: bool,
+    ) -> None:
+        """Rollout in off-policy manner with the ``dynamics_model``, ``solver`` and ``compensator``.
+
+        Args:
+            rollout_step (int): Number of rollout steps.
+            agent (ConstraintActorQCritic): Constraint actor-critic, including actor, reward critic,
+                and cost critic.
+            buffer (VectorOffPolicyBuffer): Vector off-policy buffer.
+            logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
+            use_rand_action (bool): Whether to use random action.
+        """
+        for _ in range(rollout_step):
+            if use_rand_action:
+                act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device)  # type: ignore
+            else:
+                act = agent.actor.predict(self._current_obs, deterministic=False)
+
+            final_act = self.get_safe_action(self._current_obs, act)
+
+            self.episode_rollout['obs'].append(self._current_obs)
+            self.episode_rollout['final_act'].append(final_act)
+
+            next_obs, reward, cost, terminated, truncated, info = self.step(final_act)
+            self._log_value(reward=reward, cost=cost, info=info)
+
+            buffer.store(
+                obs=self._current_obs,
+                act=act,
+                reward=reward,
+                cost=cost,
+                done=torch.logical_and(terminated, torch.logical_xor(terminated, truncated)),
+                next_obs=next_obs,
+            )
+
+            self._current_obs = next_obs
+            for idx, done in enumerate(torch.logical_or(terminated, truncated)):
+                if done:
+                    self._log_metrics(logger, idx)
+                    compensator_loss = self.compensator.update(
+                        torch.cat(self.episode_rollout['obs']),
+                        torch.cat(self.episode_rollout['approx_compensating_act']),
+                        torch.cat(self.episode_rollout['compensating_act']),
+                    )
+                    logger.store({'Value/Loss_compensator': compensator_loss.item()})
+                    self.dynamics_model.update_gp_dynamics(
+                        obs=torch.cat(self.episode_rollout['obs']),  # type: ignore
+                        act=torch.cat(self.episode_rollout['final_act']),  # type: ignore
+                    )
+
+                    self.episode_rollout['obs'] = []
+                    self.episode_rollout['final_act'] = []
+                    self.episode_rollout['approx_compensating_act'] = []
+                    self.episode_rollout['compensating_act'] = []
+
+                    self._reset_log(idx)
+                    self._current_obs, _ = self._env.reset()
+                    self.first_iter = False
+                    self.reset_gp_model()
+
+    def get_safe_action(
+        self,
+        obs: torch.Tensor,
+        act: torch.Tensor,
+        is_eval: bool = False,
+    ) -> torch.Tensor:
+        """Computes a safe action by applying compensatory actions.
+
+        .. note::
+            This is the core method of the CBF method. Users can modify this function to implement
+            customized action mapping.
+
+        Args:
+            obs (torch.Tensor): The current observation from the environment.
+            act (torch.Tensor): The proposed action to be controlled for safety.
+            is_eval (bool, optional): A flag to indicate whether this is an evaluation phase, defaulting to False.
+
+        Returns:
+            torch.Tensor: The safe action to be executed in the environment.
+        """
+        with torch.no_grad():
+            approx_compensating_act = self.compensator(obs=obs)
+            compensated_act_mean_raw = act + approx_compensating_act
+
+            [f, g, x, std] = self.dynamics_model.get_gp_dynamics(
+                obs,
+                use_prev_model=not self.first_iter,
+            )
+            compensating_act = self.solver.control_barrier(
+                original_action=compensated_act_mean_raw,
+                f=f,
+                g=g,
+                x=x,
+                std=std,
+            )
+            safe_act = compensated_act_mean_raw + compensating_act
+
+            if not is_eval:
+                self.episode_rollout['compensating_act'].append(compensating_act)
+                self.episode_rollout['approx_compensating_act'].append(approx_compensating_act)
+
+        return safe_act
+
+    @property
+    def gp_models(self) -> list[GaussianProcessRegressor]:
+        """Return the gp models to be saved."""
+        return self.dynamics_model.gp_models
diff --git a/omnisafe/adapter/robust_barrier_function_adapter.py b/omnisafe/adapter/robust_barrier_function_adapter.py
new file mode 100644
index 000000000..cc5a22e02
--- /dev/null
+++ b/omnisafe/adapter/robust_barrier_function_adapter.py
@@ -0,0 +1,222 @@
+# Copyright 2024 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Robust Barrier Function Adapter for OmniSafe."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+
+from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter
+from omnisafe.common.buffer import VectorOffPolicyBuffer
+from omnisafe.common.logger import Logger
+from omnisafe.common.robust_barrier_solver import CBFQPLayer
+from omnisafe.common.robust_gp_model import DynamicsModel
+from omnisafe.envs.wrapper import CostNormalize, RewardNormalize, Unsqueeze
+from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic
+from omnisafe.typing import OmnisafeSpace
+from omnisafe.utils.config import Config
+
+
+class RobustBarrierFunctionAdapter(OffPolicyAdapter):
+    """Robust Barrier Function Adapter for OmniSafe.
+
+    :class:`RobustBarrierFunctionAdapter` is used to adapt the environment with RCBF controller.
+
+    Args:
+        env_id (str): The environment id.
+        num_envs (int): The number of environments.
+        seed (int): The random seed.
+        cfgs (Config): The configuration.
+    """
+
+    def __init__(self, env_id: str, num_envs: int, seed: int, cfgs: Config) -> None:
+        """Initialize an instance of :class:`RobustBarrierFunctionAdapter`."""
+        super().__init__(env_id, num_envs, seed, cfgs)
+        self.solver: CBFQPLayer
+        self.dynamics_model: DynamicsModel
+        self._current_steps = 0
+        self._num_episodes = 0
+
+    def _wrapper(
+        self,
+        obs_normalize: bool = False,
+        reward_normalize: bool = True,
+        cost_normalize: bool = True,
+    ) -> None:
+        """Wrap the environment.
+
+        .. warning::
+            Since solving the optimization problem requires obtaining physical quantities with
+            practical significance from state observations, the Barrier Function Adapter does not
+            support normalization of observations.
+
+        Args:
+            obs_normalize (bool, optional): Whether to normalize the observation. Defaults to False.
+            reward_normalize (bool, optional): Whether to normalize the reward. Defaults to True.
+            cost_normalize (bool, optional): Whether to normalize the cost. Defaults to True.
+        """
+        assert not obs_normalize, 'Barrier function does not support observation normalization!'
+        if reward_normalize:
+            self._env = RewardNormalize(self._env, device=self._device)
+        if cost_normalize:
+            self._env = CostNormalize(self._env, device=self._device)
+        if self._env.num_envs == 1:
+            self._env = Unsqueeze(self._env, device=self._device)
+
+    def set_solver(self, solver: CBFQPLayer) -> None:
+        """Set the barrier function solver."""
+        self.solver = solver
+        self.solver.env = self._env  # type: ignore
+
+    def set_dynamics_model(self, dynamics_model: DynamicsModel) -> None:
+        """Set the dynamics model."""
+        self.dynamics_model = dynamics_model
+        self.dynamics_model.env = self._env  # type: ignore
+
+    def eval_policy(  # pylint: disable=too-many-locals
+        self,
+        episode: int,
+        agent: ConstraintActorQCritic,
+        logger: Logger,
+    ) -> None:
+        """Rollout the environment with deterministic agent action.
+
+        Args:
+            episode (int): Number of episodes.
+            agent (ConstraintActorQCritic): Agent.
+            logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
+        """
+        assert self._eval_env
+        for _ in range(episode):
+            ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
+            obs, _ = self._eval_env.reset()
+            obs = obs.to(self._device)
+
+            done = False
+            while not done:
+                act = agent.step(obs, deterministic=True)
+                obs, reward, cost, terminated, truncated, info = self._eval_env.step(act)
+                obs, reward, cost, terminated, truncated = (
+                    torch.as_tensor(x, dtype=torch.float32, device=self._device)
+                    for x in (obs, reward, cost, terminated, truncated)
+                )
+                ep_ret += info.get('original_reward', reward).cpu()
+                ep_cost += info.get('original_cost', cost).cpu()
+                ep_len += 1
+                done = bool(terminated[0].item()) or bool(truncated[0].item())
+
+            logger.store(
+                {
+                    'Metrics/TestEpRet': ep_ret,
+                    'Metrics/TestEpCost': ep_cost,
+                    'Metrics/TestEpLen': ep_len,
+                },
+            )
+
+    def rollout(  # pylint: disable=too-many-locals
+        self,
+        rollout_step: int,
+        agent: ConstraintActorQCritic,
+        buffer: VectorOffPolicyBuffer,
+        logger: Logger,
+        use_rand_action: bool,
+    ) -> None:
+        """Rollout the environment and store the data in the buffer.
+
+        .. warning::
+            As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically,
+            so the final observation will be stored in ``info['final_observation']``.
+
+        Args:
+            rollout_step (int): Number of rollout steps.
+            agent (ConstraintActorQCritic): Constraint actor-critic, including actor, reward critic,
+                and cost critic.
+            buffer (VectorOffPolicyBuffer): Vector off-policy buffer.
+            logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
+            use_rand_action (bool): Whether to use random action.
+        """
+        for _ in range(rollout_step):
+            state = self.dynamics_model.get_state(self._current_obs)
+            self._current_steps += 1
+            if use_rand_action:
+                act = (torch.rand(self.action_space.shape) * 2 - 1).unsqueeze(0).to(self._device)  # type: ignore
+            else:
+                act = agent.step(self._current_obs, deterministic=False)
+
+            final_act = self.get_safe_action(obs=self._current_obs, act=act)
+
+            next_obs, reward, cost, terminated, truncated, info = self.step(final_act)
+            self._log_value(reward=reward, cost=cost, info=info)
+
+            buffer.store(
+                obs=self._current_obs,
+                act=final_act,
+                reward=reward,
+                cost=cost,
+                done=torch.logical_and(terminated, torch.logical_xor(terminated, truncated)),
+                next_obs=next_obs,
+            )
+
+            if (
+                self._ep_len[0] % 2 == 0
+                and self._num_episodes < self._cfgs.dynamics_model_cfgs.gp_max_episodes
+            ):
+                next_state = self.dynamics_model.get_state(next_obs)
+                self.dynamics_model.append_transition(
+                    state.cpu().detach().numpy(),
+                    final_act.cpu().detach().numpy(),
+                    next_state.cpu().detach().numpy(),
+                )
+
+            self._current_obs = next_obs
+            for idx, done in enumerate(torch.logical_or(terminated, truncated)):
+                if done:
+                    self._log_metrics(logger, idx)
+                    self._reset_log(idx)
+                    self._num_episodes += 1
+                    self._current_obs, _ = self._env.reset()
+
+    @property
+    def safe_action_space(self) -> OmnisafeSpace:
+        """Return the action space in the safe domain."""
+        if hasattr(self._env, 'safe_action_space'):
+            return self._env.safe_action_space
+        return self._env.action_space
+
+    def get_safe_action(self, obs: torch.Tensor, act: torch.Tensor) -> torch.Tensor:
+        """Computes a safe action by applying robust barrier function.
+
+        Args:
+            obs (torch.Tensor): The current observation from the environment.
+            act (torch.Tensor): The proposed action to be evaluated for safety.
+
+        Returns:
+            torch.Tensor: The safe action to be executed in the environment.
+        """
+        state_batch = self.dynamics_model.get_state(obs)
+        mean_pred_batch, sigma_pred_batch = self.dynamics_model.predict_disturbance(state_batch)
+
+        return self.solver.get_safe_action(
+            state_batch,
+            act,
+            mean_pred_batch,
+            sigma_pred_batch,
+        )
+
+    def __getattr__(self, name: str) -> Any:
+        """Return the unwrapped environment attributes."""
+        return getattr(self._env, name)
diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py
index df6832226..da82ecbea 100644
--- a/omnisafe/algorithms/__init__.py
+++ b/omnisafe/algorithms/__init__.py
@@ -27,9 +27,11 @@
 from omnisafe.algorithms.off_policy import (
     CRABS,
     DDPG,
+    DDPGCBF,
     DDPGPID,
     SAC,
     SACPID,
+    SACRCBF,
     TD3,
     TD3PID,
     DDPGLag,
@@ -51,10 +53,12 @@
     PPO,
     RCPO,
     TRPO,
+    TRPOCBF,
     TRPOPID,
     NaturalPG,
     OnCRPO,
     PolicyGradient,
+    PPOBetaCBF,
     PPOEarlyTerminated,
     PPOLag,
     PPOSaute,
diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py
index 80e48e1a0..1e14ebd26 100644
--- a/omnisafe/algorithms/off_policy/__init__.py
+++ b/omnisafe/algorithms/off_policy/__init__.py
@@ -16,11 +16,13 @@
 from omnisafe.algorithms.off_policy.crabs import CRABS
 from omnisafe.algorithms.off_policy.ddpg import DDPG
+from omnisafe.algorithms.off_policy.ddpg_cbf import DDPGCBF
 from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag
 from omnisafe.algorithms.off_policy.ddpg_pid import DDPGPID
 from omnisafe.algorithms.off_policy.sac import SAC
 from omnisafe.algorithms.off_policy.sac_lag import SACLag
 from omnisafe.algorithms.off_policy.sac_pid import SACPID
+from omnisafe.algorithms.off_policy.sac_rcbf import SACRCBF
 from omnisafe.algorithms.off_policy.td3 import TD3
 from omnisafe.algorithms.off_policy.td3_lag import TD3Lag
 from omnisafe.algorithms.off_policy.td3_pid import TD3PID
 
@@ -36,5 +38,7 @@
     'DDPGPID',
     'TD3PID',
     'SACPID',
+    'SACRCBF',
+    'DDPGCBF',
     'CRABS',
 ]
diff --git a/omnisafe/algorithms/off_policy/ddpg.py b/omnisafe/algorithms/off_policy/ddpg.py
index 517d8c0be..0d698e5f2 100644
--- a/omnisafe/algorithms/off_policy/ddpg.py
+++ b/omnisafe/algorithms/off_policy/ddpg.py
@@ -188,14 +188,9 @@ def _init_log(self) -> None:
             config=self._cfgs,
         )
 
-        what_to_save: dict[str, Any] = {}
-        what_to_save['pi'] = self._actor_critic.actor
-        if self._cfgs.algo_cfgs.obs_normalize:
-            obs_normalizer = self._env.save()['obs_normalizer']
-            what_to_save['obs_normalizer'] = obs_normalizer
-
-        self._logger.setup_torch_saver(what_to_save)
+        self._setup_torch_saver()
         self._logger.torch_save()
+        self._specific_save()
 
         self._logger.register_key(
             'Metrics/EpRet',
@@ -338,6 +333,7 @@ def learn(self) -> tuple[float, float, float]:
             # save model to disk
             if (epoch + 1) % self._cfgs.logger_cfgs.save_model_freq == 0:
                 self._logger.torch_save()
+                self._specific_save()
 
             ep_ret = self._logger.get_stats('Metrics/EpRet')[0]
             ep_cost = self._logger.get_stats('Metrics/EpCost')[0]
@@ -562,3 +558,21 @@ def _log_when_not_update(self) -> None:
                 'Value/cost_critic': 0.0,
             },
         )
+
+    def _setup_torch_saver(self) -> None:
+        """Define what needs to be saved below.
+
+        OmniSafe's main storage interface is based on PyTorch. If you need to save models in other
+        formats, please use :meth:`_specific_save`.
+        """
+        what_to_save: dict[str, Any] = {}
+
+        what_to_save['pi'] = self._actor_critic.actor
+        if self._cfgs.algo_cfgs.obs_normalize:
+            obs_normalizer = self._env.save()['obs_normalizer']
+            what_to_save['obs_normalizer'] = obs_normalizer
+
+        self._logger.setup_torch_saver(what_to_save)
+
+    def _specific_save(self) -> None:
+        """Save some algorithms specific models other than PyTorch format per epoch."""
diff --git a/omnisafe/algorithms/off_policy/ddpg_cbf.py b/omnisafe/algorithms/off_policy/ddpg_cbf.py
new file mode 100644
index 000000000..6df1fcbb3
--- /dev/null
+++ b/omnisafe/algorithms/off_policy/ddpg_cbf.py
@@ -0,0 +1,91 @@
+# Copyright 2024 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the DDPG algorithm with Control Barrier Function."""
+# mypy: ignore-errors
+
+
+from __future__ import annotations
+
+import os
+
+import joblib
+
+from omnisafe.adapter.offpolicy_barrier_function_adapter import OffPolicyBarrierFunctionAdapter
+from omnisafe.algorithms import registry
+from omnisafe.algorithms.off_policy.ddpg import DDPG
+from omnisafe.typing import Any
+from omnisafe.utils.distributed import get_rank
+
+
+@registry.register
+# pylint: disable-next=too-many-instance-attributes, too-few-public-methods
+class DDPGCBF(DDPG):
+    """The DDPG algorithm with CBF.
+
+    References:
+        - Title: End-to-end safe reinforcement learning through barrier functions for
+            safety-critical continuous control tasks
+        - Authors: R Cheng, G Orosz, RM Murray, JW Burdick.
+        - URL: `DDPGCBF <https://arxiv.org/abs/1903.08792>`_
+    """
+
+    def _init_env(self) -> None:
+        super()._init_env()
+        self._env: OffPolicyBarrierFunctionAdapter = OffPolicyBarrierFunctionAdapter(
+            self._env_id,
+            self._cfgs.train_cfgs.vector_env_nums,
+            self._seed,
+            self._cfgs,
+        )
+
+    def _init_log(self) -> None:
+        """Log the DDPGCBF specific information.
+
+        +----------------------------+---------------------------------+
+        | Things to log              | Description                     |
+        +============================+=================================+
+        | Value/Loss_compensator     | The Loss of action compensator. |
+        +----------------------------+---------------------------------+
+        """
+        super()._init_log()
+        self._logger.register_key('Value/Loss_compensator')
+
+    def _specific_save(self) -> None:
+        """Save some algorithms specific models per epoch."""
+        super()._specific_save()
+        if get_rank() == 0:
+            path = os.path.join(
+                self._logger.log_dir,
+                'gp_model_save',
+                f'gaussian_process_regressor_{self._logger.current_epoch}.pkl',
+            )
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            joblib.dump(self._env.gp_models, path)
+
+    def _setup_torch_saver(self) -> None:
+        """Define what needs to be saved below.
+
+        OmniSafe's main storage interface is based on PyTorch. If you need to save models in other
+        formats, please use :meth:`_specific_save`.
+        """
+        what_to_save: dict[str, Any] = {}
+
+        what_to_save['pi'] = self._actor_critic.actor
+        what_to_save['compensator'] = self._env.compensator
+        if self._cfgs.algo_cfgs.obs_normalize:
+            obs_normalizer = self._env.save()['obs_normalizer']
+            what_to_save['obs_normalizer'] = obs_normalizer
+
+        self._logger.setup_torch_saver(what_to_save)
diff --git a/omnisafe/algorithms/off_policy/sac_rcbf.py b/omnisafe/algorithms/off_policy/sac_rcbf.py
new file mode 100644
index 000000000..598c4a14c
--- /dev/null
+++ b/omnisafe/algorithms/off_policy/sac_rcbf.py
@@ -0,0 +1,190 @@
+# Copyright 2024 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the Soft Actor-Critic algorithm with Robust Control Barrier Function."""
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+import os
+
+import torch
+from torch import nn
+from torch.nn.utils.clip_grad import clip_grad_norm_
+
+from omnisafe.adapter.robust_barrier_function_adapter import RobustBarrierFunctionAdapter
+from omnisafe.algorithms import registry
+from omnisafe.algorithms.off_policy.sac import SAC
+from omnisafe.common.robust_barrier_solver import CBFQPLayer
+from omnisafe.common.robust_gp_model import DynamicsModel
+from omnisafe.utils.distributed import get_rank
+
+
+@registry.register
+# pylint: disable-next=too-many-instance-attributes, too-few-public-methods
+class SACRCBF(SAC):
+    """The Soft Actor-Critic algorithm with Robust Control Barrier Function.
+
+    References:
+        - Title: The Soft Actor-Critic algorithm with Robust Control Barrier Function
+        - Authors: Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, Sergey Levine.
+        - URL: `SAC <https://arxiv.org/abs/1801.01290>`_
+    """
+
+    def _init_env(self) -> None:
+        self._env: RobustBarrierFunctionAdapter = RobustBarrierFunctionAdapter(
+            self._env_id,
+            self._cfgs.train_cfgs.vector_env_nums,
+            self._seed,
+            self._cfgs,
+        )
+        if self._env_id == 'Unicycle':
+            solver = CBFQPLayer(
+                env=self._env,
+                device=self._cfgs.train_cfgs.device,
+                gamma_b=self._cfgs.cbf_cfgs.gamma_b,
+                l_p=self._cfgs.cbf_cfgs.l_p,
+            )
+            dynamics_model = DynamicsModel(env=self._env)
+        else:
+            raise NotImplementedError(f'Please implement solver for {self._env_id} !')
+
+        self._env.set_dynamics_model(dynamics_model=dynamics_model)
+        self._env.set_solver(solver=solver)
+
+        assert (
+            self._cfgs.algo_cfgs.steps_per_epoch % self._cfgs.train_cfgs.vector_env_nums == 0
+        ), 'The number of steps per epoch is not divisible by the number of environments.'
+
+        assert (
+            int(self._cfgs.train_cfgs.total_steps) % self._cfgs.algo_cfgs.steps_per_epoch == 0
+        ), 'The total number of steps is not divisible by the number of steps per epoch.'
+        self._epochs: int = int(
+            self._cfgs.train_cfgs.total_steps // self._cfgs.algo_cfgs.steps_per_epoch,
+        )
+        self._epoch: int = 0
+        self._steps_per_epoch: int = (
+            self._cfgs.algo_cfgs.steps_per_epoch // self._cfgs.train_cfgs.vector_env_nums
+        )
+
+        self._update_cycle: int = self._cfgs.algo_cfgs.update_cycle
+        assert (
+            self._steps_per_epoch % self._update_cycle == 0
+        ), 'The number of steps per epoch is not divisible by the number of steps per sample.'
+        self._samples_per_epoch: int = self._steps_per_epoch // self._update_cycle
+        self._update_count: int = 0
+
+    def _update_actor(
+        self,
+        obs: torch.Tensor,
+    ) -> None:
+        super()._update_actor(obs)
+
+        if self._cfgs.algo_cfgs.auto_alpha:
+            with torch.no_grad():
+                action = self._actor_critic.actor.predict(obs, deterministic=False)
+                action = self._env.get_safe_action(obs, action)
+                log_prob = self._actor_critic.actor.log_prob(action)
+            alpha_loss = -self._log_alpha * (log_prob + self._target_entropy).mean()
+
+            self._alpha_optimizer.zero_grad()
+            alpha_loss.backward()
+            self._alpha_optimizer.step()
+            self._logger.store(
+                {
+                    'Loss/alpha_loss': alpha_loss.mean().item(),
+                },
+            )
+        self._logger.store(
+            {
+                'Value/alpha': self._alpha,
+            },
+        )
+
+    def _update_reward_critic(
+        self,
+        obs: torch.Tensor,
+        action: torch.Tensor,
+        reward: torch.Tensor,
+        done: torch.Tensor,
+        next_obs: torch.Tensor,
+    ) -> None:
+        with torch.no_grad():
+            next_action = self._actor_critic.actor.predict(next_obs, deterministic=False)
+            next_action = self._env.get_safe_action(next_obs, next_action)
+            next_logp = self._actor_critic.actor.log_prob(next_action)
+            next_q1_value_r, next_q2_value_r = self._actor_critic.target_reward_critic(
+                next_obs,
+                next_action,
+            )
+            next_q_value_r = torch.min(next_q1_value_r, next_q2_value_r) - next_logp * self._alpha
+
+            target_q_value_r = reward + self._cfgs.algo_cfgs.gamma * (1 - done) * next_q_value_r
+
+        q1_value_r, q2_value_r = self._actor_critic.reward_critic(obs, action)
+        loss = nn.functional.mse_loss(q1_value_r, target_q_value_r) + nn.functional.mse_loss(
+            q2_value_r,
+            target_q_value_r,
+        )
+
+        if self._cfgs.algo_cfgs.use_critic_norm:
+            for param in self._actor_critic.reward_critic.parameters():
+                loss += param.pow(2).sum() * self._cfgs.algo_cfgs.critic_norm_coeff
+
+        self._actor_critic.reward_critic_optimizer.zero_grad()
+        loss.backward()
+
+        if self._cfgs.algo_cfgs.max_grad_norm:
+            clip_grad_norm_(
+                self._actor_critic.reward_critic.parameters(),
+                self._cfgs.algo_cfgs.max_grad_norm,
+            )
+        self._actor_critic.reward_critic_optimizer.step()
+        self._logger.store(
+            {
+                'Loss/Loss_reward_critic': loss.mean().item(),
+                'Value/reward_critic': q1_value_r.mean().item(),
+            },
+        )
+
+    def _loss_pi(
+        self,
+        obs: torch.Tensor,
+    ) -> torch.Tensor:
+        action = self._actor_critic.actor.predict(obs, deterministic=False)
+        action = self._env.get_safe_action(obs, action)
+        log_prob = self._actor_critic.actor.log_prob(action)
+        q1_value_r, q2_value_r = self._actor_critic.reward_critic(obs, action)
+        return (self._alpha * log_prob - torch.min(q1_value_r, q2_value_r)).mean()
+
+    def _specific_save(self) -> None:
+        """Save some algorithms specific models per epoch."""
+        super()._specific_save()
+        if get_rank() == 0:
+            path = os.path.join(self._logger.log_dir, 'gp_model_save')
+            os.makedirs(path, exist_ok=True)
+            train_x = self._env.dynamics_model.train_x
+            train_y = self._env.dynamics_model.train_y
+            disturb_estimators = self._env.dynamics_model.disturb_estimators
+            weights = []
+            for disturb_estimator in disturb_estimators:
+                weights.append(disturb_estimator.model.state_dict())
+            torch.save(weights, os.path.join(path, f'gp_models_{self._logger.current_epoch}.pkl'))
+            torch.save(
+                train_x,
+                os.path.join(path, f'gp_models_train_x_{self._logger.current_epoch}.pkl'),
+            )
+            torch.save(
+                train_y,
+                os.path.join(path, f'gp_models_train_y_{self._logger.current_epoch}.pkl'),
+            )
diff --git a/omnisafe/algorithms/on_policy/__init__.py b/omnisafe/algorithms/on_policy/__init__.py
index 722ce0b11..8351ecf2d 100644
--- a/omnisafe/algorithms/on_policy/__init__.py
+++ b/omnisafe/algorithms/on_policy/__init__.py
@@ -15,6 +15,7 @@
 """On-policy algorithms."""
 
 from omnisafe.algorithms.on_policy import (
+    barrier_function,
     base,
     early_terminated,
     first_order,
@@ -26,6 +27,7 @@
     second_order,
     simmer,
 )
+from omnisafe.algorithms.on_policy.barrier_function import TRPOCBF, PPOBetaCBF
 from omnisafe.algorithms.on_policy.base import PPO, TRPO, NaturalPG, PolicyGradient
 from omnisafe.algorithms.on_policy.early_terminated import PPOEarlyTerminated, TRPOEarlyTerminated
 from omnisafe.algorithms.on_policy.first_order import CUP, FOCOPS
@@ -49,4 +51,5 @@
     *saute.__all__,
     *second_order.__all__,
     *simmer.__all__,
+    *barrier_function.__all__,
 ]
diff --git a/omnisafe/algorithms/on_policy/barrier_function/__init__.py b/omnisafe/algorithms/on_policy/barrier_function/__init__.py
new file mode 100644
index 000000000..dacdc3c4d
--- /dev/null
+++ b/omnisafe/algorithms/on_policy/barrier_function/__init__.py
@@ -0,0 +1,24 @@
+# Copyright 2023 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Control Barrier Function Safe Reinforcement Learning algorithms."""
+
+from omnisafe.algorithms.on_policy.barrier_function.ppo_cbf import PPOBetaCBF
+from omnisafe.algorithms.on_policy.barrier_function.trpo_cbf import TRPOCBF
+
+
+__all__ = [
+    'TRPOCBF',
+    'PPOBetaCBF',
+]
diff --git a/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py
new file mode 100644
index 000000000..4ab2f4d17
--- /dev/null
+++ b/omnisafe/algorithms/on_policy/barrier_function/ppo_cbf.py
@@ -0,0 +1,85 @@
+# Copyright 2023 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the PPO algorithm with Control Barrier Function and Beta Actor."""
+
+from __future__ import annotations
+
+import torch
+
+from omnisafe.adapter.beta_barrier_function_adapter import BetaBarrierFunctionAdapter
+from omnisafe.algorithms import registry
+from omnisafe.algorithms.on_policy.base.ppo import PPO
+from omnisafe.utils import distributed
+
+
+@registry.register
+class PPOBetaCBF(PPO):
+    """The PPO algorithm with CBF and Beta Actor.
+
+    References:
+        - Title: Sampling-based Safe Reinforcement Learning for Nonlinear Dynamical Systems
+        - Authors: Wesley A. Suttle, Vipul K. Sharma, Krishna C. Kosaraju, S. Sivaranjani, Ji Liu,
+            Vijay Gupta, Brian M. Sadler.
+        - URL: `PPOBetaCBF <https://arxiv.org/abs/2403.04007>`_
+    """
+
+    def _init_env(self) -> None:
+        self._env: BetaBarrierFunctionAdapter = BetaBarrierFunctionAdapter(
+            self._env_id,
+            self._cfgs.train_cfgs.vector_env_nums,
+            self._seed,
+            self._cfgs,
+        )
+        assert (self._cfgs.algo_cfgs.steps_per_epoch) % (
+            distributed.world_size() * self._cfgs.train_cfgs.vector_env_nums
+        ) == 0, 'The number of steps per epoch is not divisible by the number of environments.'
+        self._steps_per_epoch: int = (
+            self._cfgs.algo_cfgs.steps_per_epoch
+            // distributed.world_size()
+            // self._cfgs.train_cfgs.vector_env_nums
+        )
+
+    def _loss_pi(
+        self,
+        obs: torch.Tensor,
+        act: torch.Tensor,
+        logp: torch.Tensor,
+        adv: torch.Tensor,
+    ) -> torch.Tensor:
+        r"""Computing pi/actor loss.
+
+        This section of the logic is consistent with PPO, except that it does not record the
+        standard deviation of the actor distribution.
+        """
+        distribution = self._actor_critic.actor(obs)
+        logp_ = self._actor_critic.actor.log_prob(act)
+        ratio = torch.exp(logp_ - logp)
+        ratio_clipped = torch.clamp(
+            ratio,
+            1 - self._cfgs.algo_cfgs.clip,
+            1 + self._cfgs.algo_cfgs.clip,
+        )
+        loss = -torch.min(ratio * adv, ratio_clipped * adv).mean()
+        loss -= self._cfgs.algo_cfgs.entropy_coef * distribution.entropy().mean()
+        # useful extra info
+        entropy = distribution.entropy().mean().item()
+        self._logger.store(
+            {
+                'Train/Entropy': entropy,
+                'Train/PolicyRatio': ratio,
+                'Loss/Loss_pi': loss.mean().item(),
+            },
+        )
+        return loss
diff --git a/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py
new file mode 100644
index 000000000..0324170c4
--- /dev/null
+++ b/omnisafe/algorithms/on_policy/barrier_function/trpo_cbf.py
@@ -0,0 +1,89 @@
+# Copyright 2024 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of the TRPO algorithm with Control Barrier Function."""
+# mypy: ignore-errors
+
+from __future__ import annotations
+
+import os
+
+import joblib
+
+from omnisafe.adapter.barrier_function_adapter import BarrierFunctionAdapter
+from omnisafe.algorithms import registry
+from omnisafe.algorithms.on_policy.base.trpo import TRPO
+from omnisafe.typing import Any
+from omnisafe.utils.distributed import get_rank
+
+
+@registry.register
+class TRPOCBF(TRPO):
+    """The TRPO algorithm with CBF.
+
+    References:
+        - Title: End-to-end safe reinforcement learning through barrier functions for
+            safety-critical continuous control tasks
+        - Authors: R Cheng, G Orosz, RM Murray, JW Burdick.
+        - URL: `TRPOCBF <https://arxiv.org/abs/1903.08792>`_
+    """
+
+    def _init_log(self) -> None:
+        """Log the TRPOCBF specific information.
+
+        +----------------------------+---------------------------------+
+        | Things to log              | Description                     |
+        +============================+=================================+
+        | Value/Loss_compensator     | The Loss of action compensator. |
+        +----------------------------+---------------------------------+
+        """
+        super()._init_log()
+        self._logger.register_key('Value/Loss_compensator')
+
+    def _init_env(self) -> None:
+        super()._init_env()
+        self._env: BarrierFunctionAdapter = BarrierFunctionAdapter(
+            self._env_id,
+            self._cfgs.train_cfgs.vector_env_nums,
+            self._seed,
+            self._cfgs,
+        )
+
+    def _specific_save(self) -> None:
+        """Save some algorithms specific models per epoch."""
+        super()._specific_save()
+        if get_rank() == 0:
+            path = os.path.join(
+                self._logger.log_dir,
+                'gp_model_save',
+                f'gaussian_process_regressor_{self._logger.current_epoch}.pkl',
+            )
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            joblib.dump(self._env.gp_models, path)
+
+    def _setup_torch_saver(self) -> None:
+        """Define what needs to be saved below.
+
+        OmniSafe's main storage interface is based on PyTorch. If you need to save models in other
+        formats, please use :meth:`_specific_save`.
+        """
+        what_to_save: dict[str, Any] = {}
+
+        what_to_save['pi'] = self._actor_critic.actor
+        what_to_save['compensator'] = self._env.compensator
+        if self._cfgs.algo_cfgs.obs_normalize:
+            obs_normalizer = self._env.save()['obs_normalizer']
+            what_to_save['obs_normalizer'] = obs_normalizer
+
+        self._logger.setup_torch_saver(what_to_save)
diff --git a/omnisafe/algorithms/on_policy/base/policy_gradient.py b/omnisafe/algorithms/on_policy/base/policy_gradient.py
index e0792d6ab..cb144922a 100644
--- a/omnisafe/algorithms/on_policy/base/policy_gradient.py
+++ b/omnisafe/algorithms/on_policy/base/policy_gradient.py
@@ -180,13 +180,9 @@ def _init_log(self) -> None:
             config=self._cfgs,
         )
 
-        what_to_save: dict[str, Any] = {}
-        what_to_save['pi'] = self._actor_critic.actor
-        if self._cfgs.algo_cfgs.obs_normalize:
-            obs_normalizer = self._env.save()['obs_normalizer']
-            what_to_save['obs_normalizer'] = obs_normalizer
-        self._logger.setup_torch_saver(what_to_save)
+        self._setup_torch_saver()
         self._logger.torch_save()
+        self._specific_save()
self._logger.register_key( 'Metrics/EpRet', @@ -296,6 +292,7 @@ def learn(self) -> tuple[float, float, float]: epoch + 1 ) == self._cfgs.train_cfgs.epochs: self._logger.torch_save() + self._specific_save() ep_ret = self._logger.get_stats('Metrics/EpRet')[0] ep_cost = self._logger.get_stats('Metrics/EpCost')[0] @@ -586,3 +583,21 @@ def _loss_pi( }, ) return loss + + def _setup_torch_saver(self) -> None: + """Define what needs to be saved below. + + OmniSafe's main storage interface is based on PyTorch. If you need to save models in other + formats, please use :meth:`_specific_save`. + """ + what_to_save: dict[str, Any] = {} + + what_to_save['pi'] = self._actor_critic.actor + if self._cfgs.algo_cfgs.obs_normalize: + obs_normalizer = self._env.save()['obs_normalizer'] + what_to_save['obs_normalizer'] = obs_normalizer + + self._logger.setup_torch_saver(what_to_save) + + def _specific_save(self) -> None: + """Save algorithm-specific models in formats other than PyTorch per epoch.""" diff --git a/omnisafe/common/__init__.py b/omnisafe/common/__init__.py index 9e4fc1bf1..c1311f150 100644 --- a/omnisafe/common/__init__.py +++ b/omnisafe/common/__init__.py @@ -14,6 +14,9 @@ # ============================================================================== """Common Common utilities for OmniSafe.""" +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver +from omnisafe.common.gp_model import DynamicsModel from omnisafe.common.lagrange import Lagrange from omnisafe.common.logger import Logger from omnisafe.common.normalizer import Normalizer diff --git a/omnisafe/common/barrier_comp.py b/omnisafe/common/barrier_comp.py new file mode 100644 index 000000000..64d1af104 --- /dev/null +++ b/omnisafe/common/barrier_comp.py @@ -0,0 +1,95 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of Compensator Used in Control Barrier Function.""" + + +from __future__ import annotations + +import torch +from torch import optim + +from omnisafe.utils.config import Config +from omnisafe.utils.model import build_mlp_network + + +class BarrierCompensator(torch.nn.Module): + """A module that represents a barrier compensator using a multi-layer perceptron (MLP) network. + + This module is designed to compute actions based on observations, with the intention of + compensating for potential barriers in a control system or a similar application. It is built + upon a configurable MLP network and trained using an optimization routine. + + Attributes: + obs_dim (int): Dimension of the observation space. + act_dim (int): Dimension of the action space. + _cfgs (Config): Configuration parameters for the MLP network and training. + model (torch.nn.Module): The MLP network. + optimizer (torch.optim.Optimizer): The optimizer for training the network. + + Args: + obs_dim (int): Dimension of the observation space. + act_dim (int): Dimension of the action space. + cfgs (Config): Configuration parameters for the network and training. 
+ """ + + def __init__(self, obs_dim: int, act_dim: int, cfgs: Config) -> None: + """Initialize the action compensator.""" + super().__init__() + self._cfgs: Config = cfgs + self.model: torch.nn.Module = build_mlp_network( + sizes=[obs_dim, *self._cfgs.hidden_sizes, act_dim], + activation=self._cfgs.activation, + weight_initialization_mode=self._cfgs.weight_initialization_mode, + ) + self.optimizer: optim.Adam = optim.Adam(self.parameters(), lr=self._cfgs.lr) + + def forward(self, obs: torch.Tensor) -> torch.Tensor: + """Estimate the sum of previous compensating actions. + + Args: + obs (torch.Tensor): The input observation. + + Returns: + torch.Tensor: The estimation of previous compensating actions. + """ + return self.model(obs) + + def update( + self, + observation: torch.Tensor, + approx_compensating_act: torch.Tensor, + compensating_act: torch.Tensor, + ) -> torch.Tensor: + """Train the barrier compensator model. + + This method updates the model parameters to minimize the difference between the model's output and the + target, which is a combination of approximate compensating action and compensating action. + + Args: + observation (torch.Tensor): The observation data. + approx_compensating_act (torch.Tensor): The approximate compensating action. + compensating_act (torch.Tensor): The actual compensating action. + + Returns: + torch.Tensor: The loss after training. + """ + for _ in range(self._cfgs.update_iters): + target = approx_compensating_act + compensating_act + self.optimizer.zero_grad() + loss = torch.pow((self(observation) - target), 2).mean() + loss.backward() + self.optimizer.step() + + return loss diff --git a/omnisafe/common/barrier_solver.py b/omnisafe/common/barrier_solver.py new file mode 100644 index 000000000..f281fd0e6 --- /dev/null +++ b/omnisafe/common/barrier_solver.py @@ -0,0 +1,192 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Control Barrier Function Solver.""" + +# pylint: disable=invalid-name,wrong-spelling-in-docstring +# mypy: ignore-errors + + +from __future__ import annotations + +import warnings + +import numpy as np +import torch +from cvxopt import matrix, solvers + +from omnisafe.typing import DEVICE_CPU + + +# pylint: disable-next=too-many-instance-attributes +class PendulumSolver: + """The CBF solver for the pendulum problem using Gaussian Process models. + + This class implements a solver for the pendulum control problem using Control Barrier Functions + (CBFs). The primary goal is to ensure safe reinforcement learning by maintaining + safety constraints during the control process. + + For more details, please refer to: + + *End-to-End Safe Reinforcement Learning through Barrier Functions for Safety-Critical Continuous + Control Tasks* + + Attributes: + action_size (int): Size of the action space, typically 1 for the pendulum. + torque_bound (float): Maximum torque bound that can be applied to the pendulum. + max_speed (float): Maximum speed (angular velocity) of the pendulum. + device (torch.device): Device to run the computations on. 
+ """ + + # pylint: disable-next=invalid-name + def __init__( + self, + action_size: int = 1, + torque_bound: float = 15.0, + max_speed: float = 60.0, + device: torch.device = DEVICE_CPU, + ) -> None: + """Initialize the PendulumSolver with specified parameters. + + Args: + action_size (int): Size of the action space, typically 1 for the pendulum. + torque_bound (float): Maximum torque bound that can be applied to the pendulum. + max_speed (float): Maximum speed (angular velocity) of the pendulum. + device (torch.device): Device to run the computations on. + + Attributes: + F (float): A control gain factor used in the CBF computation. + _gamma_b (float): Parameter for the barrier function. + _kd (float): Damping coefficient used in the barrier function. + """ + self.action_size = action_size + self.torque_bound = torque_bound + self.max_speed = max_speed + self.F = 1.0 + self._device = device + self._gamma_b = 0.5 + self._kd = 1.5 + self._build_barrier() + warnings.filterwarnings('ignore') + + def _build_barrier(self) -> None: + """Construct the Control Barrier Function (CBF) for safe control of the pendulum. + + This method initializes and sets up the necessary components for the CBF, which is used to + ensure that the control actions taken do not violate safety constraints. + """ + self.P = matrix(np.diag([1.0, 1e16]), tc='d') + self.q = matrix(np.zeros(self.action_size + 1)) + self.h1 = np.array([1, 0.01]) + self.h2 = np.array([1, -0.01]) + self.h3 = np.array([-1, 0.01]) + self.h4 = np.array([-1, -0.01]) + + def control_barrier( # pylint: disable=invalid-name + self, + original_action: torch.Tensor, + f: np.ndarray, + g: np.ndarray, + x: np.ndarray, + std: np.ndarray, + ) -> torch.Tensor: + """Adjust the original action using a control barrier function. + + Args: + original_action (torch.Tensor): The original action proposed by the RL algorithm. + f (np.ndarray): The drift component of the system's dynamics. 
+ g (np.ndarray): The control component of the system's dynamics. + x (np.ndarray): The current state of the system. + std (np.ndarray): The standard deviation of the system's state. + + Returns: + torch.Tensor: The adjusted action that respects the system's constraints. + """ + # define gamma for the barrier function + gamma_b = 0.5 + kd = 1.5 + u_rl = original_action.cpu().detach().numpy() + + # set up Quadratic Program to satisfy Control Barrier Function + G = np.array( + [ + [ + -np.dot(self.h1, g), + -np.dot(self.h2, g), + -np.dot(self.h3, g), + -np.dot(self.h4, g), + 1, + -1, + g[1], + -g[1], + ], + [ + -1, + -1, + -1, + -1, + 0, + 0, + 0, + 0, + ], + ], + ) + G = np.transpose(G) + h = np.array( + [ + gamma_b * self.F + + np.dot(self.h1, f) + + np.dot(self.h1, g) * u_rl + - (1 - gamma_b) * np.dot(self.h1, x) + - kd * np.abs(np.dot(self.h1, std)), + gamma_b * self.F + + np.dot(self.h2, f) + + np.dot(self.h2, g) * u_rl + - (1 - gamma_b) * np.dot(self.h2, x) + - kd * np.abs(np.dot(self.h2, std)), + gamma_b * self.F + + np.dot(self.h3, f) + + np.dot(self.h3, g) * u_rl + - (1 - gamma_b) * np.dot(self.h3, x) + - kd * np.abs(np.dot(self.h3, std)), + gamma_b * self.F + + np.dot(self.h4, f) + + np.dot(self.h4, g) * u_rl + - (1 - gamma_b) * np.dot(self.h4, x) + - kd * np.abs(np.dot(self.h4, std)), + -u_rl + self.torque_bound, + u_rl + self.torque_bound, + -f[1] - g[1] * u_rl + self.max_speed, + f[1] + g[1] * u_rl + self.max_speed, + ], + ) + h = np.squeeze(h).astype(np.double) + + # convert numpy arrays to cvx matrices to set up QP + G = matrix(G, tc='d') + h = matrix(h, tc='d') + solvers.options['show_progress'] = False + sol = solvers.qp(self.P, self.q, G, h) + u_bar = sol['x'] + + # check if the adjusted action is within bounds + if np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) - 0.001 >= self.torque_bound: + u_bar[0] = self.torque_bound - u_rl + print('Error in QP') + elif np.add(np.squeeze(u_rl), np.squeeze(u_bar[0])) + 0.001 <= -self.torque_bound: + u_bar[0] = 
-self.torque_bound - u_rl + print('Error in QP') + + return torch.as_tensor(u_bar[0], dtype=torch.float32, device=self._device).unsqueeze(dim=0) diff --git a/omnisafe/common/buffer/onpolicy_buffer.py b/omnisafe/common/buffer/onpolicy_buffer.py index b6f9586df..6fab686aa 100644 --- a/omnisafe/common/buffer/onpolicy_buffer.py +++ b/omnisafe/common/buffer/onpolicy_buffer.py @@ -216,17 +216,7 @@ def get(self) -> dict[str, torch.Tensor]: The data stored and calculated in the buffer. """ self.ptr, self.path_start_idx = 0, 0 - - data = { - 'obs': self.data['obs'], - 'act': self.data['act'], - 'target_value_r': self.data['target_value_r'], - 'adv_r': self.data['adv_r'], - 'logp': self.data['logp'], - 'discounted_ret': self.data['discounted_ret'], - 'adv_c': self.data['adv_c'], - 'target_value_c': self.data['target_value_c'], - } + data = self.data.copy() adv_mean, adv_std, *_ = distributed.dist_statistics_scalar(data['adv_r']) cadv_mean, *_ = distributed.dist_statistics_scalar(data['adv_c']) diff --git a/omnisafe/common/buffer/vector_onpolicy_buffer.py b/omnisafe/common/buffer/vector_onpolicy_buffer.py index a920d8e6a..3ebd61c87 100644 --- a/omnisafe/common/buffer/vector_onpolicy_buffer.py +++ b/omnisafe/common/buffer/vector_onpolicy_buffer.py @@ -88,6 +88,23 @@ def __init__( # pylint: disable=super-init-not-called,too-many-arguments for _ in range(num_envs) ] + def add_field(self, name: str, shape: tuple[int, ...], dtype: torch.dtype) -> None: + """Add a field to the buffer. + + Examples: + >>> buffer = BaseBuffer(...) + >>> buffer.add_field('new_field', (2, 3), torch.float32) + >>> buffer.data['new_field'].shape + >>> (buffer.size, 2, 3) + + Args: + name (str): The name of the field. + shape (tuple of int): The shape of the field. + dtype (torch.dtype): The dtype of the field. 
+ """ + for buffer in self.buffers: + buffer.add_field(name=name, shape=shape, dtype=dtype) + @property def num_buffers(self) -> int: """Number of buffers.""" diff --git a/omnisafe/common/experiment_grid.py b/omnisafe/common/experiment_grid.py index f93cef8d3..787f4592f 100644 --- a/omnisafe/common/experiment_grid.py +++ b/omnisafe/common/experiment_grid.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -544,6 +544,8 @@ def analyze( compare_num: int | None = None, cost_limit: float | None = None, show_image: bool = False, + reward_metrics: str = 'Metrics/EpRet', + cost_metrics: str = 'Metrics/EpCost', ) -> None: """Analyze the experiment results. @@ -559,6 +561,8 @@ def analyze( cost_limit (float or None, optional): Value for one line showed on graph to indicate cost. Defaults to None. show_image (bool): Whether to show graph image in GUI windows. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. """ assert self._statistical_tools is not None, 'Please run run() first!' self._statistical_tools.load_source(self.log_dir) @@ -568,6 +572,8 @@ def analyze( compare_num, cost_limit, show_image=show_image, + reward_metrics=reward_metrics, + cost_metrics=cost_metrics, ) def evaluate(self, num_episodes: int = 10, cost_criteria: float = 1.0) -> None: diff --git a/omnisafe/common/gp_model.py b/omnisafe/common/gp_model.py new file mode 100644 index 000000000..dac93ea13 --- /dev/null +++ b/omnisafe/common/gp_model.py @@ -0,0 +1,218 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Implementation of the Gaussian Process Dynamics Model.""" + +# pylint: disable=invalid-name,wrong-spelling-in-docstring +# mypy: ignore-errors + + +from __future__ import annotations + +import joblib +import numpy as np +import torch +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import RBF +from sklearn.gaussian_process.kernels import ConstantKernel as C + + +# pylint: disable-next=too-many-instance-attributes +class DynamicsModel: + """This class handles the creation and management of Gaussian Process (GP) models. + + These GP models predict the next state of the environment based on the current state. + + .. warning:: + This class provides an implementation for the ``Pendulum-v1`` environment. It needs to be + customized to extend it to more environments. + + Args: + observation_size (int): The size of the observation space. This determines + the number of GP models to create. + load_dir (Optional[str]): The directory to load the GP models from. If None, new models + are initialized. Default is None. + + Attributes: + observation_size (int): The size of the observation space. + gp_model_prev (List[GaussianProcessRegressor]): The GP models from the previous iteration. + gp_model (List[GaussianProcessRegressor]): The current GP models used for predictions. 
+ """ + + def __init__(self, observation_size: int, load_dir: str | None = None) -> None: + """Initialize the DynamicsModel with a specified observation size and optional model loading. + + Args: + observation_size (int): Size of the observation space. + load_dir (Optional[str]): Directory to load the GP models from. If not provided, + new models will be created. + """ + self.observation_size: int = observation_size + self.gp_model_prev: list[GaussianProcessRegressor] + self.gp_model: list[GaussianProcessRegressor] + self._build_gp_model(load_dir=load_dir) + + def _build_gp_model(self, load_dir: str | None = None) -> None: + """Build or load the Gaussian Process models. + + If a load directory is provided, the models are loaded from the specified directory. + Otherwise, new models are created with default parameters. + + Args: + load_dir (Optional[str]): Directory to load the GP models from. If None, new models + will be created. + """ + gp_list = [] + noise = 0.01 # Small noise term to stabilize the GP model + for _ in range(self.observation_size - 1): + if not load_dir: + # Define the kernel as a product of a constant kernel and an RBF kernel + kern = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2)) + # Initialize the GaussianProcessRegressor with the specified kernel and noise + gp = GaussianProcessRegressor(kernel=kern, alpha=noise, n_restarts_optimizer=10) + gp_list.append(gp) + else: + # Load the GP models from the specified directory + gp_list = joblib.load(load_dir) + self.gp_model = gp_list + self.gp_model_prev = gp_list.copy() + + @property + def gp_models(self) -> list[GaussianProcessRegressor]: + """Return all gaussian process regressor for saving.""" + return self.gp_model + + def get_dynamics(self, obs: list[float], original_action: float) -> np.ndarray: + """Calculate the dynamics of the system based on the current observation and the original action. + + This method computes the next state of a pendulum system using the provided state and + action. 
+ + Args: + obs (list[float]): The current observation of the system state. + For the ``Pendulum-v1``, It should contain at least three elements: + [x, y, theta_dot], where x and y are the Cartesian coordinates of + the pendulum, and theta_dot is the angular velocity. + original_action (float): The original action proposed by the RL agent. + + Returns: + np.ndarray: The calculated dynamics of the system, representing the next state. + """ + # Time step + dt = 0.05 + # Gravitational constant + G = 10 + # Mass of the pendulum + m = 2 + # Length of the pendulum + length = 2 + + # Calculate the angle theta from the Cartesian coordinates + theta = np.arctan2(obs[1], obs[0]) + # Angular velocity + theta_dot = obs[2] + + f = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * original_action * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * original_action * dt, + ], + ) + + return np.squeeze(f) + + def update_gp_dynamics(self, obs: np.ndarray, act: np.ndarray) -> None: + """Update the Gaussian Process (GP) dynamics model based on observed states and actions. + + Args: + obs (np.ndarray): Agent's observation of the current environment. + act (np.ndarray): Actions taken. 
+ """ + obs = obs.detach().cpu().squeeze().numpy() + act = act.detach().cpu().squeeze().numpy() + N = self.observation_size + X = obs + U = act + L = len(X) + err = np.zeros((L - 1, N - 1)) + S = np.zeros((L - 1, 2)) + for i in range(L - 1): + f = self.get_dynamics(X[i], U[i]) + theta_p = np.arctan2(X[i][1], X[i][0]) + theta_dot_p = X[i][2] + theta = np.arctan2(X[i + 1][1], X[i + 1][0]) + theta_dot = X[i + 1][2] + S[i, :] = np.array([theta_p, theta_dot_p]) + err[i, :] = np.array([theta, theta_dot]) - f + self.gp_model[0].fit(S, err[:, 0]) + self.gp_model[1].fit(S, err[:, 1]) + + def get_gp_dynamics(self, obs: torch.Tensor, use_prev_model: bool) -> list[np.ndarray]: + """Retrieve the GP dynamics based on the current observation. + + Args: + obs (torch.Tensor): Agent's observation of the current environment. + use_prev_model (bool): Whether to use previous gaussian model. + + Returns: + list[np.ndarray]: list containing the gp dynamics [f, g, x, std]. + """ + obs = obs.cpu().detach().numpy() + u_rl = 0 + dt = 0.05 + G = 10 + m = 1 + length = 1 + obs = np.squeeze(obs) + theta = np.arctan2(obs[1], obs[0]) + theta_dot = obs[2] + x = np.array([theta, theta_dot]) + f_nom = np.array( + [ + -3 * G / (2 * length) * np.sin(theta + np.pi) * dt**2 + + theta_dot * dt + + theta + + 3 / (m * length**2) * u_rl * dt**2, + theta_dot + - 3 * G / (2 * length) * np.sin(theta + np.pi) * dt + + 3 / (m * length**2) * u_rl * dt, + ], + ) + g = np.array([3 / (m * length**2) * dt**2, 3 / (m * length**2) * dt]) + f_nom = np.squeeze(f_nom) + f = np.zeros(2) + if use_prev_model: + [m1, std1] = self.gp_model_prev[0].predict(x.reshape(1, -1), return_std=True) + [m2, std2] = self.gp_model_prev[1].predict(x.reshape(1, -1), return_std=True) + else: + [m1, std1] = self.gp_model[0].predict(x.reshape(1, -1), return_std=True) + [m2, std2] = self.gp_model[1].predict(x.reshape(1, -1), return_std=True) + f[0] = f_nom[0] + m1 + f[1] = f_nom[1] + m2 + return [ + np.squeeze(f), + np.squeeze(g), + 
np.squeeze(x), + np.array([np.squeeze(std1), np.squeeze(std2)]), + ] + + def reset_gp_model(self) -> None: + """Reset the Gaussian process model of the barrier function solver.""" + self.gp_model_prev = self.gp_model.copy() + self._build_gp_model() diff --git a/omnisafe/common/robust_barrier_solver.py b/omnisafe/common/robust_barrier_solver.py new file mode 100644 index 000000000..a871ccc4d --- /dev/null +++ b/omnisafe/common/robust_barrier_solver.py @@ -0,0 +1,329 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Robust Control Barrier Function Solver for OmniSafe.""" + + +# mypy: ignore-errors +# pylint: disable=invalid-name,wrong-spelling-in-docstring
from __future__ import annotations + +import warnings +from typing import Any + +import gymnasium as gym +import torch +from qpth.qp import QPFunction + +from omnisafe.utils.tools import to_tensor + + +DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}} + + +class CBFQPLayer: + """CBFQPLayer for the robust control barrier function solver. + + Args: + env (gymnasium.Env): The Gymnasium environment to interact with. + device (str, optional): The device type, such as 'cpu' or 'gpu'. Defaults to 'cpu'. + gamma_b (float, optional): The gamma parameter. Defaults to 20. + l_p (float, optional): Distance of the look-ahead point ahead of the vehicle center used by the Unicycle CBF constraints. Defaults to 0.03. 
+ + Attributes: + device (torch.device): The device on which computations will be performed. + env (gym.Env): The Gym environment instance. + u_min (float): The minimum control bound. + u_max (float): The maximum control bound. + gamma_b (float): The gamma parameter. + l_p (float): An additional layer parameter. + action_dim (int): The dimensionality of the action space. + """ + + def __init__( + self, + env: gym.Env, + device: str = 'cpu', + gamma_b: float = 20, + l_p: float = 0.03, + ) -> None: + """Initializes a CBFLayer instance with specified parameters and environment.""" + self.device = torch.device(device) + self.env = env + self.u_min, self.u_max = self.get_control_bounds() + self.gamma_b = gamma_b + self.l_p = l_p + self.action_dim = env.action_space.shape[0] + warnings.filterwarnings('ignore') + + def get_safe_action( + self, + state_batch: torch.Tensor, + action_batch: torch.Tensor, + mean_pred_batch: torch.Tensor, + sigma_batch: torch.Tensor, + ) -> torch.Tensor: + """Computes safe actions based on current state and action predictions, adjusting for uncertainties. + + Args: + state_batch (torch.Tensor): Current state batch, tensor or ndarray. + action_batch (torch.Tensor): Nominal action batch, tensor or ndarray. + mean_pred_batch (torch.Tensor): Mean disturbance predictions, tensor or ndarray. + sigma_batch (torch.Tensor): Standard deviations of disturbances, tensor or ndarray. + + Returns: + torch.Tensor: Safe actions adjusted for given constraints and uncertainties. 
+ """ + expand_dims = len(state_batch.shape) == 1 + if expand_dims: + state_batch = state_batch.unsqueeze(0) + action_batch = action_batch.unsqueeze(0) + mean_pred_batch = mean_pred_batch.unsqueeze(0) + sigma_batch = sigma_batch.unsqueeze(0) + + Ps, qs, Gs, hs = self.get_cbf_qp_constraints( + state_batch, + action_batch, + mean_pred_batch, + sigma_batch, + ) + safe_action_batch = self.solve_qp(Ps, qs, Gs, hs) + final_action_batch = torch.clamp( + action_batch + safe_action_batch, + self.u_min.repeat(action_batch.shape[0], 1), + self.u_max.repeat(action_batch.shape[0], 1), + ) + + return final_action_batch if not expand_dims else final_action_batch.squeeze(0) + + def solve_qp( + self, + Ps: torch.Tensor, + qs: torch.Tensor, + Gs: torch.Tensor, + hs: torch.Tensor, + ) -> torch.Tensor: + """Solves a batch of quadratic programming (QP) problems. + + Each QP problem is defined as: + minimize_{u,eps} 0.5 * u^T P u + q^T u + subject to G[u,eps]^T <= h + + Args: + Ps (torch.Tensor): Quadratic cost matrix for each problem. + qs (torch.Tensor): Linear cost vector for each problem. + Gs (torch.Tensor): Inequality constraint matrix for each problem. + hs (torch.Tensor): Inequality constraint vector for each problem. + + Returns: + The safe action for each problem, omitting the slack variable, with dimension (batch_size, n_u). 
+ """ + Ghs = torch.cat((Gs, hs.unsqueeze(2)), -1) + Ghs_norm = torch.max(torch.abs(Ghs), dim=2, keepdim=True)[0] + Gs /= Ghs_norm + hs = hs / Ghs_norm.squeeze(-1) + sol = self.cbf_layer( + Ps, + qs, + Gs, + hs, + solver_args={ + 'check_Q_spd': False, + 'maxIter': 100000, + 'notImprovedLim': 10, + 'eps': 1e-4, + }, + ) + + return sol[:, : self.env.action_space.shape[0]] + + def cbf_layer( + self, + Qs: torch.Tensor, + ps: torch.Tensor, + Gs: torch.Tensor, + hs: torch.Tensor, + As: torch.Tensor | None = None, + bs: torch.Tensor | None = None, + solver_args: dict[str, Any] | None = None, + ) -> torch.Tensor: + """Applies a custom layer to solve QP problems using given constraints. + + Args: + Qs (torch.Tensor): Quadratic cost matrix for each problem. + ps (torch.Tensor): Linear cost vector for each problem. + Gs (torch.Tensor): Inequality constraint matrix for each problem. + hs (torch.Tensor): Inequality constraint vector for each problem. + As (torch.Tensor, optional): Equality constraint matrix. Defaults to None. + bs (torch.Tensor, optional): Equality constraint vector. Defaults to None. + solver_args (dict, optional): Dictionary of solver arguments. Defaults to None. + + Returns: + Result of the QP solver for each problem. + """ + if solver_args is None: + solver_args = {} + + if As is None or bs is None: + As = torch.Tensor().to(self.device).double() + bs = torch.Tensor().to(self.device).double() + + return QPFunction(verbose=-1, **solver_args)( + Qs.double(), + ps.double(), + Gs.double(), + hs.double(), + As, + bs, + ).float() + + # pylint: disable-next=too-many-locals + def get_cbf_qp_constraints( + self, + state_batch: torch.Tensor, + action_batch: torch.Tensor, + mean_pred_batch: torch.Tensor, + sigma_pred_batch: torch.Tensor, + gamma_b: float = 1.0, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Builds up matrices required to solve a quadratic program (QP). 
+ + The QP is defined to solve: + minimize_{u,eps} 0.5 * u^T P u + q^T u + subject to G[u,eps]^T <= h + + Args: + state_batch (torch.Tensor): Current state batch. + action_batch (torch.Tensor): Nominal control input batch. + mean_pred_batch (torch.Tensor): Mean disturbance prediction state batch. + sigma_pred_batch (torch.Tensor): Standard deviation of the additive disturbance. + gamma_b (float, optional): CBF parameter for the class-Kappa function. Defaults to 1.0. + + Returns: + tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: A tuple containing: + P (torch.Tensor): Quadratic cost matrix in the QP. + q (torch.Tensor): Linear cost vector in the QP. + G (torch.Tensor): Inequality constraint matrix for QP constraints. + h (torch.Tensor): Inequality constraint vector for QP constraints. + """ + assert ( + len(state_batch.shape) == 2 + and len(action_batch.shape) == 2 + and len(mean_pred_batch.shape) == 2 + and len(sigma_pred_batch.shape) == 2 + ), print( + state_batch.shape, + action_batch.shape, + mean_pred_batch.shape, + sigma_pred_batch.shape, + ) + + batch_size = state_batch.shape[0] + gamma_b = self.gamma_b + + state_batch = torch.unsqueeze(state_batch, -1).to(self.device) + action_batch = torch.unsqueeze(action_batch, -1).to(self.device) + mean_pred_batch = torch.unsqueeze(mean_pred_batch, -1).to(self.device) + sigma_pred_batch = torch.unsqueeze(sigma_pred_batch, -1).to(self.device) + if self.env.dynamics_mode == 'Unicycle': + num_cbfs = len(self.env.hazards) + l_p = self.l_p + buffer = 0.1 + + thetas = state_batch[:, 2, :].squeeze(-1) + c_thetas = torch.cos(thetas) + s_thetas = torch.sin(thetas) + ps = torch.zeros((batch_size, 2)).to(self.device) + ps[:, 0] = state_batch[:, 0, :].squeeze(-1) + l_p * c_thetas + ps[:, 1] = state_batch[:, 1, :].squeeze(-1) + l_p * s_thetas + f_ps = torch.zeros((batch_size, 2, 1)).to(self.device) + Rs = torch.zeros((batch_size, 2, 2)).to(self.device) + Rs[:, 0, 0] = c_thetas + Rs[:, 0, 1] = -s_thetas + Rs[:, 1, 0] = 
s_thetas + Rs[:, 1, 1] = c_thetas + Ls = torch.zeros((batch_size, 2, 2)).to(self.device) + Ls[:, 0, 0] = 1 + Ls[:, 1, 1] = l_p + g_ps = torch.bmm(Rs, Ls) + mu_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device) + mu_theta_aug[:, 1, :] = mean_pred_batch[:, 2, :] + mu_ps = torch.bmm(g_ps, mu_theta_aug) + mean_pred_batch[:, :2, :] + sigma_theta_aug = torch.zeros([batch_size, 2, 1]).to(self.device) + sigma_theta_aug[:, 1, :] = sigma_pred_batch[:, 2, :] + sigma_ps = torch.bmm(torch.abs(g_ps), sigma_theta_aug) + sigma_pred_batch[:, :2, :] + + hs = 1e3 * torch.ones((batch_size, num_cbfs), device=self.device) + dhdps = torch.zeros((batch_size, num_cbfs, 2), device=self.device) + hazards = self.env.hazards + for i, hazard in enumerate(hazards): + if hazard['type'] == 'circle': + obs_loc = to_tensor(hazard['location'], torch.FloatTensor, self.device) + hs[:, i] = 0.5 * ( + torch.sum((ps - obs_loc) ** 2, dim=1) - (hazard['radius'] + buffer) ** 2 + ) + dhdps[:, i, :] = ps - obs_loc + else: + raise NotImplementedError + + n_u = action_batch.shape[1] + num_constraints = num_cbfs + 2 * n_u + + G = torch.zeros((batch_size, num_constraints, n_u + 1)).to(self.device) + h = torch.zeros((batch_size, num_constraints)).to(self.device) + ineq_constraint_counter = 0 + + G[:, :num_cbfs, :n_u] = -torch.bmm(dhdps, g_ps) + G[:, :num_cbfs, n_u] = -1 + h[:, :num_cbfs] = gamma_b * (hs**3) + ( + torch.bmm(dhdps, f_ps + mu_ps) + - torch.bmm(torch.abs(dhdps), sigma_ps) + + torch.bmm(torch.bmm(dhdps, g_ps), action_batch) + ).squeeze(-1) + ineq_constraint_counter += num_cbfs + P = ( + torch.diag(torch.tensor([1.0e0, 1.0e-2, 1e5])) + .repeat(batch_size, 1, 1) + .to(self.device) + ) + q = torch.zeros((batch_size, n_u + 1)).to(self.device) + else: + raise NotImplementedError + + n_u = action_batch.shape[1] + + for c in range(n_u): + + if self.u_max is not None: + G[:, ineq_constraint_counter, c] = 1 + h[:, ineq_constraint_counter] = self.u_max[c] - action_batch[:, c].squeeze(-1) + 
ineq_constraint_counter += 1 + + if self.u_min is not None: + G[:, ineq_constraint_counter, c] = -1 + h[:, ineq_constraint_counter] = -self.u_min[c] + action_batch[:, c].squeeze(-1) + ineq_constraint_counter += 1 + + return P, q, G, h + + def get_control_bounds(self) -> tuple[torch.Tensor, torch.Tensor]: + """Obtain the action bounds. + + Returns: + Action bounds, i.e., min control input and max control input. + """ + u_min = torch.tensor(self.env.safe_action_space.low).to(self.device) + u_max = torch.tensor(self.env.safe_action_space.high).to(self.device) + + return u_min, u_max diff --git a/omnisafe/common/robust_gp_model.py b/omnisafe/common/robust_gp_model.py new file mode 100644 index 000000000..9361c833b --- /dev/null +++ b/omnisafe/common/robust_gp_model.py @@ -0,0 +1,434 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implementation of Dynamics Model Based on GPyTorch.""" +# mypy: ignore-errors + + +from __future__ import annotations + +import os +import warnings +from typing import Callable + +import gpytorch +import gymnasium as gym +import numpy as np +import torch +from gpytorch.distributions import MultivariateNormal +from gpytorch.kernels import RBFKernel, ScaleKernel +from gpytorch.likelihoods import Likelihood +from gpytorch.means import ZeroMean +from gpytorch.priors import NormalPrior + +from omnisafe.typing import DEVICE_CPU +from omnisafe.utils.tools import to_tensor + + +DYNAMICS_MODE = {'Unicycle': {'n_s': 3, 'n_u': 2}} +MAX_STD = {'Unicycle': [2e-1, 2e-1, 2e-1]} + + +class BaseGPy(gpytorch.models.ExactGP): + """A Gaussian Process (GP) model using a zero mean function and a scaled RBF kernel with priors. + + This class extends gpytorch.models.ExactGP, specifically designed for use in + disturbance estimation tasks. + + Attributes: + mean_module (ZeroMean): The mean module which is set to zero mean. + covar_module (ScaleKernel): The covariance kernel, a scaled RBF kernel with specified priors. + + Args: + train_x (Tensor): Training input features, which should be a tensor. + train_y (Tensor): Training target values, which should be a tensor. + prior_std (float): The prior standard deviation used to adjust the output scale of the kernel. + likelihood (Likelihood): The likelihood function associated with the GP model. 
+ """ + + def __init__( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + prior_std: float, + likelihood: Likelihood, + ) -> None: + """Initialize the BaseGPy model.""" + super().__init__(train_x, train_y, likelihood) + self.mean_module = ZeroMean() + self.covar_module = ScaleKernel( + RBFKernel(lengthscale_prior=NormalPrior(1e5, 1e-5)), + outputscale_prior=NormalPrior(prior_std + 1e-6, 1e-5), + ) + self.covar_module.base_kernel.lengthscale = 1e5 + self.covar_module.outputscale = prior_std + 1e-6 + + # pylint: disable=arguments-differ + def forward(self, x: torch.Tensor) -> MultivariateNormal: + """Forward pass through the GP model to produce a multivariate normal distribution. + + Args: + x (Tensor): Input features for which predictions are to be made. + + Returns: + MultivariateNormal: A multivariate normal distribution reflecting the GP predictions. + """ + mean = self.mean_module(x) + covar = self.covar_module(x) + return MultivariateNormal(mean, covar) + + +class GPyDisturbanceEstimator: + """A class for estimating disturbances using Gaussian Processes with GPyTorch. + + Attributes: + device (torch.device): The device (CPU or CUDA) on which the tensors will be processed. + _train_x (torch.Tensor): Training data features. + _train_y (torch.Tensor): Training data targets. + likelihood (gpytorch.likelihoods.Likelihood): The likelihood model for GP inference. + model (BaseGPy): The GPyTorch model. + + Args: + train_x (torch.Tensor): Training data features. If not a tensor, it will be converted. + train_y (torch.Tensor): Training data targets. If not a tensor, it will be converted. + prior_std (float): Standard deviation of the prior distribution. + likelihood (Optional[gpytorch.likelihoods.Likelihood]): A GPyTorch likelihood. + device (Optional[torch.device]): The torch device. Defaults to CPU if None. 
+ """ + + def __init__( + self, + train_x: torch.Tensor, + train_y: torch.Tensor, + prior_std: float, + likelihood: gpytorch.likelihoods.Likelihood | None = None, + device: torch.device = DEVICE_CPU, + ) -> None: + """Initialize the GPyDisturbanceEstimator.""" + self.device = device if device else torch.device('cpu') + + if not torch.is_tensor(train_x): + train_x = torch.tensor(train_x, dtype=torch.float32, device=self.device) + if not torch.is_tensor(train_y): + train_y = torch.tensor(train_y, dtype=torch.float32, device=self.device) + self._train_x = train_x + self._train_y = train_y + + if not likelihood: + likelihood = gpytorch.likelihoods.GaussianLikelihood() + self.likelihood = likelihood.to(self.device) + + self.model = BaseGPy(train_x, train_y, prior_std, likelihood) + self.model = self.model.to(self.device) + warnings.filterwarnings('ignore') + + def train(self, training_iter: int) -> None: + """Train the Gaussian Process model. + + Args: + training_iter (int): Number of training iterations. + verbose (bool): If True, print detailed logging information. + """ + self.model.train() + self.likelihood.train() + optimizer = torch.optim.Adam(self.model.parameters(), lr=0.1) + mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model) + + for _ in range(training_iter): + optimizer.zero_grad() + output = self.model(self._train_x) + loss = -mll(output, self._train_y) + loss.backward() + optimizer.step() + + def predict(self, test_x: torch.Tensor) -> dict[str, torch.Tensor | np.ndarray]: + """Make predictions on new data. + + Args: + test_x (torch.Tensor): Test data features. If not a tensor, it will be converted. + + Returns: + A dictionary containing prediction mean, variance, covariance matrix, and confidence + intervals. If the input was not a tensor, values will be converted to numpy arrays. 
+ """ + is_tensor = torch.is_tensor(test_x) + if not is_tensor: + test_x = torch.tensor(test_x, dtype=torch.float32, device=self.device) + + self.model.eval() + self.likelihood.eval() + + with torch.no_grad(), gpytorch.settings.fast_pred_var(): + observed_pred = self.likelihood(self.model(test_x)) + pred_dict = { + 'mean': observed_pred.mean.cpu(), + 'f_var': observed_pred.variance.cpu(), + 'f_covar': observed_pred.covariance_matrix.cpu(), + 'lower_ci': observed_pred.confidence_region()[0].cpu(), + 'upper_ci': observed_pred.confidence_region()[1].cpu(), + } + + if not is_tensor: + for key, val in pred_dict.items(): + pred_dict[key] = val.numpy() + + return pred_dict + + +# pylint: disable-next=too-many-instance-attributes +class DynamicsModel: + """Initialize the DynamicsModel with a gymnasium environment. + + Args: + env (gym.Env): The gymnasium environment to model dynamics for. + gp_model_size (int, optional): Maximum history count for disturbances. Default to 2000. + l_p (float, optional): Learning parameter. Default to 0.03. + device (str, optional): The device to perform computations on. Default to 'cpu'. 
+ """ + + def __init__( + self, + env: gym.Env, + gp_model_size: int = 2000, + l_p: float = 0.03, + device: str = 'cpu', + ) -> None: + """Initialize the DynamicsModel with a gymnasium environment.""" + self.env = env + self.get_f, self.get_g = self.get_dynamics() + self.n_s = DYNAMICS_MODE[self.env.dynamics_mode]['n_s'] + self.n_u = DYNAMICS_MODE[self.env.dynamics_mode]['n_u'] + + self.disturbance_history = {} + self.history_counter = 0 + self.max_history_count = gp_model_size + self.disturbance_history['state'] = np.zeros((self.max_history_count, self.n_s)) + self.disturbance_history['disturbance'] = np.zeros((self.max_history_count, self.n_s)) + self._train_x = np.zeros((self.max_history_count, self.n_s)) + self._train_y = np.zeros((self.max_history_count, self.n_s)) + self._disturb_estimators = [] + self.device = torch.device(device) + + for i in range(self.n_s): + self._disturb_estimators.append( + GPyDisturbanceEstimator( + np.zeros((self.max_history_count, self.n_s)), + np.zeros((self.max_history_count, self.n_s)), + MAX_STD[self.env.dynamics_mode][i], + device=self.device, + ), + ) + self._disturb_initialized = True + self.l_p = l_p + + def get_dynamics(self) -> tuple[Callable, Callable]: + """Retrieve the dynamics functions for drift and control based on the environment's dynamics mode. + + Returns: + tuple: A tuple containing two callable methods, `get_f` and `get_g`. + """ + if self.env.dynamics_mode == 'Unicycle': + + def get_f(state_batch: np.ndarray) -> np.ndarray: + return np.zeros(state_batch.shape) + + def get_g(state_batch: np.ndarray) -> np.ndarray: + theta = state_batch[:, 2] + g_x = np.zeros((state_batch.shape[0], 3, 2)) + g_x[:, 0, 0] = np.cos(theta) + g_x[:, 1, 0] = np.sin(theta) + g_x[:, 2, 1] = 1.0 + return g_x + + else: + raise NotImplementedError('Unknown Dynamics mode.') + + return get_f, get_g + + def get_state(self, obs: torch.Tensor) -> torch.Tensor: + """Process the raw observations from the environment. 
+ + Args: + obs (torch.Tensor): The environment observations. + + Returns: + torch.Tensor: The processed state of the system. + """ + expand_dims = len(obs.shape) == 1 + dtype = obs.dtype + device = obs.device + obs = obs.cpu().numpy() if obs.is_cuda else obs.numpy() + + if expand_dims: + obs = np.expand_dims(obs, 0) + + if self.env.dynamics_mode == 'Unicycle': + theta = np.arctan2(obs[:, 3], obs[:, 2]) + state_batch = np.zeros((obs.shape[0], 3)) + state_batch[:, 0] = obs[:, 0] + state_batch[:, 1] = obs[:, 1] + state_batch[:, 2] = theta + else: + raise NotImplementedError('Unknown dynamics') + + if expand_dims: + state_batch = state_batch.squeeze(0) + + return torch.tensor(state_batch, dtype=dtype, device=device) + + def append_transition( + self, + state_batch: np.ndarray, + u_batch: np.ndarray, + next_state_batch: np.ndarray, + ) -> None: + """Estimate the disturbance from the current dynamics transition and adds it to the buffer. + + Args: + state_batch (np.ndarray): The batch of current states, shape (n_s,) or (batch_size, n_s). + u_batch (np.ndarray): The batch of actions applied, shape (n_u,) or (batch_size, n_u). + next_state_batch (np.ndarray): The batch of next states, shape (n_s,) or (batch_size, n_s). + """ + u_batch = np.expand_dims(u_batch, -1) + disturbance_batch = ( + next_state_batch + - state_batch + - self.env.dt + * (self.get_f(state_batch) + (self.get_g(state_batch) @ u_batch).squeeze(-1)) + ) / self.env.dt + + for i in range(state_batch.shape[0]): + self.disturbance_history['state'][self.history_counter % self.max_history_count] = ( + state_batch[i] + ) + self.disturbance_history['disturbance'][ + self.history_counter % self.max_history_count + ] = disturbance_batch[i] + self.history_counter += 1 + + if self.history_counter % (self.max_history_count // 10) == 0: + self.fit_gp_model() + + def fit_gp_model(self, training_iter: int = 70) -> None: + """Fit a Gaussian Process model to the disturbance data. 
+ + Args: + training_iter (int, optional): Number of training iterations for the GP model. Defaults to 70. + """ + if self.history_counter < self.max_history_count: + train_x = self.disturbance_history['state'][: self.history_counter] + train_y = self.disturbance_history['disturbance'][: self.history_counter] + else: + train_x = self.disturbance_history['state'] + train_y = self.disturbance_history['disturbance'] + + train_x_std = np.std(train_x, axis=0) + train_x_normalized = train_x / (train_x_std + 1e-8) + train_y_std = np.std(train_y, axis=0) + train_y_normalized = train_y / (train_y_std + 1e-8) + + self._disturb_estimators = [] + for i in range(self.n_s): + self._disturb_estimators.append( + GPyDisturbanceEstimator( + train_x_normalized, + train_y_normalized[:, i], + MAX_STD[self.env.dynamics_mode][i], + device=self.device, + ), + ) + self._disturb_estimators[i].train(training_iter) + self._disturb_initialized = False + self._train_x = train_x + self._train_y = train_y + + def predict_disturbance(self, test_x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Predict the disturbance at the queried states using the trained Gaussian Process models. + + Args: + test_x (torch.Tensor): The state for which to predict disturbances, shape (n_test, n_s). + + Returns: + tuple: A tuple of arrays (means, variances). 
+ """ + dtype = test_x.dtype + device = test_x.device + test_x = test_x.cpu().detach().double().numpy() + + expand_dims = len(test_x.shape) == 1 + if expand_dims: + test_x = np.expand_dims(test_x, axis=0) + + means = np.zeros(test_x.shape) + f_std = np.zeros(test_x.shape) + + if not self._disturb_initialized: + train_x_std = np.std(self._train_x, axis=0) + train_y_std = np.std(self._train_y, axis=0) + test_x = test_x / train_x_std + for i in range(self.n_s): + prediction_ = self._disturb_estimators[i].predict(test_x) + means[:, i] = prediction_['mean'] * (train_y_std[i] + 1e-8) + f_std[:, i] = np.sqrt(prediction_['f_var']) * (train_y_std[i] + 1e-8) + + else: + f_std = np.ones(test_x.shape) + for i in range(self.n_s): + f_std[:, i] *= MAX_STD[self.env.dynamics_mode][i] + + if expand_dims: + means = means.squeeze(0) + f_std = f_std.squeeze(0) + + return (to_tensor(means, dtype, device), to_tensor(f_std, dtype, device)) + + def load_disturbance_models(self, load_dir: str, epoch: str) -> None: + """Load the disturbance models and their training data. + + Args: + load_dir (str): The directory where the model files are saved. + epoch (str): The epoch identifier used in the filenames to load the specific model checkpoint. 
+ """ + self._disturb_estimators = [] + weights = torch.load( + os.path.join(load_dir, f'gp_models_{epoch}.pkl'), + map_location=self.device, + ) + self._train_x = torch.load(os.path.join(load_dir, f'gp_models_train_x_{epoch}.pkl')) + self._train_y = torch.load(os.path.join(load_dir, f'gp_models_train_y_{epoch}.pkl')) + for i in range(self.n_s): + self._disturb_estimators.append( + GPyDisturbanceEstimator( + self._train_x, + self._train_y[:, i], + MAX_STD[self.env.dynamics_mode][i], + device=self.device, + ), + ) + self._disturb_estimators[i].model.load_state_dict(weights[i]) + + @property + def train_x(self) -> np.ndarray: + """Return the training data input features used for the disturbance estimators.""" + return self._train_x + + @property + def train_y(self) -> np.ndarray: + """Return the training data labels used for the disturbance estimators.""" + return self._train_y + + @property + def disturb_estimators(self) -> list[GPyDisturbanceEstimator]: + """Provide access to the list of trained disturbance estimator models.""" + return self._disturb_estimators diff --git a/omnisafe/common/statistics_tools.py b/omnisafe/common/statistics_tools.py index 3856b81a7..72e661c33 100644 --- a/omnisafe/common/statistics_tools.py +++ b/omnisafe/common/statistics_tools.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -83,6 +83,7 @@ def load_source(self, path: str) -> None: 'The config file is not found in the save directory.', ) from error + # pylint: disable-next=too-many-arguments, too-many-locals def draw_graph( self, parameter: str, @@ -91,6 +92,8 @@ def draw_graph( cost_limit: float | None = None, smooth: int = 1, show_image: bool = False, + reward_metrics: str = 'Metrics/EpRet', + cost_metrics: str = 'Metrics/EpCost', ) -> None: """Draw graph. 
@@ -102,6 +105,8 @@ def draw_graph( cost_limit (float or None, optional): The cost limit of the experiment. Defaults to None. smooth (int, optional): The smooth window size. Defaults to 1. show_image (bool): Whether to show graph image in GUI windows. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. .. note:: `values` and `compare_num` cannot be set at the same time. @@ -161,6 +166,8 @@ def draw_graph( 'mean', save_name=save_name, show_image=show_image, + reward_metrics=reward_metrics, + cost_metrics=cost_metrics, ) except Exception: # noqa # pragma: no cover # pylint: disable=broad-except print( diff --git a/omnisafe/configs/off-policy/DDPGCBF.yaml b/omnisafe/configs/off-policy/DDPGCBF.yaml new file mode 100644 index 000000000..f9d706305 --- /dev/null +++ b/omnisafe/configs/off-policy/DDPGCBF.yaml @@ -0,0 +1,170 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. 
+ device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # number of evaluate episodes + eval_episodes: 1 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of steps per sample + update_cycle: 1 + # number of iterations to update the policy + update_iters: 1 + # The size of replay buffer + size: 1000000 + # The size of batch + batch_size: 256 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: False + # critic norm coefficient + critic_norm_coeff: 0.001 + # The soft update coefficient + polyak: 0.001 + # The discount factor of GAE + gamma: 0.99 + # Actor perdorm random action before `start_learning_steps` steps + start_learning_steps: 0 + # The delay step of policy update + policy_delay: 1 + # Whether to use the exploration noise + use_exploration_noise: True + # The exploration noise + exploration_noise: 0.1 + # use cost + use_cost: False + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 20 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 10 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type + actor_type: mlp + # linear learning rate decay + linear_lr_decay: False + # Configuration of Actor network + actor: + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # The learning rate of Actor network + lr: 0.0001 + # 
Configuration of Critic network + critic: + # The number of critic networks + num_critics: 1 + # Size of hidden layers + hidden_sizes: [400, 300] + # Activation function + activation: relu + # The learning rate of Critic network + lr: 0.001 + # barrier function compensator configurations + compensator_cfgs: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # learning rate + lr: 0.01 + # number of iterations to update the compensator + update_iters: 1 + +SafetyCarCircle1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 + +SafetyCarGoal1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 + +SafetyPointCircle1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 + +SafetyPointGoal1-v0: + # model configurations + model_cfgs: + # Configuration of Actor network + actor: + # The learning rate of Actor network + lr: 0.000005 + # Configuration of Critic network + critic: + # The learning rate of Critic network + lr: 0.001 diff --git a/omnisafe/configs/off-policy/SACRCBF.yaml b/omnisafe/configs/off-policy/SACRCBF.yaml new file mode 100644 index 000000000..f70327e6d --- /dev/null +++ b/omnisafe/configs/off-policy/SACRCBF.yaml @@ -0,0 +1,134 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 4 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 200000 + # number of evaluate episodes + eval_episodes: 1 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 1000 + # number of steps per sample + update_cycle: 1 + # number of iterations to update the policy + update_iters: 1 + # size of replay buffer + size: 1000000 + # size of batch + batch_size: 256 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # max gradient norm + max_grad_norm: 40 + # use critic norm + use_critic_norm: False + # critic norm coefficient + critic_norm_coeff: 0.001 + # soft update coefficient + polyak: 0.005 + # discount factor of GAE + gamma: 0.99 + # actor perform random action before `start_learning_steps` steps + start_learning_steps: 5000 + # delay step of policy update + policy_delay: 1 + # whether to use the exploration noise + use_exploration_noise: False + # 
exploration noise + exploration_noise: 0.1 + # policy noise + policy_noise: 0.2 + # policy_noise_clip + policy_noise_clip: 0.5 + # value of alpha + alpha: 0.2 + # Whether to use auto alpha + auto_alpha: True + # use cost + use_cost: False + # control barrier function configurations + cbf_cfgs: + # gamma of control barrier certificate. + gamma_b: 20 + # confidence parameter desired + k_d: 3.0 + # environment dynamics coefficient + l_p: 0.03 + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 40 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 10 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type + actor_type: gaussian_sac + # linear learning rate decay + linear_lr_decay: False + # configuration of actor network + actor: + # size of hidden layers + hidden_sizes: [400, 300] + # activation function + activation: relu + # learning rate of actor network + lr: 0.0003 + # configuration of critic network + critic: + # number of critic networks + num_critics: 2 + # size of hidden layers + hidden_sizes: [400, 300] + # activation function + activation: relu + # learning rate of critic network + lr: 0.0003 + # dynamics model configurations + dynamics_model_cfgs: + # max number of episodes updating GP models + gp_max_episodes: 100 + # size of gp model + gp_model_size: 2000 + # whether to use the action compensator + use_compensator: False diff --git a/omnisafe/configs/on-policy/IPO.yaml b/omnisafe/configs/on-policy/IPO.yaml index 852b08344..807984252 100644 --- a/omnisafe/configs/on-policy/IPO.yaml +++ b/omnisafe/configs/on-policy/IPO.yaml @@ -33,7 +33,7 @@ defaults: # number of steps to update the policy steps_per_epoch: 20000 # number of iterations to update the policy - update_iters: 10 + 
update_iters: 40 # batch size for each iteration batch_size: 64 # target kl divergence @@ -41,9 +41,9 @@ defaults: # entropy coefficient entropy_coef: 0.0 # normalize reward - reward_normalize: True + reward_normalize: False # normalize cost - cost_normalize: True + cost_normalize: False # normalize observation obs_normalize: True # early stop when kl divergence is bigger than target kl @@ -134,3 +134,29 @@ defaults: lambda_lr: 0.035 # Type of lagrangian optimizer lambda_optimizer: "Adam" + +Pendulum-v1: + # training configurations + train_cfgs: + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 256 + # target kl divergence + target_kl: 0.005 + # normalize observation + obs_normalize: False + # reward discount factor + gamma: 0.995 + # lambda for gae + lam: 0.98 + # lagrangian configurations + lagrange_cfgs: + # Tolerance of constraint violation + cost_limit: 1000.0 diff --git a/omnisafe/configs/on-policy/PPOBetaCBF.yaml b/omnisafe/configs/on-policy/PPOBetaCBF.yaml new file mode 100644 index 000000000..afb636e8b --- /dev/null +++ b/omnisafe/configs/on-policy/PPOBetaCBF.yaml @@ -0,0 +1,120 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 64 + # target kl divergence + target_kl: 0.02 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # early stop when kl divergence is bigger than target kl + kl_early_stop: True + # use max gradient norm + use_max_grad_norm: False + # max gradient norm + max_grad_norm: 40.0 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.995 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.98 + # lambda for cost gae + lam_c: 0.95 + # clip ratio + clip: 0.2 + # advantage estimation method, options: gae, retrace + adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 100 + # save logger path + log_dir: "./runs" + # save model path + 
window_lens: 100 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type, options: gaussian, gaussian_learning + actor_type: beta + # linear learning rate decay + linear_lr_decay: True + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations + actor: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: tanh + # out_activation: tanh + # learning rate + lr: 0.0003 + critic: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: tanh + # learning rate + lr: 0.0003 diff --git a/omnisafe/configs/on-policy/TRPO.yaml b/omnisafe/configs/on-policy/TRPO.yaml index 455ba163f..ab025a391 100644 --- a/omnisafe/configs/on-policy/TRPO.yaml +++ b/omnisafe/configs/on-policy/TRPO.yaml @@ -124,3 +124,35 @@ defaults: activation: tanh # learning rate lr: 0.001 + +Pendulum-v1: + # training configurations + train_cfgs: + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # batch size for each iteration + batch_size: 256 + # target kl divergence + target_kl: 0.005 + # normalize observation + obs_normalize: False + # reward discount factor + gamma: 0.995 + # lambda for gae + lam: 0.98 + # model configurations + model_cfgs: + # actor network configurations + actor: + # activation function + activation: relu + # barrier function compensator configurations + compensator_cfgs: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu diff --git a/omnisafe/configs/on-policy/TRPOCBF.yaml b/omnisafe/configs/on-policy/TRPOCBF.yaml new file mode 100644 index 000000000..c61d3df44 --- /dev/null +++ b/omnisafe/configs/on-policy/TRPOCBF.yaml @@ -0,0 +1,140 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +defaults: + # seed for random number generator + seed: 0 + # training configurations + train_cfgs: + # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc. + device: cpu + # number of threads for torch + torch_threads: 16 + # number of vectorized environments + vector_env_nums: 1 + # number of parallel agent, similar to a3c + parallel: 1 + # total number of steps to train + total_steps: 80_000 + # algorithm configurations + algo_cfgs: + # number of steps to update the policy + steps_per_epoch: 2000 + # number of iterations to update the policy + update_iters: 10 + # batch size for each iteration + batch_size: 256 + # target kl divergence + target_kl: 0.005 + # entropy coefficient + entropy_coef: 0.0 + # normalize reward + reward_normalize: False + # normalize cost + cost_normalize: False + # normalize observation + obs_normalize: False + # early stop when kl divergence is bigger than target kl + kl_early_stop: False + # use max gradient norm + use_max_grad_norm: True + # max gradient norm + max_grad_norm: 40.0 + # use critic norm + use_critic_norm: True + # critic norm coefficient + critic_norm_coef: 0.001 + # reward discount factor + gamma: 0.995 + # cost discount factor + cost_gamma: 0.99 + # lambda for gae + lam: 0.98 + # lambda for cost gae + lam_c: 0.95 + # advantage estimation method, options: gae, retrace + 
adv_estimation_method: gae + # standardize reward advantage + standardized_rew_adv: True + # standardize cost advantage + standardized_cost_adv: True + # penalty coefficient + penalty_coef: 0.0 + # use cost + use_cost: False + # Damping value for conjugate gradient + cg_damping: 0.1 + # Number of conjugate gradient iterations + cg_iters: 15 + # Subsampled observation + fvp_obs: None + # The sub-sampling rate of the observation + fvp_sample_freq: 1 + # The max steps to update dynamics model + update_dynamics_steps: 650 + # logger configurations + logger_cfgs: + # use wandb for logging + use_wandb: False + # wandb project name + wandb_project: omnisafe + # use tensorboard for logging + use_tensorboard: True + # save model frequency + save_model_freq: 10 + # save logger path + log_dir: "./runs" + # save model path + window_lens: 100 + # model configurations + model_cfgs: + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # actor type, options: gaussian, gaussian_learning + actor_type: gaussian_learning + # linear learning rate decay + linear_lr_decay: False + # exploration noise anneal + exploration_noise_anneal: False + # std upper bound, and lower bound + std_range: [0.5, 0.1] + # actor network configurations + actor: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu + # learning rate + lr: ~ + # critic network configurations + critic: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: tanh + # learning rate + lr: 0.001 + # barrier function compensator configurations + compensator_cfgs: + # hidden layer sizes + hidden_sizes: [64, 64] + # activation function + activation: relu + # weight initialization mode + weight_initialization_mode: "kaiming_uniform" + # learning rate + lr: 0.01 + # number of iterations to update the compensator + update_iters: 1 diff --git a/omnisafe/envs/__init__.py b/omnisafe/envs/__init__.py index 4d225c61d..095a1134c 100644 --- 
a/omnisafe/envs/__init__.py +++ b/omnisafe/envs/__init__.py @@ -15,11 +15,13 @@ """Environment API for OmniSafe.""" from omnisafe.envs import classic_control +from omnisafe.envs.cbf_env import BarrierFunctionEnv from omnisafe.envs.core import CMDP, env_register, make, support_envs from omnisafe.envs.crabs_env import CRABSEnv from omnisafe.envs.custom_env import CustomEnv from omnisafe.envs.meta_drive_env import SafetyMetaDriveEnv from omnisafe.envs.mujoco_env import MujocoEnv +from omnisafe.envs.rcbf_env import RobustBarrierFunctionEnv from omnisafe.envs.safety_gymnasium_env import SafetyGymnasiumEnv from omnisafe.envs.safety_gymnasium_modelbased import SafetyGymnasiumModelBased from omnisafe.envs.safety_isaac_gym_env import SafetyIsaacGymEnv diff --git a/omnisafe/envs/cbf_env.py b/omnisafe/envs/cbf_env.py new file mode 100644 index 000000000..a46e91c94 --- /dev/null +++ b/omnisafe/envs/cbf_env.py @@ -0,0 +1,235 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Interface of control barrier function-based environments.""" + +# mypy: ignore-errors +# pylint: disable=all + +from __future__ import annotations + +from typing import Any, ClassVar + +import gymnasium +import numpy as np +import torch +from gymnasium import spaces + +from omnisafe.common.logger import Logger +from omnisafe.envs.core import CMDP, env_register +from omnisafe.typing import Box + + +@env_register +class BarrierFunctionEnv(CMDP): + """Interface of control barrier function-based environments. + + .. warning:: + Since environments based on control barrier functions require special judgment and control + of environmental dynamics, they do not support the use of vectorized environments. + + Attributes: + need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. + need_time_limit_wrapper (bool): Whether to use time limit wrapper. + """ + + need_auto_reset_wrapper = True + need_time_limit_wrapper = False + _support_envs: ClassVar[list[str]] = [ + 'Pendulum-v1', + ] + + def __init__( + self, + env_id: str, + num_envs: int = 1, + device: str = 'cpu', + **kwargs: Any, + ) -> None: + """Initialize the environment. + + Args: + env_id (str): Environment id. + num_envs (int, optional): Number of environments. Defaults to 1. + device (torch.device, optional): Device to store the data. Defaults to 'cpu'. + + Keyword Args: + render_mode (str, optional): The render mode, ranging from ``human``, ``rgb_array``, ``rgb_array_list``. + Defaults to ``rgb_array``. + camera_name (str, optional): The camera name. + camera_id (int, optional): The camera id. + width (int, optional): The width of the rendered image. Defaults to 256. + height (int, optional): The height of the rendered image. Defaults to 256. 
+ """ + super().__init__(env_id) + self._env_id = env_id + if num_envs == 1: + self._env = gymnasium.make( + id=env_id, + autoreset=False, + render_mode=kwargs.get('render_mode'), + ) + self._env_specific_setting() + assert isinstance(self._env.action_space, Box), 'Only support Box action space.' + assert isinstance( + self._env.observation_space, + Box, + ), 'Only support Box observation space.' + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + else: + raise NotImplementedError('Only support num_envs=1 now.') + self._device = torch.device(device) + self._episodic_violation: list[float] = [] + self._num_envs = num_envs + self._metadata = self._env.metadata + self.env_spec_log = {'Metrics/Max_angle_violation': 0.0} + + def _env_specific_setting(self) -> None: + """Execute some specific setting for environments. + + Some algorithms based on control barrier functions have made fine-tuning adjustments to the environment. + We have organized these adjustments and encapsulated them in this function. + """ + if self._env_id == 'Pendulum-v1': + self._env.unwrapped.max_torque = 15.0 # type: ignore + self._env.unwrapped.max_speed = 60.0 # type: ignore + self._env.unwrapped.action_space = spaces.Box( + low=-self._env.unwrapped.max_torque, # type: ignore + high=self._env.unwrapped.max_torque, # type: ignore + shape=(1,), + ) + high = np.array([1.0, 1.0, self._env.unwrapped.max_speed], dtype=np.float32) # type: ignore + self._env.unwrapped.observation_space = spaces.Box(low=-high, high=high) + self._env.dt = 0.05 # type: ignore + + def step( + self, + action: torch.Tensor, + ) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict[str, Any], + ]: + """Step the environment. + + .. note:: + + OmniSafe use auto reset wrapper to reset the environment when the episode is + terminated. So the ``obs`` will be the first observation of the next episode. 
+ And the true ``final_observation`` in ``info`` will be stored in the ``final_observation`` key of ``info``. + + Args: + action (torch.Tensor): Action to take. + + Returns: + observation: Agent's observation of the current environment. + reward: Amount of reward returned after previous action. + cost: Amount of cost returned after previous action. + terminated: Whether the episode has ended. + truncated: Whether the episode has been truncated due to a time limit. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + obs, reward, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, terminated, truncated) + ) + cost = torch.abs(torch.atan2(obs[1], obs[0])).to(self._device) + self._episodic_violation.append(cost.item()) + + if 'final_observation' in info: + info['final_observation'] = np.array( + [ + array if array is not None else np.zeros(obs.shape[-1]) + for array in info['final_observation'] + ], + ) + info['final_observation'] = torch.as_tensor( + info['final_observation'], + dtype=torch.float32, + device=self._device, + ) + + return obs, reward, cost, terminated, truncated, info + + def spec_log(self, logger: Logger) -> None: + """Log specific environment into logger. + + Max angle violation in one episode. + + .. note:: + This function will be called after each episode. + + Args: + logger (Logger): The logger to use for logging. + """ + logger.store({'Metrics/Max_angle_violation': max(self._episodic_violation)}) + self._episodic_violation = [] + + def reset( + self, + seed: int | None = None, + options: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, dict]: + """Reset the environment. + + Args: + seed (int, optional): The random seed. Defaults to None. + options (dict[str, Any], optional): The options for the environment. Defaults to None. 
+ + Returns: + observation: Agent's observation of the current environment. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + obs, info = self._env.reset(seed=seed, options=options) + if self._env_id == 'Pendulum-v1': + while self._env.unwrapped.state[0] > 1.0 or self._env.unwrapped.state[0] < -1.0: # type: ignore + obs, info = self._env.reset(options=options) + return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info + + @property + def max_episode_steps(self) -> int: + """The max steps per episode.""" + return self._env.spec.max_episode_steps + + def set_seed(self, seed: int) -> None: + """Set the seed for the environment. + + Args: + seed (int): Seed to set. + """ + self.reset(seed=seed) + + def render(self) -> Any: + """Render the environment. + + Returns: + Rendered environment. + """ + return self._env.render() + + def close(self) -> None: + """Close the environment.""" + self._env.close() + + @property + def unwrapped(self) -> gymnasium.Env: + """Return the original interface of environment.""" + return self._env.unwrapped diff --git a/omnisafe/envs/classic_control/__init__.py b/omnisafe/envs/classic_control/__init__.py index d899a41de..9c8e7b35a 100644 --- a/omnisafe/envs/classic_control/__init__.py +++ b/omnisafe/envs/classic_control/__init__.py @@ -13,4 +13,5 @@ # limitations under the License. # ============================================================================== """Environment implementations from papers.""" -from omnisafe.envs.classic_control import envs_from_crabs + +from omnisafe.envs.classic_control import envs_from_crabs, envs_from_rcbf diff --git a/omnisafe/envs/classic_control/envs_from_rcbf.py b/omnisafe/envs/classic_control/envs_from_rcbf.py new file mode 100644 index 000000000..211c8a352 --- /dev/null +++ b/omnisafe/envs/classic_control/envs_from_rcbf.py @@ -0,0 +1,189 @@ +# Copyright 2024 OmniSafe Team. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Interface of control barrier function-based environments.""" + +# mypy: ignore-errors +# pylint: disable=all + +from __future__ import annotations + +from typing import Any, Callable + +import gymnasium +import numpy as np +from gymnasium import spaces + + +class UnicycleEnv(gymnasium.Env): + """Environment from `The Soft Actor-Critic algorithm with Robust Control Barrier Function`.""" + + def __init__(self) -> None: + """Initialize the unicycle environment.""" + super().__init__() + + self.dynamics_mode = 'Unicycle' + self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,)) + self.safe_action_space = spaces.Box(low=-2.5, high=2.5, shape=(2,)) + self.observation_space = spaces.Box(low=-1e10, high=1e10, shape=(7,)) + self.bds = np.array([[-3.0, -3.0], [3.0, 3.0]]) + + self.dt = 0.02 + self.max_episode_steps = 1000 + self.reward_goal = 1.0 + self.goal_size = 0.3 + self.state = None + self.episode_step = 0 + self.initial_state = np.array( + [[-2.5, -2.5, 0.0], [-2.5, 2.5, 0.0], [-2.5, 0.0, 0.0], [2.5, -2.5, np.pi / 2]], + ) + self.goal_pos = np.array([2.5, 2.5]) + self.rand_init = False + + self.reset() + + self.get_f, self.get_g = self._get_dynamics() + self.disturb_mean = np.zeros((3,)) + self.disturb_covar = np.diag([0.005, 0.005, 0.05]) * 20 + self.hazards = [] + + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 
'location': 1.5 * np.array([0.0, 0.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, 1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([-1.0, -1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, -1.0])}, + ) + self.hazards.append( + {'type': 'circle', 'radius': 0.6, 'location': 1.5 * np.array([1.0, 1.0])}, + ) + self.viewer = None + + def step( + self, + action: np.ndarray, + ) -> tuple[np.ndarray, float, float, bool, bool, dict[str, Any]]: + """Step the environment.""" + action = np.clip(action, -1.0, 1.0) + state, reward, cost, terminated, truncated, info = self._step(action) + return self.get_obs(), reward, cost, terminated, truncated, info + + def _step(self, action: np.ndarray) -> tuple: + """The details of step dynamics.""" + self.state += self.dt * (self.get_f(self.state) + self.get_g(self.state) @ action) + self.state -= self.dt * 0.1 * self.get_g(self.state) @ np.array([np.cos(self.state[2]), 0]) + + self.episode_step += 1 + + dist_goal = self._goal_dist() + reward = self.last_goal_dist - dist_goal + self.last_goal_dist = dist_goal + terminated = False + if self.goal_met(): + reward += self.reward_goal + terminated = True + truncated = self.episode_step >= self.max_episode_steps + + cost = 0.0 + for hazard in self.hazards: + if hazard['type'] == 'circle': + cost += 0.1 * ( + np.sum((self.state[:2] - hazard['location']) ** 2) < hazard['radius'] ** 2 + ) + + return self.state, reward, cost, terminated, truncated, {} + + def goal_met(self) -> bool: + """Return whether meeting the goal.""" + return np.linalg.norm(self.state[:2] - self.goal_pos) <= self.goal_size + + def reset(self, seed: int | None = None, options: dict | None = None) -> tuple: + """Reset the environment.""" + self.episode_step = 0 + + if self.rand_init: + self.state = np.copy(self.initial_state[np.random.randint(self.initial_state.shape[0])]) 
+ else: + self.state = np.copy(self.initial_state[0]) + + self.last_goal_dist = self._goal_dist() + + return self.get_obs(), {} + + def render(self, mode: str = 'human') -> np.ndarray: + """Get the image of the running environment.""" + raise NotImplementedError + + def get_obs(self) -> np.ndarray: + """Given the state, this function returns corresponding observation. + + Returns: + Observation: np.ndarray. + """ + rel_loc = self.goal_pos - self.state[:2] + goal_dist = np.linalg.norm(rel_loc) + goal_compass = self.obs_compass() + + return np.array( + [ + self.state[0], + self.state[1], + np.cos(self.state[2]), + np.sin(self.state[2]), + goal_compass[0], + goal_compass[1], + np.exp(-goal_dist), + ], + ) + + def obs_compass(self) -> np.ndarray: + """Return a robot-centric compass observation of a list of positions.""" + vec = self.goal_pos - self.state[:2] + R = np.array( + [ + [np.cos(self.state[2]), -np.sin(self.state[2])], + [np.sin(self.state[2]), np.cos(self.state[2])], + ], + ) + vec = np.matmul(vec, R) + vec /= np.sqrt(np.sum(np.square(vec))) + 0.001 + return vec + + def _get_dynamics(self) -> tuple[Callable, Callable]: + + def get_f(state: np.ndarray) -> np.ndarray: + """Function to compute the drift dynamics 'f(x)' of the system.""" + return np.zeros(state.shape) + + def get_g(state: np.ndarray) -> np.ndarray: + """Function to compute the control dynamics 'g(x)' of the system.""" + theta = state[2] + return np.array([[np.cos(theta), 0], [np.sin(theta), 0], [0, 1.0]]) + + return get_f, get_g + + def _goal_dist(self) -> np.ndarray: + """Calculate the distance between the goal.""" + return np.linalg.norm(self.goal_pos - self.state[:2]) + + def close(self) -> None: + """Close the instance of environment.""" + if self.viewer: + self.viewer.close() + self.viewer = None diff --git a/omnisafe/envs/rcbf_env.py b/omnisafe/envs/rcbf_env.py new file mode 100644 index 000000000..983528489 --- /dev/null +++ b/omnisafe/envs/rcbf_env.py @@ -0,0 +1,173 @@ +# Copyright 2024 
OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Interface of control barrier function-based environments.""" + +# mypy: ignore-errors +# pylint: disable=all + +from __future__ import annotations + +from typing import Any, ClassVar + +import numpy as np +import torch + +from omnisafe.envs.classic_control.envs_from_rcbf import UnicycleEnv +from omnisafe.envs.core import CMDP, env_register +from omnisafe.typing import Box + + +@env_register +class RobustBarrierFunctionEnv(CMDP): + """Interface of robust control barrier function-based environments. + + .. warning:: + Since environments based on control barrier functions require special judgment and control + of environmental dynamics, they do not support the use of vectorized environments for + parallelization. + + Attributes: + need_auto_reset_wrapper (bool): Whether to use auto reset wrapper. + need_time_limit_wrapper (bool): Whether to use time limit wrapper. 
+ """ + + need_auto_reset_wrapper = True + need_time_limit_wrapper = False + _support_envs: ClassVar[list[str]] = [ + 'Unicycle', + ] + + def __init__( + self, + env_id: str, + num_envs: int = 1, + device: str = 'cpu', + **kwargs: Any, + ) -> None: + """Initialize the robust control barrier function-based environments.""" + super().__init__(env_id) + self._env_id = env_id + if num_envs == 1: + if self._env_id == 'Unicycle': + self._env = UnicycleEnv() + else: + raise NotImplementedError('Only support Unicycle now.') + assert isinstance(self._env.action_space, Box), 'Only support Box action space.' + assert isinstance( + self._env.observation_space, + Box, + ), 'Only support Box observation space.' + self._action_space = self._env.action_space + self._observation_space = self._env.observation_space + else: + raise NotImplementedError('Only support num_envs=1 now.') + self._device = torch.device(device) + + self._num_envs = num_envs + self._metadata = self._env.metadata + + def step( + self, + action: torch.Tensor, + ) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + dict[str, Any], + ]: + """Step the environment. + + .. note:: + + OmniSafe use auto reset wrapper to reset the environment when the episode is + terminated. So the ``obs`` will be the first observation of the next episode. + And the true ``final_observation`` in ``info`` will be stored in the ``final_observation`` key of ``info``. + + Args: + action (torch.Tensor): Action to take. + + Returns: + observation: Agent's observation of the current environment. + reward: Amount of reward returned after previous action. + cost: Amount of cost returned after previous action. + terminated: Whether the episode has ended. + truncated: Whether the episode has been truncated due to a time limit. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). 
+ """ + obs, reward, cost, terminated, truncated, info = self._env.step( + action.detach().cpu().numpy(), + ) + obs, reward, cost, terminated, truncated = ( + torch.as_tensor(x, dtype=torch.float32, device=self._device) + for x in (obs, reward, cost, terminated, truncated) + ) + if 'final_observation' in info: + info['final_observation'] = np.array( + [ + array if array is not None else np.zeros(obs.shape[-1]) + for array in info['final_observation'] + ], + ) + info['final_observation'] = torch.as_tensor( + info['final_observation'], + dtype=torch.float32, + device=self._device, + ) + + return obs, reward, cost, terminated, truncated, info + + def reset( + self, + seed: int | None = None, + options: dict[str, Any] | None = None, + ) -> tuple[torch.Tensor, dict]: + """Reset the environment. + + Args: + seed (int, optional): The random seed. Defaults to None. + options (dict[str, Any], optional): The options for the environment. Defaults to None. + + Returns: + observation: Agent's observation of the current environment. + info: Auxiliary diagnostic information (helpful for debugging, and sometimes learning). + """ + obs, info = self._env.reset(seed=seed, options=options) + return torch.as_tensor(obs, dtype=torch.float32, device=self._device), info + + def set_seed(self, seed: int) -> None: + """Set the seed for the environment. + + Args: + seed (int): Seed to set. + """ + self.reset(seed=seed) + + def render(self) -> Any: + """Render the environment. + + Returns: + Rendered environment. 
+ """ + return self._env.render() + + def close(self) -> None: + """Close the environment.""" + self._env.close() + + def __getattr__(self, name: str) -> Any: + """Return the unwrapped environment attributes.""" + return getattr(self._env, name) diff --git a/omnisafe/envs/safety_gymnasium_modelbased.py b/omnisafe/envs/safety_gymnasium_modelbased.py index fe5ae5071..2e1a00598 100644 --- a/omnisafe/envs/safety_gymnasium_modelbased.py +++ b/omnisafe/envs/safety_gymnasium_modelbased.py @@ -181,6 +181,8 @@ def get_cost_from_obs_tensor(self, obs: torch.Tensor, is_binary: bool = True) -> elif len(obs.shape) == 3: batch_size = obs.shape[0] * obs.shape[1] hazard_obs = obs[:, :, hazards_key].reshape(batch_size, -1, 2) + else: + raise NotImplementedError hazards_dist = torch.sqrt(torch.sum(torch.square(hazard_obs), dim=2)).reshape( batch_size, -1, @@ -497,8 +499,10 @@ def reset( self.get_lidar_from_coordinate(flat_coordinate_obs) info['obs_original'] = obs_original info['goal_met'] = False - obs = torch.as_tensor(flat_coordinate_obs, dtype=torch.float32, device=self._device) + else: + obs = torch.as_tensor(obs_original, dtype=torch.float32, device=self._device) + return obs, info def set_seed(self, seed: int) -> None: diff --git a/omnisafe/evaluator.py b/omnisafe/evaluator.py index 8732d6e34..088c8b4af 100644 --- a/omnisafe/evaluator.py +++ b/omnisafe/evaluator.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ # limitations under the License. 
# ============================================================================== """Implementation of Evaluator.""" +# mypy: ignore-errors + from __future__ import annotations @@ -37,6 +39,8 @@ SafeARCPlanner, ) from omnisafe.common import Normalizer +from omnisafe.common.barrier_comp import BarrierCompensator +from omnisafe.common.barrier_solver import PendulumSolver from omnisafe.common.control_barrier_function.crabs.models import ( AddGaussianNoise, CrabsCore, @@ -47,6 +51,9 @@ from omnisafe.common.control_barrier_function.crabs.optimizers import Barrier from omnisafe.common.control_barrier_function.crabs.utils import Normalizer as CRABSNormalizer from omnisafe.common.control_barrier_function.crabs.utils import create_model_and_trainer +from omnisafe.common.gp_model import DynamicsModel +from omnisafe.common.robust_barrier_solver import CBFQPLayer +from omnisafe.common.robust_gp_model import DynamicsModel as RoboustDynamicsModel from omnisafe.envs.core import CMDP, make from omnisafe.envs.wrapper import ActionRepeat, ActionScale, ObsNormalize, TimeLimit from omnisafe.models.actor import ActorBuilder @@ -94,6 +101,9 @@ def __init__( self._safety_obs = torch.ones(1) self._cost_count = torch.zeros(1) self.__set_render_mode(render_mode) + self._dynamics_model: DynamicsModel | RoboustDynamicsModel | None = None + self._solver: PendulumSolver | CBFQPLayer | None = None + self._compensator = None def __set_render_mode(self, render_mode: str) -> None: """Set the render mode. 
@@ -130,7 +140,7 @@ def __load_cfgs(self, save_dir: str) -> None: self._dict_cfgs = kwargs self._cfgs = Config.dict2config(kwargs) - # pylint: disable-next=too-many-branches + # pylint: disable-next=attribute-defined-outside-init,import-outside-toplevel,too-many-branches,too-many-locals def __load_model_and_env( self, save_dir: str, @@ -301,6 +311,45 @@ def __load_model_and_env( ) self._actor = actor_builder.build_actor(actor_type) self._actor.load_state_dict(model_params['pi']) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + epoch = model_name.split('.pt')[0].split('-')[-1] + self._solver = PendulumSolver(action_size=self._env.action_space.shape[0]) + path = os.path.join( + save_dir, + 'gp_model_save', + f'gaussian_process_regressor_{epoch}.pkl', + ) + self._dynamics_model = DynamicsModel( + observation_size=observation_space.shape[0], + load_dir=path, + ) + + self._compensator = BarrierCompensator( + obs_dim=observation_space.shape[0], + act_dim=action_space.shape[0], + cfgs=self._cfgs['compensator_cfgs'], + ) + model_path = os.path.join(save_dir, 'torch_save', model_name) + try: + model_params = torch.load(model_path) + except FileNotFoundError as error: + raise FileNotFoundError( + 'The model is not found in the save directory.', + ) from error + self._compensator.load_state_dict(model_params['compensator']) + if self._cfgs['algo'] == 'SACRCBF': + epoch = model_name.split('.pt')[0].split('-')[-1] + self._solver = CBFQPLayer( + env=self._env, + device=self._cfgs['train_cfgs']['device'], + gamma_b=self._cfgs['cbf_cfgs']['gamma_b'], + l_p=self._cfgs['cbf_cfgs']['l_p'], + ) + self._dynamics_model = RoboustDynamicsModel(env=self._env) + self._dynamics_model.load_disturbance_models( + load_dir=os.path.join(self._save_dir, 'gp_model_save'), + epoch=epoch, + ) if self._cfgs['algo'] in ['CRABS']: self._init_crabs(model_params) @@ -396,6 +445,7 @@ def load_saved( self.__load_model_and_env(save_dir, model_name, env_kwargs) + # pylint: 
disable-next=too-many-locals,too-many-branches def evaluate( self, num_episodes: int = 10, @@ -452,13 +502,44 @@ def evaluate( raise ValueError( 'The policy must be provided or created before evaluating the agent.', ) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + approx_compensating_act = self._compensator(obs=obs) + compensated_act_mean_raw = act + approx_compensating_act + [f, g, x, std] = self._dynamics_model.get_gp_dynamics(obs, use_prev_model=False) + compensating_act = self._solver.control_barrier( + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, + ) + act = compensated_act_mean_raw + compensating_act + + if self._cfgs['algo'] == 'SACRCBF': + state_batch = self._dynamics_model.get_state(obs) + mean_pred_batch, sigma_pred_batch = self._dynamics_model.predict_disturbance( + state_batch, + ) + safe_act = self._solver.get_safe_action( + state_batch, + act, + mean_pred_batch, + sigma_pred_batch, + ) + act = safe_act + obs, rew, cost, terminated, truncated, _ = self._env.step(act) if 'Saute' in self._cfgs['algo'] or 'Simmer' in self._cfgs['algo']: self._safety_obs -= cost.unsqueeze(-1) / self._safety_budget self._safety_obs /= self._cfgs.algo_cfgs.saute_gamma ep_ret += rew.item() - ep_cost += (cost_criteria**length) * cost.item() + + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + ep_cost = ep_cost if ep_cost > cost.item() else cost.item() + else: + ep_cost += (cost_criteria**length) * cost.item() + if ( 'EarlyTerminated' in self._cfgs['algo'] and ep_cost >= self._cfgs.algo_cfgs.cost_limit @@ -570,6 +651,36 @@ def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branc ).reshape( -1, # to make sure the shape is (act_dim,) ) + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + approx_compensating_act = self._compensator(obs=obs) + compensated_act_mean_raw = act + approx_compensating_act + [f, g, x, std] = self._dynamics_model.get_gp_dynamics( + 
obs, + use_prev_model=False, + ) + compensating_act = self._solver.control_barrier( + original_action=compensated_act_mean_raw, + f=f, + g=g, + x=x, + std=std, + ) + act = compensated_act_mean_raw + compensating_act + + if self._cfgs['algo'] == 'SACRCBF': + state_batch = self._dynamics_model.get_state(obs) + mean_pred_batch, sigma_pred_batch = ( + self._dynamics_model.predict_disturbance( + state_batch, + ) + ) + safe_act = self._solver.get_safe_action( + state_batch, + act, + mean_pred_batch, + sigma_pred_batch, + ) + act = safe_act elif self._planner is not None: act = self._planner.output_action( obs.unsqueeze(0).to('cpu'), @@ -587,7 +698,10 @@ def render( # pylint: disable=too-many-locals,too-many-arguments,too-many-branc step += 1 done = bool(terminated or truncated) ep_ret += rew.item() - ep_cost += (cost_criteria**length) * cost.item() + if self._cfgs['algo'] == 'DDPGCBF' or self._cfgs['algo'] == 'TRPOCBF': + ep_cost = ep_cost if ep_cost > cost.item() else cost.item() + else: + ep_cost += (cost_criteria**length) * cost.item() if ( 'EarlyTerminated' in self._cfgs['algo'] and ep_cost >= self._cfgs.algo_cfgs.cost_limit diff --git a/omnisafe/models/actor/actor_builder.py b/omnisafe/models/actor/actor_builder.py index cd1a0df15..3f0b3e4a6 100644 --- a/omnisafe/models/actor/actor_builder.py +++ b/omnisafe/models/actor/actor_builder.py @@ -16,6 +16,7 @@ from __future__ import annotations +from omnisafe.models.actor.beta_learning_actor import BetaLearningActor from omnisafe.models.actor.gaussian_learning_actor import GaussianLearningActor from omnisafe.models.actor.gaussian_sac_actor import GaussianSACActor from omnisafe.models.actor.mlp_actor import MLPActor @@ -60,10 +61,13 @@ def build_actor( ) -> Actor: """Build actor network. - Currently, we support the following actor types: - - ``gaussian_learning``: Gaussian actor with learnable standard deviation parameters. - - ``gaussian_sac``: Gaussian actor with learnable standard deviation network. 
- - ``mlp``: Multi-layer perceptron actor, used in ``DDPG`` and ``TD3``. + This method supports multiple actor types, each corresponding to a different class: + - `gaussian_learning`: Returns a GaussianLearningActor with learnable std deviation parameters. + - `gaussian_sac`: Returns a GaussianSACActor with a learnable std deviation network. + - `mlp`: Returns an MLPActor, commonly used in DDPG and TD3 algorithms. + - `vae`: Returns a Variational Autoencoder (VAE) actor. + - `perturbation`: Returns a PerturbationActor. + - `beta`: Returns a BetaLearningActor. Args: actor_type (ActorType): Type of actor network, e.g. ``gaussian_learning``. @@ -114,6 +118,14 @@ def build_actor( activation=self._activation, weight_initialization_mode=self._weight_initialization_mode, ) + if actor_type == 'beta': + return BetaLearningActor( + self._obs_space, + self._act_space, + self._hidden_sizes, + activation=self._activation, + weight_initialization_mode=self._weight_initialization_mode, + ) raise NotImplementedError( f'Actor type {actor_type} is not implemented! ' f'Available actor types are: gaussian_learning, gaussian_sac, mlp, vae, perturbation.', diff --git a/omnisafe/models/actor/beta_learning_actor.py b/omnisafe/models/actor/beta_learning_actor.py new file mode 100644 index 000000000..e0ee6b3e9 --- /dev/null +++ b/omnisafe/models/actor/beta_learning_actor.py @@ -0,0 +1,141 @@ +# Copyright 2023 OmniSafe Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Implementation of BetaLearningActor."""
+
+from __future__ import annotations
+
+import torch
+import torch.nn as nn
+from torch.distributions import Beta, Distribution
+
+from omnisafe.models.base import Actor
+from omnisafe.typing import Activation, InitFunction, OmnisafeSpace
+from omnisafe.utils.model import build_mlp_network
+
+
+# pylint: disable-next=too-many-instance-attributes
+class BetaLearningActor(Actor):
+    """Implementation of BetaLearningActor.
+
+    An actor that parameterizes a learnable Beta distribution over actions.
+    """
+
+    _current_dist: Beta
+
+    def __init__(
+        self,
+        obs_space: OmnisafeSpace,
+        act_space: OmnisafeSpace,
+        hidden_sizes: list[int],
+        activation: Activation = 'relu',
+        weight_initialization_mode: InitFunction = 'kaiming_uniform',
+    ) -> None:
+        """Initialize an instance of :class:`BetaLearningActor`."""
+        super().__init__(obs_space, act_space, hidden_sizes, activation, weight_initialization_mode)
+
+        self.mean: nn.Module = build_mlp_network(
+            sizes=[self._obs_dim, self._hidden_sizes[0], self._hidden_sizes[0]],
+            activation=activation,
+            output_activation='tanh',
+            weight_initialization_mode=weight_initialization_mode,
+        )
+
+        self.alpha_net: nn.Module = build_mlp_network(
+            sizes=[self._hidden_sizes[-1], self._act_dim],
+            activation='identity',
+            output_activation='softplus',
+            weight_initialization_mode=weight_initialization_mode,
+        )
+
+        self.beta_net: nn.Module = build_mlp_network(
+            sizes=[self._hidden_sizes[-1], self._act_dim],
+            activation='identity',
+            output_activation='softplus',
+            weight_initialization_mode=weight_initialization_mode,
+        )
+
+    def _distribution(self, obs: torch.Tensor) -> Beta:
+        """Get the distribution of the actor.
+
+        .. warning::
+            This method is not supposed to be called by users. You should call :meth:`forward`
+            instead.
+
+        Args:
+            obs (torch.Tensor): Observation from environments.
+
+        Returns:
+            The Beta distribution parameterized by the alpha and beta networks of the actor.
+ """ + mean = self.mean(obs) + alphas = 1.0 + self.alpha_net(mean) + betas = 1.0 + self.beta_net(mean) + return Beta(alphas, betas) + + def predict(self, obs: torch.Tensor, deterministic: bool = False) -> torch.Tensor: + """Predict the action given observation. + + The predicted action depends on the ``deterministic`` flag. + + - If ``deterministic`` is ``True``, the predicted action is the mean of the distribution. + - If ``deterministic`` is ``False``, the predicted action is sampled from the distribution. + + Args: + obs (torch.Tensor): Observation from environments. + deterministic (bool, optional): Whether to use deterministic policy. Defaults to False. + + Returns: + The mean of the distribution if deterministic is True, otherwise the sampled action. + """ + self._current_dist = self._distribution(obs) + self._after_inference = True + if deterministic: + return self._current_dist.mean + return self._current_dist.rsample() + + def forward(self, obs: torch.Tensor) -> Distribution: + """Forward method. + + Args: + obs (torch.Tensor): Observation from environments. + + Returns: + The current distribution. + """ + self._current_dist = self._distribution(obs) + self._after_inference = True + return self._current_dist + + def log_prob(self, act: torch.Tensor) -> torch.Tensor: + """Compute the log probability of the action given the current distribution. + + .. warning:: + You must call :meth:`forward` or :meth:`predict` before calling this method. + + Args: + act (torch.Tensor): Action from :meth:`predict` or :meth:`forward` . + + Returns: + Log probability of the action. 
+ """ + assert self._after_inference, 'log_prob() should be called after predict() or forward()' + self._after_inference = False + return self._current_dist.log_prob(act).sum(axis=-1) + + @property + def std(self) -> float: + """Standard deviation of the distribution.""" + return 1.0 + + @std.setter + def std(self, std: float) -> None: + pass diff --git a/omnisafe/typing.py b/omnisafe/typing.py index bf73b558f..492067e72 100644 --- a/omnisafe/typing.py +++ b/omnisafe/typing.py @@ -39,7 +39,7 @@ AdvatageEstimator = Literal['gae', 'gae-rtg', 'vtrace', 'plain'] InitFunction = Literal['kaiming_uniform', 'xavier_normal', 'glorot', 'xavier_uniform', 'orthogonal'] CriticType = Literal['v', 'q'] -ActorType = Literal['gaussian_learning', 'gaussian_sac', 'mlp', 'vae', 'perturbation'] +ActorType = Literal['gaussian_learning', 'gaussian_sac', 'mlp', 'vae', 'perturbation', 'beta'] DEVICE_CPU = torch.device('cpu') diff --git a/omnisafe/utils/plotter.py b/omnisafe/utils/plotter.py index 5bdbb7ec2..e592240be 100644 --- a/omnisafe/utils/plotter.py +++ b/omnisafe/utils/plotter.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -118,8 +118,7 @@ def plot_data( smoothed_x = np.convolve(x, y, 'same') / np.convolve(z, y, 'same') datum['Costs'] = smoothed_x - if isinstance(data, list): - data_to_plot = pd.concat(data, ignore_index=True) + data_to_plot = pd.concat(data, ignore_index=True) sns.lineplot( data=data_to_plot, x=xaxis, @@ -165,7 +164,13 @@ def plot_data( plt.tight_layout(pad=0.5) - def get_datasets(self, logdir: str, condition: str | None = None) -> list[DataFrame]: + def get_datasets( + self, + logdir: str, + condition: str | None = None, + reward_metrics: str = 'Metrics/EpRet', + cost_metrics: str = 'Metrics/EpCost', + ) -> list[DataFrame]: """Recursively look through logdir for files named "progress.txt". Assumes that any file "progress.txt" is a valid hit. @@ -173,9 +178,11 @@ def get_datasets(self, logdir: str, condition: str | None = None) -> list[DataFr Args: logdir (str): The directory to search for progress.txt files condition (str or None, optional): The condition label. Defaults to None. + reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'. + cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'. Returns: - The datasets. + list[DataFrame]: A list of DataFrame objects containing the datasets. Raise: FileNotFoundError: If the config file is not found. 
@@ -205,21 +212,21 @@ def get_datasets(self, logdir: str, condition: str | None = None) -> list[DataFr
                     self.units[condition1] += 1
                 try:
                     exp_data = pd.read_csv(os.path.join(root, 'progress.csv'))
-
                 except FileNotFoundError as error:
                     progress_path = os.path.join(root, 'progress.csv')
                     raise FileNotFoundError(f'Could not read from {progress_path}') from error
-                performance = (
-                    'Metrics/TestEpRet' if 'Metrics/TestEpRet' in exp_data else 'Metrics/EpRet'
-                )
-                cost_performance = (
-                    'Metrics/TestEpCost' if 'Metrics/TestEpCost' in exp_data else 'Metrics/EpCost'
-                )
+
+                if reward_metrics not in exp_data:
+                    raise KeyError(f'{reward_metrics} is not in data to plot!')
+
+                if cost_metrics not in exp_data:
+                    raise KeyError(f'{cost_metrics} is not in data to plot!')
+
                 exp_data.insert(len(exp_data.columns), 'Unit', unit)
                 exp_data.insert(len(exp_data.columns), 'Condition1', condition1)
                 exp_data.insert(len(exp_data.columns), 'Condition2', condition2)
-                exp_data.insert(len(exp_data.columns), 'Rewards', exp_data[performance])
-                exp_data.insert(len(exp_data.columns), 'Costs', exp_data[cost_performance])
+                exp_data.insert(len(exp_data.columns), 'Rewards', exp_data[reward_metrics])
+                exp_data.insert(len(exp_data.columns), 'Costs', exp_data[cost_metrics])
                 epoch = exp_data.get('Train/Epoch')
                 if epoch is None or steps_per_epoch is None:
                     raise ValueError('No Train/Epoch column in progress.csv')
@@ -237,6 +244,8 @@ def get_all_datasets(
         legend: list[str] | None = None,
         select: str | None = None,
         exclude: str | None = None,
+        reward_metrics: str = 'Metrics/EpRet',
+        cost_metrics: str = 'Metrics/EpCost',
     ) -> list[DataFrame]:
         """Get all the data from all the log directories.
@@ -249,6 +258,8 @@ def get_all_datasets(
             legend (list of str or None, optional): List of legend names. Defaults to None.
             select (str or None, optional): Select logdirs that contain this string. Defaults to None.
             exclude (str or None, optional): Exclude logdirs that contain this string. Defaults to None.
+            reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'.
+            cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'.
 
         Returns:
             All the data stored in a list of DataFrames.
@@ -286,13 +297,22 @@ def get_all_datasets(
         data = []
         if legend:
             for log, leg in zip(logdirs, legend):
-                data += self.get_datasets(log, leg)
+                data += self.get_datasets(
+                    log,
+                    leg,
+                    cost_metrics=cost_metrics,
+                    reward_metrics=reward_metrics,
+                )
         else:
             for log in logdirs:
-                data += self.get_datasets(log)
+                data += self.get_datasets(
+                    log,
+                    cost_metrics=cost_metrics,
+                    reward_metrics=reward_metrics,
+                )
         return data
 
-    # pylint: disable-next=too-many-arguments
+    # pylint: disable-next=too-many-arguments, too-many-locals
     def make_plots(
         self,
         all_logdirs: list[str],
@@ -309,6 +329,8 @@ def make_plots(
         save_name: str | None = None,
         save_format: str = 'png',
         show_image: bool = False,
+        reward_metrics: str = 'Metrics/EpRet',
+        cost_metrics: str = 'Metrics/EpCost',
     ) -> None:
         """Make plots from the data in the specified log directories.
@@ -356,9 +378,18 @@ def make_plots(
                 to ``png``.
             show_image (bool, optional): Optional flag. If set, the plot will be displayed on screen.
                 Defaults to ``False``.
+            reward_metrics (str, optional): The column name for reward metrics. Defaults to 'Metrics/EpRet'.
+            cost_metrics (str, optional): The column name for cost metrics. Defaults to 'Metrics/EpCost'.
         """
         assert xaxis is not None, 'Must specify xaxis'
-        data = self.get_all_datasets(all_logdirs, legend, select, exclude)
+        data = self.get_all_datasets(
+            all_logdirs,
+            legend,
+            select,
+            exclude,
+            cost_metrics=cost_metrics,
+            reward_metrics=reward_metrics,
+        )
         condition = 'Condition2' if count else 'Condition1'
         # choose what to show on main curve: mean? max? min?
estimator = getattr(np, estimator) diff --git a/omnisafe/utils/tools.py b/omnisafe/utils/tools.py index 2c0c626eb..d5be5369d 100644 --- a/omnisafe/utils/tools.py +++ b/omnisafe/utils/tools.py @@ -356,3 +356,40 @@ def get_device(device: torch.device | str | int = DEVICE_CPU) -> torch.device: return torch.device('cpu') return device + + +def to_tensor( + x: np.ndarray, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool = False, +) -> torch.Tensor: + """Convert a numpy array to a torch tensor of specified type and device. + + Args: + x (np.ndarray): A numpy array to be converted. + dtype (torch.dtype): The desired data type for the tensor. + device (torch.device): The device to store the tensor on. + requires_grad (bool): If True, gradients will be computed for operations involving this tensor. + + Returns: + torch.Tensor: A torch tensor representation of the input array. + """ + return torch.from_numpy(x).type(dtype).to(device).requires_grad_(requires_grad) + + +def sort_vertices_cclockwise(vertices: np.ndarray) -> np.ndarray: + """Sort vertices of a 2D convex polygon in counter-clockwise direction. + + Args: + vertices (np.ndarray): An array of shape (n_v, 2) where n_v is the number of vertices. + + Returns: + np.ndarray: An array of vertices sorted in counter-clockwise direction. + """ + assert vertices.shape[1] == 2, f'Vertices must each have dimension 2, got {vertices.shape[1]}' + polygon_center = vertices.sum(axis=0, keepdims=True) / vertices.shape[0] # (1, d) + rel_vecs = vertices - polygon_center + thetas = np.arctan2(rel_vecs[:, 1], rel_vecs[:, 0]) + idxs = np.argsort(thetas) + return vertices[idxs, :] diff --git a/omnisafe/version.py b/omnisafe/version.py index 0295dccbf..bb545ba26 100644 --- a/omnisafe/version.py +++ b/omnisafe/version.py @@ -1,4 +1,4 @@ -# Copyright 2023 OmniSafe Team. All Rights Reserved. +# Copyright 2024 OmniSafe Team. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,8 +25,8 @@ try: prefix, sep, suffix = ( - subprocess.check_output( - ['git', 'describe', '--abbrev=7'], # noqa: S603,S607 + subprocess.check_output( # noqa: S603 + ['git', 'describe', '--abbrev=7'], # noqa: S607 cwd=os.path.dirname(os.path.abspath(__file__)), stderr=subprocess.DEVNULL, text=True, diff --git a/pyproject.toml b/pyproject.toml index a74b46723..d7351aeb5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,11 @@ dependencies = [ "matplotlib >= 3.7.1", "gdown >= 4.6.0", "pytorch_lightning >= 2.2.2", + "cvxopt== 1.3.2", + "gpytorch== 1.11", + "joblib == 1.3.2", + "qpth == 0.0.16", + "scikit_learn == 1.3.2" ] dynamic = ["version", "entry-points"] @@ -125,9 +130,8 @@ ignore-words = "docs/source/spelling_wordlist.txt" # Sync with requires-python target-version = "py38" line-length = 100 -show-source = true src = ["omnisafe", "tests", "examples"] -select = [ +lint.select = [ "E", "W", # pycodestyle "F", # pyflakes "UP", # pyupgrade @@ -148,7 +152,7 @@ select = [ "TID", # flake8-tidy-imports "RUF", # ruff ] -ignore = [ +lint.ignore = [ # E501: line too long # W505: doc line too long # too long docstring due to long example blocks @@ -167,9 +171,9 @@ ignore = [ # use alias for import convention (e.g., `import torch.nn as nn`) "PLR0402", ] -typing-modules = ["omnisafe.typing"] +lint.typing-modules = ["omnisafe.typing"] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = [ "F401", # unused-import ] @@ -231,15 +235,15 @@ typing-modules = ["omnisafe.typing"] "ANN003", # Missing type annotation ] -[tool.ruff.flake8-annotations] +[tool.ruff.lint.flake8-annotations] allow-star-arg-any = true -[tool.ruff.flake8-quotes] +[tool.ruff.lint.flake8-quotes] docstring-quotes = "double" multiline-quotes = "double" inline-quotes = "single" -[tool.ruff.flake8-tidy-imports] 
+[tool.ruff.lint.flake8-tidy-imports] ban-relative-imports = "all" [tool.pytest.ini_options] diff --git a/requirements.txt b/requirements.txt index 0abf5e41a..03fec36c3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,8 @@ seaborn >= 0.12.2 pandas >= 1.5.3 matplotlib >= 3.7.1 gdown >= 4.6.0 +cvxopt==1.3.2 +gpytorch==1.11 +joblib==1.3.2 +qpth==0.0.16 +scikit_learn==1.3.2 diff --git a/tests/test_buffer.py b/tests/test_buffer.py index 0fee90a46..b284b9e10 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -79,7 +79,7 @@ def test_vector_onpolicy_buffer( assert ( vector_buffer.standardized_adv_r == standardized_adv_r ), f'vector_buffer.sstandardized_adv_r is {vector_buffer.sstandardized_adv_r}' - assert vector_buffer.buffers is not [], f'vector_buffer.buffers is {vector_buffer.buffers}' + assert vector_buffer.buffers != [], f'vector_buffer.buffers is {vector_buffer.buffers}' # checking the store function obs_dim = obs_space.shape[0] diff --git a/tests/test_policy.py b/tests/test_policy.py index 79810d0b9..21ed70782 100644 --- a/tests/test_policy.py +++ b/tests/test_policy.py @@ -38,6 +38,8 @@ pid_lagrange_policy = ['TRPOPID', 'CPPOPID'] early_terminated_policy = ['TRPOEarlyTerminated', 'PPOEarlyTerminated'] offline_policy = ['BCQ', 'BCQLag', 'CRR', 'CCRR', 'VAEBC'] +cbf_policy = ['TRPOCBF', 'DDPGCBF', 'PPOBetaCBF'] +auto_alpha = [True, False] model_cfgs = { 'linear_lr_decay': True, @@ -52,6 +54,53 @@ optim_case = [0, 1, 2, 3, 4] +@helpers.parametrize(algo=cbf_policy) +def test_cbf(algo): + env_id = 'Pendulum-v1' + + custom_cfgs = { + 'train_cfgs': { + 'total_steps': 200, + 'vector_env_nums': 1, + 'torch_threads': 4, + }, + 'algo_cfgs': { + 'steps_per_epoch': 200, + }, + 'logger_cfgs': { + 'use_wandb': False, + 'save_model_freq': 1, + }, + } + agent = omnisafe.Agent(algo, env_id, custom_cfgs=custom_cfgs) + agent.learn() + + +@helpers.parametrize(auto_alpha=auto_alpha) +def test_rcbf(auto_alpha): + env_id = 'Unicycle' + + custom_cfgs = { + 
'train_cfgs': { + 'total_steps': 1000, + 'vector_env_nums': 1, + 'torch_threads': 4, + }, + 'algo_cfgs': { + 'start_learning_steps': 998, + 'update_iters': 1, + 'auto_alpha': auto_alpha, + }, + 'logger_cfgs': { + 'use_wandb': False, + 'save_model_freq': 1, + }, + } + agent = omnisafe.Agent('SACRCBF', env_id, custom_cfgs=custom_cfgs) + agent.learn() + agent.evaluate(num_episodes=1) + + @helpers.parametrize(optim_case=optim_case) def test_cpo(optim_case): agent = omnisafe.Agent('CPO', 'Test-v0', custom_cfgs={}) @@ -337,9 +386,6 @@ def test_off_lag_policy(algo): agent.learn() -auto_alpha = [True, False] - - @helpers.parametrize(auto_alpha=auto_alpha) def test_sac_policy(auto_alpha): """Test sac algorithms."""