Source code for src.Environments.single_agent.TargetRegime

from typing import Any

from numpy.linalg import norm

from src.Environments.single_agent.BaseEnvironment import BaseRegime


class TargetRegime(BaseRegime):
    """
    A regime that defines the reward, metrics and termination logic for a
    drone in a target-reaching scenario. It extends BaseRegime with reward and
    penalty terms for distance to the target, reaching the target, velocity,
    elapsed time and hitting the ground.

    :param tolerance_distance: Distance below which the drone is considered to
        have reached the target (strict ``<`` comparison).
    :param max_time: Simulation time after which the episode is truncated.
    :param reward_distance_coefficient: Numerator of the inverse-distance
        reward term.
    :param reward_distance_exp: Exponent applied to
        ``reward_distance_coefficient / distance``.
    :param reward_distance_max: Upper cap for the distance-based reward; also
        the value returned when the distance is exactly zero.
    :param reward_goal: Reward granted while the target counts as reached.
    :param reward_velocity_coefficient: Multiplier for the velocity-based
        reward.
    :param reward_velocity_exp: Exponent applied to the velocity magnitude.
    :param reward_velocity_max: Upper cap for the velocity-based reward.
    :param penalty_time: Penalty subtracted every time the reward is evaluated.
    :param penalty_crash: Penalty subtracted when the drone has hit the ground.
    :param kwargs: Additional keyword arguments passed to BaseRegime.
    """

    def __init__(self,
                 tolerance_distance: float,
                 max_time: float,
                 reward_distance_coefficient: float,
                 reward_distance_exp: float,
                 reward_distance_max: float,
                 reward_goal: float,
                 reward_velocity_coefficient: float,
                 reward_velocity_exp: float,
                 reward_velocity_max: float,
                 penalty_time: float,
                 penalty_crash: float,
                 **kwargs):
        """
        Initialize the regime. Every numeric argument is coerced with
        ``float()`` and stored on a private attribute.

        :param tolerance_distance: Distance threshold for reaching the target.
        :param max_time: Time limit after which the episode is truncated.
        :param reward_distance_coefficient: Numerator of the distance reward.
        :param reward_distance_exp: Exponent of the distance reward.
        :param reward_distance_max: Cap for the distance reward.
        :param reward_goal: Reward for reaching the target.
        :param reward_velocity_coefficient: Multiplier of the velocity reward.
        :param reward_velocity_exp: Exponent of the velocity reward.
        :param reward_velocity_max: Cap for the velocity reward.
        :param penalty_time: Time penalty per reward evaluation.
        :param penalty_crash: Penalty for hitting the ground.
        :param kwargs: Additional arguments passed to the BaseRegime
            initializer.
        """
        super().__init__(**kwargs)
        self._tolerance_distance = float(tolerance_distance)
        self._max_time = float(max_time)
        self._reward_distance_coefficient = float(reward_distance_coefficient)
        self._reward_distance_exp = float(reward_distance_exp)
        self._reward_distance_max = float(reward_distance_max)
        self._reward_goal = float(reward_goal)
        self._reward_velocity_coefficient = float(reward_velocity_coefficient)
        self._reward_velocity_exp = float(reward_velocity_exp)
        self._reward_velocity_max = float(reward_velocity_max)
        self._penalty_time = float(penalty_time)
        self._penalty_crash = float(penalty_crash)

    @property
    def reward_distance(self) -> float:
        """
        Calculate the distance-based reward
        ``(reward_distance_coefficient / distance) ** reward_distance_exp``,
        capped at ``reward_distance_max``.

        :return: The capped distance-based reward; ``reward_distance_max``
            when the distance to the target is exactly zero.
        """
        distance = norm(self.drone_target_vector)
        # norm() yields a NumPy scalar: dividing by a NumPy zero produces
        # inf/nan plus a RuntimeWarning instead of raising ZeroDivisionError,
        # so the zero-distance case has to be guarded explicitly.
        if distance == 0:
            return self._reward_distance_max
        scaled = ((self._reward_distance_coefficient / distance)
                  ** self._reward_distance_exp)
        return float(min(scaled, self._reward_distance_max))

    @property
    def reward_time(self) -> float:
        """
        Calculate the constant time-based penalty.

        :return: ``-penalty_time``.
        """
        return -self._penalty_time

    @property
    def reward_crash(self) -> float:
        """
        Calculate the crash penalty, applied only if the drone hit the ground.

        :return: ``-penalty_crash`` if the drone hit the ground; otherwise
            zero.
        """
        return -self._penalty_crash if self.drone_hit_ground else 0.

    @property
    def reward_goal(self) -> float:
        """
        Calculate the reward for reaching the goal, i.e. when the drone's
        distance to the target is less than `tolerance_distance`.

        :return: ``reward_goal`` if the goal is reached; otherwise zero.
        """
        return self._reward_goal if self.goal_reached else 0.

    @property
    def reward_velocity(self) -> float:
        """
        Calculate the velocity-based reward
        ``reward_velocity_coefficient * |velocity| ** reward_velocity_exp``,
        capped at ``reward_velocity_max``.

        :return: The capped velocity-based reward.
        """
        speed = norm(self.drone.velocity)
        scaled = self._reward_velocity_coefficient * speed ** self._reward_velocity_exp
        # Apply the documented upper limit so the term cannot grow unbounded.
        return float(min(scaled, self._reward_velocity_max))

    @property
    def reward(self) -> float:
        """
        Aggregate the total reward: the sum of the distance, goal, velocity,
        time and crash terms.

        :return: The total reward for the current state.
        """
        # reward_time and reward_crash are already negative (or zero); the
        # crash term only contributes once the drone has hit the ground.
        return (self.reward_distance
                + self.reward_goal
                + self.reward_velocity
                + self.reward_time
                + self.reward_crash)

    @property
    def metrics(self) -> dict[str, Any]:
        """
        Collect metrics describing the current state.

        :return: A dictionary with the distance to the target, the
            goal-reached flag, the velocity magnitude, the hit-ground flag and
            the simulation time.
        """
        return {
            "distance": norm(self.drone_target_vector),
            "goal_reached": self.goal_reached,
            "goal_velocity": norm(self.drone.velocity),
            "hit_ground": self.drone_hit_ground,
            "time": self.data.time
        }

    @property
    def done(self) -> bool:
        """
        Determine whether the episode has concluded: the drone hit the
        ground, the episode was truncated, or the goal was reached.

        :return: True if the episode is done; otherwise, False.
        """
        return bool(self.drone_hit_ground or self.truncated or self.goal_reached)

    @property
    def truncated(self) -> bool:
        """
        Check whether the simulation time exceeds `max_time`.

        :return: True if ``data.time > max_time``; otherwise, False.
        """
        return bool(self.data.time > self._max_time)

    @property
    def goal_reached(self) -> bool:
        """
        Determine whether the drone has reached the target, i.e. its distance
        to the target is strictly less than `tolerance_distance`.

        :return: True if the drone has reached the target; otherwise, False.
        """
        return bool(norm(self.drone_target_vector) < self._tolerance_distance)