Source code for pokebattle_rl_env.pokebattle_env

from math import exp

import numpy as np
from gym import Env
from gym.envs.registration import EnvSpec
from gym.spaces import Box

from pokebattle_rl_env.showdown_simulator import ShowdownSimulator

TURN_THRESHOLD = 10


def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)


def sigmoid(x):
    return 1 / (1 + exp(-x))


class PokeBattleEnv(Env):
    """The Pokemon battle Reinforcement Learning environment.

    A subclass of :class:`gym.core.Env`, which is compatible with most Reinforcement Learning
    frameworks. :class:`PokeBattleEnv` uses a :class:`pokebattle_rl_env.battle_simulator.BattleSimulator`
    to simulate the battles.

    Attributes:
        simulator (:class:`pokebattle_rl_env.battle_simulator.BattleSimulator`): The simulator to run
            battles in. Uses :class:`pokebattle_rl_env.showdown_simulator.ShowdownSimulator` by default.
    """

    def __init__(self, simulator=None):
        self.__version__ = "0.1.0"
        self._spec = EnvSpec('PokeBattleEnv-v0')
        # Avoid instantiating the default simulator in the argument list, which would create it at
        # class definition time and share it across instances.
        self.simulator = simulator if simulator is not None else ShowdownSimulator()
        num_actions = len(self.simulator.get_available_actions()) + len(self.simulator.get_available_modifiers())
        self.action_space = Box(low=0.0, high=1.0, shape=(num_actions,), dtype=np.float32)
        state_dimensions = len(self.simulator.state.to_array())
        self.observation_space = Box(low=0, high=1000, shape=(state_dimensions,), dtype=np.float32)
        self.reward_range = (-1, 1)
        self.metadata['render.modes'] = ['human']
        self.metadata['semantics.autoreset'] = False

    def get_action(self, action_probs):
        valid_actions = self.simulator.get_available_actions()
        if len(valid_actions) == 0:
            # No legal action available: dump the current state for later debugging.
            from pickle import dump
            from pokebattle_rl_env.util import generate_token
            with open(generate_token(5), 'wb') as file:
                dump(self.simulator.state, file)
        estimates = []
        for valid_action in valid_actions:
            if valid_action.mode == 'attack':
                action_ix = valid_action.number - 1  # attacks occupy the first action indices
            elif valid_action.mode == 'switch':
                action_ix = valid_action.number + 2  # switches occupy the indices after the attacks
            else:
                continue
            estimates.append(action_probs[action_ix])
        estimates = softmax(estimates)
        action = np.random.choice(valid_actions, p=estimates)
        return action

    def get_action_modifier(self, action_probs):
        valid_modifiers = self.simulator.get_available_modifiers()
        modifiers = []
        for valid_modifier in valid_modifiers:
            prob = 0
            if valid_modifier == 'mega':
                prob = action_probs[-1]  # the last entry encodes the mega evolution modifier
            prob = sigmoid(prob)
            if np.random.binomial(1, prob):
                modifiers.append(valid_modifier)
        return modifiers

    def compute_reward(self):
        if not (self.simulator.state.forfeited and self.simulator.state.turn < TURN_THRESHOLD):
            if self.simulator.state.state == 'win':
                return 1
            elif self.simulator.state.state == 'loss':
                return -1
        return 0
    def step(self, action):
        game_action = self.get_action(action)
        modifiers = self.get_action_modifier(action)
        self.simulator.act(game_action, modifiers)
        observation = self.simulator.state.to_array()
        reward = self.compute_reward()  # ToDo: Maybe negative reward for assigning probability to invalid action
        done = self.simulator.state.state in ['win', 'loss', 'tie']
        return observation, reward, done, {}
    def reset(self):
        self.simulator.reset()
        return self.simulator.state.to_array()
    def render(self, mode='human'):
        if mode == 'rgb_array':
            raise NotImplementedError('rendering rgb_arrays not yet implemented')
        if mode == 'human':
            self.simulator.render()
        else:
            super().render(mode=mode)
    def close(self):
        self.simulator.close()
    def seed(self, seed=None):
        pass
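

# Usage sketch (a minimal example, not authoritative): roll out one battle with a random policy.
# It assumes the default ShowdownSimulator backend, which requires a reachable Pokemon Showdown
# server; a trained agent would supply its own action vector instead of action_space.sample().
if __name__ == '__main__':
    env = PokeBattleEnv()
    observation = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()  # stand-in for a policy's action probabilities
        observation, reward, done, info = env.step(action)
    env.close()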