【強化学習#7】モンテカルロ法
記事の目的
YouTubeの「【強化学習#7】モンテカルロ法」で解説した内容のコードです。
目次
1 環境とエージェント
import numpy as np import matplotlib.pyplot as plt import seaborn as sns np.random.seed(1)
class Environment:
    """Square grid world with the goal in the corner cell (size-1, size-1).

    States are (x, y) tuples.  "Lucky" cells teleport the agent to the
    goal with probability 0.8.  Every transition yields reward -1 except
    the one that enters the goal, which yields 0.
    """

    def __init__(self, size=3, lucky=None):
        """
        Args:
            size: width/height of the square grid.
            lucky: optional iterable of (x, y) cells with the teleport bonus.
        """
        self.size = size
        # Fix: the original used a mutable default argument (lucky=[]),
        # which is shared across instances; copy defensively instead.
        self.lucky = list(lucky) if lucky is not None else []
        self.goal = (size - 1, size - 1)
        self.states = [(x, y) for x in range(size) for y in range(size)]
        # State-value table, filled in during training.
        self.value = {s: 0 for s in self.states}

    def next_state(self, s, a):
        """Return the successor of state s under action a.

        The goal is absorbing; off-grid moves leave the agent in place;
        a lucky destination jumps straight to the goal 80% of the time.
        """
        if s == self.goal:
            return s
        s_next = (s[0] + a[0], s[1] + a[1])
        if s_next not in self.states:
            return s
        if s_next in self.lucky and np.random.random() < 0.8:
            return self.goal
        return s_next

    def reward(self, s, s_next):
        """-1 per step; 0 only for the step that first reaches the goal."""
        if s == self.goal:
            return -1
        return 0 if s_next == self.goal else -1
class Agent():
    """Tabular agent holding a deterministic policy and a Q-table."""

    def __init__(self, environment):
        # Action set: left, down, right, up (as (dx, dy) offsets).
        self.actions = [(-1, 0), (0, -1), (1, 0), (0, 1)]
        self.environment = environment
        # Initial policy: choose randomly between actions 2 and 3
        # (right/up) — presumably so every rollout drifts toward the
        # goal at (size-1, size-1).
        self.policy = {s: self.actions[np.random.randint(2, 4)]
                       for s in self.environment.states}
        # Pessimistic initial action-values.
        self.q = {(s, a): -100
                  for s in self.environment.states
                  for a in self.actions}

    def action(self, s, a):
        """Take action a in state s; return (reward, next_state)."""
        s_next = self.environment.next_state(s, a)
        return self.environment.reward(s, s_next), s_next
2 モンテカルロ法
def get_episode(agent, epsilon=0.1):
    """Roll out one episode with the agent's epsilon-greedy behavior.

    Returns a list of (state, action, reward) triples ending with the
    transition that reaches the goal.
    """
    env = agent.environment
    # Random start anywhere except the last state in the list (which is
    # the goal, given how Environment builds its state list).
    s = env.states[np.random.randint(env.size**2 - 1)]
    episode = []
    steps = 0
    while True:
        if np.random.random() < epsilon:
            # Explore. NOTE(review): only actions 2..3 (right/up) are
            # sampled here, mirroring the policy initialization.
            a = agent.actions[np.random.randint(2, 4)]
        else:
            a = agent.policy[s]
        r, s_next = agent.action(s, a)
        episode.append((s, a, r))
        if s_next == env.goal:
            return episode
        s = s_next
        steps += 1
def policy(agent, s):
    """Greedy policy improvement for state s.

    Sets agent.policy[s] to the action with the highest Q-value and
    returns that maximum Q-value.

    Fix: the original initialized q_max to -100, which is exactly the
    Q-table's initial value, so a state whose actions were all still at
    -100 got agent.policy[s] = None — later a KeyError in show_policy.
    Starting from -inf guarantees an action is always selected.
    """
    q_max = -float('inf')
    a_max = None
    for a in agent.actions:
        if agent.q[(s, a)] > q_max:
            q_max = agent.q[(s, a)]
            a_max = a
    agent.policy[s] = a_max
    return q_max
def train(agent, epsilon=0.1, alpha=0.2, num=100):
    """Monte Carlo control with constant step size.

    For each of `num` episodes, walk the trajectory backwards
    accumulating the return, update Q with a constant-alpha increment,
    and greedily improve the policy at each visited state.  The backup
    stops right after updating the first state-action pair whose action
    disagrees with the current policy (only the greedy tail of the
    episode is trusted).  Finally plots values and policy.
    """
    for _ in range(num):
        trajectory = get_episode(agent, epsilon)
        ret = 0
        for (s, a, r) in reversed(trajectory):
            # Check agreement BEFORE this step's policy improvement.
            off_policy = (a != agent.policy[s])
            ret += r
            agent.q[(s, a)] += alpha * (ret - agent.q[(s, a)])
            agent.environment.value[s] = policy(agent, s)
            if off_policy:
                break
    show_values(agent)
    show_policy(agent)
3 可視化用関数
def show_maze(environment):
    """Render the grid world: cell indices, lucky cells shaded green."""
    size = environment.size
    plt.figure(figsize=(3, 3))
    # Four border segments: (x-pair, y-pair) endpoints.
    for x0, x1, y0, y1 in [(-0.5, -0.5, -0.5, size - 0.5),
                           (-0.5, size - 0.5, size - 0.5, size - 0.5),
                           (size - 0.5, -0.5, -0.5, -0.5),
                           (size - 0.5, size - 0.5, size - 0.5, -0.5)]:
        plt.plot([x0, x1], [y0, y1], color='k')
    for i in range(size):
        for j in range(size):
            # Label each cell with its linear index (column-major here).
            plt.text(i, j, "{}".format(i + size * j),
                     size=20, ha="center", va="center")
            if (i, j) in environment.lucky:
                xs = np.array([i - 0.5, i - 0.5, i + 0.5, i + 0.5])
                ys = np.array([j - 0.5, j + 0.5, j + 0.5, j - 0.5])
                plt.fill(xs, ys, color="lightgreen")
    plt.axis("off")
def show_values(agent):
    """Heatmap of the state-value estimates (y-axis flipped so the
    origin is at the bottom-left, matching the maze drawing)."""
    plt.figure(figsize=(3, 3))
    size = agent.environment.size
    grid = np.zeros([size, size])
    for (x, y) in agent.environment.states:
        # Rows index y, columns index x.
        grid[y][x] = agent.environment.value[(x, y)]
    ax = sns.heatmap(grid, square=True, cbar=False, annot=True,
                     fmt='3.2f', cmap='autumn_r')
    ax.invert_yaxis()
    plt.axis("off")
def show_policy(agent):
    """Draw the greedy policy as arrows; the goal cell stays blank."""
    size = agent.environment.size
    plt.figure(figsize=(3, 3))
    # Four border segments: (x-pair, y-pair) endpoints.
    for x0, x1, y0, y1 in [(-0.5, -0.5, -0.5, size - 0.5),
                           (-0.5, size - 0.5, size - 0.5, size - 0.5),
                           (size - 0.5, -0.5, -0.5, -0.5),
                           (size - 0.5, size - 0.5, size - 0.5, -0.5)]:
        plt.plot([x0, x1], [y0, y1], color='k')
    for i in range(size):
        for j in range(size):
            if (i, j) in agent.environment.lucky:
                xs = np.array([i - 0.5, i - 0.5, i + 0.5, i + 0.5])
                ys = np.array([j - 0.5, j + 0.5, j + 0.5, j - 0.5])
                plt.fill(xs, ys, color="lightgreen")
    # Map each action offset to an arrow rotation in degrees.
    rotation = {(-1, 0): 180, (0, 1): 90, (1, 0): 0, (0, -1): 270}
    for s in agent.environment.states:
        if s == agent.environment.goal:
            continue
        direction = rotation[agent.policy[s]]
        plt.text(s[0], s[1], ' ', bbox=dict(boxstyle='rarrow'), size=8,
                 ha='center', va='center', rotation=direction)
    plt.axis("off")
4 シミュレーション
# Build a 4x4 grid world with two "lucky" teleport cells, attach a fresh
# agent, and visualize the maze layout.
env1 = Environment(size=4, lucky=[(1,2), (2,3)])
agent1 = Agent(env1)
show_maze(env1)
# Run Monte Carlo control for 10000 episodes, then plot the learned
# state values and the greedy policy.
train(agent1, num=10000)