
[Reinforcement Learning #7] Monte Carlo Method
Purpose of this article
This article contains the code for the material explained in the YouTube video 「【強化学習#7】モンテカルロ法」 ("[Reinforcement Learning #7] Monte Carlo Method").
Table of Contents
1 Environment and Agent
2 Monte Carlo Method
3 Visualization Functions
4 Simulation
1 Environment and Agent
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(1)
class Environment:

    def __init__(self, size=3, lucky=[]):
        self.size = size
        self.lucky = lucky
        self.goal = (size-1, size-1)
        self.states = [(x, y) for x in range(size) for y in range(size)]

        # State values, initialized to 0.
        self.value = {}
        for s in self.states:
            self.value[s] = 0

    def next_state(self, s, a):
        s_next = (s[0] + a[0], s[1] + a[1])

        # The goal is absorbing, and a move off the grid leaves the state unchanged.
        if s == self.goal:
            return s
        if s_next not in self.states:
            return s

        # From a lucky square, the agent jumps straight to the goal with probability 0.8.
        if s_next in self.lucky:
            if np.random.random() < 0.8:
                return self.goal
            else:
                return s_next

        return s_next

    def reward(self, s, s_next):
        # Reward is 0 on the step that reaches the goal, and -1 otherwise.
        if s == self.goal:
            return -1
        if s_next == self.goal:
            return 0
        return -1
class Agent():

    def __init__(self, environment):
        self.actions = [(-1, 0), (0, -1), (1, 0), (0, 1)]
        self.environment = environment

        # Initial policy: choose randomly between the actions at indices 2 and 3
        # (right / up), which always move toward the goal.
        self.policy = {}
        for s in self.environment.states:
            self.policy[s] = self.actions[np.random.randint(2, 4)]

        # Action values, initialized to a large negative number.
        self.q = {}
        for s in self.environment.states:
            for a in self.actions:
                self.q[(s, a)] = -100

    def action(self, s, a):
        # One interaction step: transition in the environment and observe the reward.
        s_next = self.environment.next_state(s, a)
        r = self.environment.reward(s, s_next)
        return r, s_next
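As a quick check of how the two classes interact, the following minimal sketch (the names env_demo and agent_demo are placeholders added here, not part of the code above) performs a single environment step:

env_demo = Environment(size=3, lucky=[(1, 1)])
agent_demo = Agent(env_demo)

s = (0, 0)
a = agent_demo.policy[s]              # action chosen by the current (randomly initialized) policy
r, s_next = agent_demo.action(s, a)   # reward and next state returned by the environment
print(s, a, r, s_next)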
2 Monte Carlo Method
def get_episode(agent, epsilon=0.1):
    # Start from a random state other than the goal (the goal is the last element of states).
    s = agent.environment.states[np.random.randint(agent.environment.size**2-1)]

    episode = []
    n = 0
    while True:
        # epsilon-greedy behavior: explore with probability epsilon, otherwise follow the policy.
        if np.random.random() < epsilon:
            a = agent.actions[np.random.randint(2, 4)]
        else:
            a = agent.policy[s]

        r, s_next = agent.action(s, a)
        episode.append((s, a, r))

        if s_next == agent.environment.goal:
            break

        s = s_next
        n += 1

    return episode
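To see what one sampled trajectory looks like, get_episode can be called directly; each element of the returned list is an (s, a, r) tuple, and the episode ends on the step that reaches the goal. A small sketch, reusing the placeholder agent_demo from above:

episode_demo = get_episode(agent_demo, epsilon=0.1)
for s, a, r in episode_demo:
    print(s, a, r)
print("episode length:", len(episode_demo))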
def policy(agent, s):
    # Greedy policy improvement: set the policy at s to the action with the largest Q value.
    q_max = -100
    a_max = None
    for a in agent.actions:
        if agent.q[(s, a)] > q_max:
            q_max = agent.q[(s, a)]
            a_max = a

    agent.policy[s] = a_max
    return q_max
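In equation form, this function carries out the greedy improvement step

$$\pi(s) \leftarrow \operatorname{arg\,max}_{a} Q(s, a)$$

and returns $\max_a Q(s, a)$, which is stored as the state value in train below.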
def train(agent, epsilon=0.1, alpha=0.2, num=100):
    for _ in range(num):
        # Sample one episode and process it backwards, accumulating the return.
        episode = get_episode(agent, epsilon)
        episode.reverse()

        r_sum = 0
        last = False
        for (s, a, r) in episode:
            # Stop updating once the taken action no longer matches the current greedy policy.
            if a != agent.policy[s]:
                last = True

            r_sum += r
            agent.q[(s, a)] += alpha*(r_sum - agent.q[(s, a)])
            agent.environment.value[s] = policy(agent, s)

            if last:
                break

    show_values(agent)
    show_policy(agent)
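The update inside train is the constant step-size (incremental) Monte Carlo estimate of the action value, where r_sum plays the role of the return G obtained from the sampled episode:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \bigl( G - Q(s, a) \bigr)$$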
3 Visualization Functions
def show_maze(environment):
    size = environment.size
    fig = plt.figure(figsize=(3, 3))

    plt.plot([-0.5, -0.5], [-0.5, size-0.5], color='k')
    plt.plot([-0.5, size-0.5], [size-0.5, size-0.5], color='k')
    plt.plot([size-0.5, -0.5], [-0.5, -0.5], color='k')
    plt.plot([size-0.5, size-0.5], [size-0.5, -0.5], color='k')

    for i in range(size):
        for j in range(size):
            plt.text(i, j, "{}".format(i+size*j), size=20, ha="center", va="center")
            if (i, j) in environment.lucky:
                x = np.array([i-0.5, i-0.5, i+0.5, i+0.5])
                y = np.array([j-0.5, j+0.5, j+0.5, j-0.5])
                plt.fill(x, y, color="lightgreen")

    plt.axis("off")
def show_values(agent):
    fig = plt.figure(figsize=(3, 3))
    result = np.zeros([agent.environment.size, agent.environment.size])
    for (x, y) in agent.environment.states:
        result[y][x] = agent.environment.value[(x, y)]
    sns.heatmap(result, square=True, cbar=False, annot=True, fmt='3.2f', cmap='autumn_r').invert_yaxis()
    plt.axis("off")
def show_policy(agent):
    size = agent.environment.size
    fig = plt.figure(figsize=(3, 3))

    plt.plot([-0.5, -0.5], [-0.5, size-0.5], color='k')
    plt.plot([-0.5, size-0.5], [size-0.5, size-0.5], color='k')
    plt.plot([size-0.5, -0.5], [-0.5, -0.5], color='k')
    plt.plot([size-0.5, size-0.5], [size-0.5, -0.5], color='k')

    for i in range(size):
        for j in range(size):
            if (i, j) in agent.environment.lucky:
                x = np.array([i-0.5, i-0.5, i+0.5, i+0.5])
                y = np.array([j-0.5, j+0.5, j+0.5, j-0.5])
                plt.fill(x, y, color="lightgreen")

    rotation = {(-1, 0): 180, (0, 1): 90, (1, 0): 0, (0, -1): 270}
    for s in agent.environment.states:
        if s == agent.environment.goal:
            direction = None
        else:
            direction = rotation[agent.policy[s]]

        if direction is not None:
            bbox_props = dict(boxstyle='rarrow')
            plt.text(s[0], s[1], ' ', bbox=bbox_props, size=8,
                     ha='center', va='center', rotation=direction)

    plt.axis("off")
4 Simulation
env1 = Environment(size=4, lucky=[(1, 2), (2, 3)])
agent1 = Agent(env1)
show_maze(env1)

train(agent1, num=10000)
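Besides the value and policy plots drawn at the end of train, the learned greedy policy can also be checked by rolling it out once without exploration. This is a small sketch added for illustration; the start state (0, 0) and the step cap are assumptions, not part of the original code:

s = (0, 0)
path = [s]
steps = 0
while s != env1.goal and steps < 100:   # step cap only guards against a non-terminating policy
    a = agent1.policy[s]
    r, s = agent1.action(s, a)
    path.append(s)
    steps += 1
print(path)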
