Provide different policy for player 1 and -1

This commit is contained in:
Nicolai Czempin 2017-04-04 18:16:57 +02:00
parent acd515bc9a
commit d5347cafc7

View File

@ -3,10 +3,21 @@ import numpy as np
import gym_tic_tac_toe import gym_tic_tac_toe
import random import random
def random_plus_middle_move(moves, p):
    """Prefer the move on square 4 for player ``p``; otherwise pick randomly.

    Args:
        moves: Candidate moves. The membership test below expects entries
            that compare equal to a ``[player, square]`` list.
        p: Player identifier used to build the preferred ``[p, 4]`` move.

    Returns:
        A new ``[p, 4]`` list when an equal move is present in ``moves``;
        otherwise whatever ``random_move(moves, p)`` returns.
    """
    # NOTE(review): 4 is presumably the centre square of a 3x3 board
    # indexed 0-8 -- confirm against the environment's move encoding.
    preferred = [p, 4]
    if preferred in moves:
        return preferred
    return random_move(moves, p)
def random_move(moves, p):
    """Return one entry of ``moves`` selected with ``random.choice``.

    Args:
        moves: Non-empty sequence of candidate moves.
        p: Not used by this function; it is accepted so the call shape is
            the same as ``random_plus_middle_move(moves, p)``.

    Returns:
        A single element taken from ``moves``.

    Raises:
        IndexError: If ``moves`` is empty (raised by ``random.choice``).
    """
    return random.choice(moves)
env = gym.make('tic_tac_toe-v0') env = gym.make('tic_tac_toe-v0')
num_episodes = 20 num_episodes = 2000
num_steps_per_episode = 200 num_steps_per_episode = 10
collected_rewards = [] collected_rewards = []
for i in range(num_episodes): for i in range(num_episodes):
@ -22,22 +33,30 @@ for i in range(num_episodes):
moves = env.move_generator() moves = env.move_generator()
print ("moves: ", moves) print ("moves: ", moves)
if (not moves): if (not moves):
print ("out of moves")
break break
m = random.choice(moves) if (len(moves)==1):
m = moves[0]
else:
if (om == 1):
m = random_plus_middle_move(moves, om)
else:
m = random_move(moves, om)
print ("m: ", m) print ("m: ", m)
a = env.action_space.sample() # a = env.action_space.sample()
print (a[0]) # print (a[0])
#sm = s['on_move'] # #sm = s['on_move']
#print (sm) # #print (sm)
a = tuple((om, a[1])) # a = tuple((om, a[1]))
s1, reward, done, _ = env.step(m) s1, reward, done, _ = env.step(m)
om = -om om = -om
env.render() env.render()
total_reward += reward total_reward += reward
s = s1 s = s1
if done: if done:
print ("game over: ", reward)
break break
collected_rewards.append(total_reward) collected_rewards.append(total_reward)
print ("total reward ", total_reward, " after episode: ", j) print ("total reward ", total_reward, " after episode: ", i, ". steps: ", j+1)
print ("average score: ", sum(collected_rewards) / num_episodes) print ("average score: ", sum(collected_rewards) / num_episodes)
print("#########") print("#########")