Provide different policies for players 1 and -1
This commit is contained in:
parent
acd515bc9a
commit
d5347cafc7
@ -3,10 +3,21 @@ import numpy as np
|
|||||||
import gym_tic_tac_toe
|
import gym_tic_tac_toe
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
def random_plus_middle_move(moves, p, middle=4):
    """Take the preferred (middle) square if it is available, else move randomly.

    Args:
        moves: Sequence of candidate moves. Membership is tested against the
            two-element list ``[p, middle]``, so the entries are presumably
            ``[player, square]`` lists -- TODO confirm against the
            environment's ``move_generator``.
        p: Marker of the player on move (the calling script uses 1 and -1).
        middle: Index of the preferred square. Defaults to 4, which is
            presumably the centre cell of a 3x3 board numbered 0-8; pass a
            different index to prefer another square.

    Returns:
        ``[p, middle]`` when that move is contained in ``moves``; otherwise
        the result of ``random_move(moves, p)``.
    """
    preferred = [p, middle]
    if preferred in moves:
        return preferred
    # The preferred square is not on offer: fall back to the plain random policy.
    return random_move(moves, p)
|
||||||
|
def random_move(moves, p):
    """Return one randomly chosen element of ``moves``.

    ``p`` is accepted but not used, so this policy has the same
    ``(moves, p)`` signature as ``random_plus_middle_move``.
    ``random.choice`` raises ``IndexError`` if ``moves`` is empty.
    """
    return random.choice(moves)
|
||||||
|
|
||||||
env = gym.make('tic_tac_toe-v0')
|
env = gym.make('tic_tac_toe-v0')
|
||||||
|
|
||||||
num_episodes = 20
|
num_episodes = 2000
|
||||||
num_steps_per_episode = 200
|
num_steps_per_episode = 10
|
||||||
|
|
||||||
collected_rewards = []
|
collected_rewards = []
|
||||||
for i in range(num_episodes):
|
for i in range(num_episodes):
|
||||||
@ -22,22 +33,30 @@ for i in range(num_episodes):
|
|||||||
moves = env.move_generator()
|
moves = env.move_generator()
|
||||||
print ("moves: ", moves)
|
print ("moves: ", moves)
|
||||||
if (not moves):
|
if (not moves):
|
||||||
|
print ("out of moves")
|
||||||
break
|
break
|
||||||
m = random.choice(moves)
|
if (len(moves)==1):
|
||||||
|
m = moves[0]
|
||||||
|
else:
|
||||||
|
if (om == 1):
|
||||||
|
m = random_plus_middle_move(moves, om)
|
||||||
|
else:
|
||||||
|
m = random_move(moves, om)
|
||||||
print ("m: ", m)
|
print ("m: ", m)
|
||||||
a = env.action_space.sample()
|
# a = env.action_space.sample()
|
||||||
print (a[0])
|
# print (a[0])
|
||||||
#sm = s['on_move']
|
# #sm = s['on_move']
|
||||||
#print (sm)
|
# #print (sm)
|
||||||
a = tuple((om, a[1]))
|
# a = tuple((om, a[1]))
|
||||||
s1, reward, done, _ = env.step(m)
|
s1, reward, done, _ = env.step(m)
|
||||||
om = -om
|
om = -om
|
||||||
env.render()
|
env.render()
|
||||||
total_reward += reward
|
total_reward += reward
|
||||||
s = s1
|
s = s1
|
||||||
if done:
|
if done:
|
||||||
|
print ("game over: ", reward)
|
||||||
break
|
break
|
||||||
collected_rewards.append(total_reward)
|
collected_rewards.append(total_reward)
|
||||||
print ("total reward ", total_reward, " after episode: ", j)
|
print ("total reward ", total_reward, " after episode: ", i, ". steps: ", j+1)
|
||||||
print ("average score: ", sum(collected_rewards) / num_episodes)
|
print ("average score: ", sum(collected_rewards) / num_episodes)
|
||||||
print("#########")
|
print("#########")
|
||||||
|
Loading…
Reference in New Issue
Block a user