2017-04-01 14:01:16 +04:00
|
|
|
import gym
|
|
|
|
import numpy as np
|
2017-04-04 01:51:35 +04:00
|
|
|
import gym_tic_tac_toe
|
2017-04-04 19:46:38 +04:00
|
|
|
import random
|
2017-04-01 14:01:16 +04:00
|
|
|
|
2017-04-04 01:51:35 +04:00
|
|
|
# Random-play driver: runs a fixed number of episodes and, at every step,
# plays a move chosen uniformly at random from env.move_generator().
# NOTE(review): the 'tic_tac_toe-v0' id is presumably registered as a side
# effect of importing gym_tic_tac_toe -- confirm against that package.
env = gym.make('tic_tac_toe-v0')

num_episodes = 20
num_steps_per_episode = 200  # upper bound on steps played in one episode

collected_rewards = []  # one summed reward per finished episode

for i in range(num_episodes):
    s = env.reset()
    print(s)
    print("starting new episode")
    env.render()
    print("started")

    total_reward = 0
    for _step in range(num_steps_per_episode):
        moves = env.move_generator()
        print("moves: ", moves)
        if not moves:
            # move_generator() returned nothing to play: stop this episode.
            break

        m = random.choice(moves)
        print("m: ", m)

        # The move `m` is what gets played. The earlier revision also sampled
        # env.action_space and tracked a side-to-move flag, but neither was
        # ever passed to step(), so that dead code has been dropped.
        s, reward, done, _info = env.step(m)
        env.render()
        total_reward += reward
        if done:
            break

    collected_rewards.append(total_reward)
    # Report the episode index; the previous code printed the step counter
    # under the "after episode" label.
    print("total reward ", total_reward, " after episode: ", i)

# Summary over all episodes.
# NOTE(review): assumed to sit after the episode loop (the divisor is the full
# episode count) -- the original indentation was not available to confirm.
print("average score: ", sum(collected_rewards) / num_episodes)
print("#########")
|