# 2017-04-04 01:51:35 +04:00
import gym
from gym import spaces
import numpy as np
class TicTacToeEnv(gym.Env):
    """Two-player tic-tac-toe environment using the legacy gym ``_step`` API.

    State is a dict with two keys:

    * ``'board'``   -- list of 9 ints in row-major order; ``0`` is an empty
      square, any other value is the player that occupies it.
    * ``'on_move'`` -- the player whose turn it is (``1`` after a reset, then
      negated after every legal move).

    An action is a ``(player, square)`` pair, as produced by
    ``move_generator``.
    """

    metadata = {'render.modes': ['human']}

    # Index triples that form a winning line on the 3x3 board.
    _WIN_LINES = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),  # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),  # columns
        (0, 4, 8), (2, 4, 6),             # diagonals
    )

    def __init__(self):
        self.action_space = spaces.Discrete(9)
        self.observation_space = spaces.Discrete(9 * 3)  # flattened
        # Start from a valid position so that rendering or move generation
        # before the first explicit reset does not fail on a missing state.
        self._reset()

    def _step(self, action):
        """Apply ``action`` and report the outcome.

        Args:
            action: ``(player, square)`` pair; ``square`` indexes the board
                list (0-8).

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is
            ``np.array(self.state)``, ``reward`` is the moving player's value
            on a win, ``0`` otherwise, or ``-2 * on_move`` for an illegal
            move, and ``info`` is an empty dict.
        """
        p, square = action
        board = self.state['board']
        om = self.state['on_move']

        # Check move legality. Both conditions are reported when both apply.
        illegal = False
        if board[square] != 0:
            print("illegal move ", action, ". (square occupied): ", square)
            illegal = True
        if p != om:
            print("illegal move ", action, " not on move: ", p)
            illegal = True
        if illegal:
            # An illegal move ends the episode and must leave the position
            # untouched, so return before the board is modified.
            # NOTE(review): the penalty is always derived from the side on
            # move, even when the offender is a different player -- confirm
            # that this sign is intended.
            return np.array(self.state), -2 * om, True, {}

        board[square] = p
        self.state['on_move'] = -p

        # Check game over: a completed line wins, a full board is a draw.
        reward = 0
        done = False
        if any(all(board[i] == p for i in line) for line in self._WIN_LINES):
            reward = p
            done = True
        elif 0 not in board:
            done = True
        return np.array(self.state), reward, done, {}

    def _reset(self):
        """Clear the board, put player ``1`` on move and return the state."""
        self.state = {
            'board': [0] * 9,
            'on_move': 1,
        }
        return self.state

    def _render(self, mode='human', close=False):
        """Print the side to move and the nine board squares to stdout.

        Nothing is printed when ``close`` is true: that call only asks the
        environment to shut its renderer down.
        """
        if close:
            return
        print("on move: " , self.state['on_move'])
        for cell in self.state['board']:
            print(cell, end=" ")
        print()  # terminate the board line

    @staticmethod
    def hash_ttt(state):
        """Pack ``state`` into a single non-negative integer.

        Bit 0 is set when ``-1`` is on move, bits 1-9 mark occupied squares
        and bits 10-18 mark squares whose occupant is negative.

        This is only an upper bound on the state count; symmetries would
        reduce the board to 765 distinct positions, and the side to move is
        implicit in the number of occupied squares.
        """
        retval = 1 if state['on_move'] == -1 else 0
        for i, cell in enumerate(state['board']):
            if cell != 0:
                retval |= 1 << (i + 1)
            if cell < 0:
                retval |= 1 << (i + 10)
        return retval

    def move_generator(self):
        """Return every legal move as a ``[player, square]`` pair."""
        p = self.state['on_move']
        return [[p, i]
                for i, cell in enumerate(self.state['board'])
                if cell == 0]
# 2017-04-04 01:51:35 +04:00