diff --git a/gym_tic_tac_toe/envs/tic_tac_toe_env.py b/gym_tic_tac_toe/envs/tic_tac_toe_env.py
index f4be54d..71007b3 100644
--- a/gym_tic_tac_toe/envs/tic_tac_toe_env.py
+++ b/gym_tic_tac_toe/envs/tic_tac_toe_env.py
@@ -7,7 +7,7 @@ class TicTacToeEnv(gym.Env):
     def __init__(self):
         self.action_space = spaces.Discrete(9)
-        self.observation_space = spaces.Discrete(9 * 3) # flattened
+        self.observation_space = spaces.Discrete(512*512*2) # flattened
 
     def _step(self, action):
         done = False
         reward = 0
@@ -22,11 +22,11 @@ class TicTacToeEnv(gym.Env):
         if (proposed != 0): # wrong player, not empty
             print("illegal move ", action, ". (square occupied): ", square)
             done = True
-            reward = -2 * om # player who did NOT make the illegal move
+            reward = -1 * om # player who did NOT make the illegal move
         if (p != om): # wrong player, not empty
             print("illegal move ", action, " not on move: ", p)
             done = True
-            reward = -2 * om # player who did NOT make the illegal move
+            reward = -1 * om # player who did NOT make the illegal move
         else:
             board[square] = p
             self.state['on_move'] = -p
@@ -40,7 +40,7 @@ class TicTacToeEnv(gym.Env):
                 done = True
                 break
-        return np.array(self.state), reward, done, {}
+        return self.state, reward, done, {}
 
     def _reset(self):
         self.state = {}
         self.state['board'] = [0, 0, 0, 0, 0, 0, 0, 0, 0]
@@ -53,27 +53,6 @@ class TicTacToeEnv(gym.Env):
         for i in range (9):
             print (self.state['board'][i], end=" ")
         print()
-    def hash_ttt(state):
-        #of course this is just for the upper bound;
-        #we should really take advantage of the redundancies
-        # to reduce the number of states to 765 for the board
-        # and who is on move really is implicit in how many
-        # squares are occupied
-        retval = 0
-        low9 = 0
-        high9 = 0
-        lowmult = 2
-        highmult = 1024
-        board = state['board']
-        if (state['on_move'] == -1):
-            retval = 1
-        for i in range(9):
-            if (board[i] != 0):
-                retval += lowmult #todo bitwise logic in python how? 
-            if (board[i] < 0):
-                retval += highmult
-            lowmult *=2
-            highmult *= 2
     def move_generator(self):
         moves = []
         for i in range (9):