Commit 9ff0c414 authored by Wen Yao Jin

go

parent e4788e3d
@@ -11,7 +11,7 @@ class Action(IntEnum):
RIGHT = 4
class afterstateAgent:
def __init__(self, mat, TD_lambda = 0.0, alpha = 0.0025, gamma = 0.95, epsilon = 0.0, verbose= True):
def __init__(self, mat, TD_lambda = 0.0, alpha = 0.0025, gamma = 0.95, epsilon = 0.01, verbose= True, symmetric=True):
self.state_per_tile = 12
self.commands = { Action.UP: up, Action.DOWN: down, Action.LEFT: left, Action.RIGHT: right}
self.alpha = alpha
@@ -19,7 +19,13 @@ class afterstateAgent:
self.epsilon = epsilon # e-greedy
# self.TD_lambda = 1-epsilon # TD(lambda)
self.TD_lambda = TD_lambda
self.tuple = self._tuple()
self.forget = self.TD_lambda
self.symmetric = symmetric
if self.symmetric:
# self.tuple = self._tuple_advance()
self.tuple = self._tuple_advance()
else:
self.tuple = self._tuple()
if verbose:
print(len(self.tuple))
self.W = self._generate_dict()
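For orientation, a minimal usage sketch of the updated constructor (not part of the commit); `new_game(4)` is assumed to be the board initializer from logic.py:

# Hypothetical usage; symmetric=True selects the six-cell tuple layout,
# epsilon=0.01 enables the occasional exploration introduced in act().
mat = new_game(4)  # assumed helper from logic.py
agent = afterstateAgent(mat, TD_lambda=0.0, alpha=0.0025,
                        gamma=0.95, epsilon=0.01, symmetric=True)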
@@ -33,11 +39,6 @@ class afterstateAgent:
self.verbose = verbose
self.reset()
# [[(0,0),(1,0),(2,0),(3,0)],\
# [(0,1),(1,1),(2,1),(3,1)],\
# [(0,1),(1,1),(2,1),(0,2),(1,2),(2,2)],\
# [(0,2),(1,2),(2,2),(0,3),(1,3),(2,3)]]
def _tuple(self):
list=[]
for i in range(4):
@@ -56,11 +57,18 @@ class afterstateAgent:
list += [l]
print(list)
return list
def _tuple_advance(self):
return [[(0,0),(1,0),(2,0),(3,0),(3,1),(2,1)],\
[(0,1),(1,1),(2,1),(3,1),(3,2),(2,2)],\
[(0,1),(1,1),(2,1),(0,2),(1,2),(2,2)],\
[(0,2),(1,2),(2,2),(0,3),(1,3),(2,3)]]
def reset(self):
self._reset_trace() #eligibility trace
self.count = 0
self.first_step = True  # used to skip the TD update on the very first step
self.explore = 0
return
def _reset_trace(self):
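As an aside, a small sketch (not in the commit) of how one n-tuple pattern such as those returned by _tuple_advance is typically folded into a weight-table index; the log2 tile encoding and state_per_tile = 12 are assumptions based on the constructor above:

import numpy as np

# Hypothetical helper: map the cells covered by one pattern to an index
# in a lookup table of size state_per_tile ** len(pattern).
def tuple_index(board, pattern, state_per_tile=12):
    index = 0
    for (x, y) in pattern:
        # assume empty cells are 0 and a tile of value v is encoded as log2(v)
        cell = 0 if board[x][y] == 0 else int(np.log2(board[x][y]))
        index = index * state_per_tile + cell
    return index

# Example: the first advanced pattern covers the left column plus two
# neighbouring cells, a 6-tuple with 12**6 entries per table.
pattern = [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (2, 1)]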
@@ -103,19 +111,8 @@ class afterstateAgent:
# return len(mat)*len(mat)
def act(self):
i = np.random.rand();
if i > self.epsilon:
#e-greedy
#exploitation
self.forget = self.TD_lambda
# print([self._afterstate(a) for a in range(len(Action))])
action_index = np.argmax(np.array([self._afterstate(a) for a in range(len(Action))]))
# print(action_index)
# print(self._phi())
else:
# exploration
self.forget = 0.0
action_index = np.random.randint(0, len(Action))
self.forget = self.TD_lambda
action_index = np.argmax(np.array([self._afterstate(a,act=True) for a in range(len(Action))]))
self._action_index = action_index
return Action(action_index+1)
@@ -146,7 +143,7 @@ class afterstateAgent:
afterstate,r = self.test_next(action_index,next_state)
return r + self._phi(afterstate) - self._phi()
def _afterstate(self, action_index, state=None):
def _afterstate(self, action_index, state=None, act=False):
if state==None:
state = self.state
afterstate,done = self.test_next(action_index,state)
@@ -154,18 +151,22 @@ class afterstateAgent:
return -1
if done== 1:
done = 0 #1 means no reward
return self._phi(afterstate) + done
phi_array = np.array([w[i] for w,i in zip(self.W, self._index(state))])
if act:
# during action selection, explore when an unseen tuple (weight 0) is present
if 0 in phi_array:
if self.verbose:
print("explore")
i = np.random.rand();
if i < self.epsilon: #explore
self.explore += 1
return sum(phi_array) + 10000
return sum(phi_array) + done
def test_next(self,action_index,state):
return self.commands[Action(action_index+1)](state)
def update(self, next_state, reward):
# print(next_state)
if self.first_step == True:
#don't update the first time
self.first_step = False
return
afterstate,r = self.test_next(self._action_index,self.state)
def one_side_update(self, next_state, reward, afterstate):
self.set_state(afterstate)
self._update_trace()
target = self.alpha * self._target(next_state) # the afterstate update needs no immediate reward
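Restated outside the diff: the new act=True branch is an optimistic exploration rule. An afterstate whose tuple weights contain a 0 has never been visited, and with probability epsilon it receives a large bonus so the greedy argmax in act() picks it. A condensed sketch (the constant 10000 follows the diff; the helper itself is hypothetical):

import numpy as np

def exploration_bonus(phi_array, epsilon):
    # phi_array holds one weight per tuple for the candidate afterstate;
    # a 0 entry means that tuple configuration has never been updated.
    if 0 in phi_array and np.random.rand() < epsilon:
        return 10000  # large enough to win the argmax over actions
    return 0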
@@ -181,6 +182,29 @@ class afterstateAgent:
self.count+= 1
return
def update(self, next_state, reward):
if self.first_step == True:
#don't update the first time
self.first_step = False
return
s,r = self.test_next(self._action_index,self.state)
n = next_state
if self.symmetric is True:
for i in range(4):
s = transpose(s)
self.set_state(s)
n = transpose(n)
self.one_side_update(n,reward,s)
s = reverse(s)
self.set_state(s)
n = reverse(n)
self.one_side_update(n,reward,s)
# each pass of the loop covers one rotation (a transpose followed by a reverse)
else:
self.one_side_update(next_state,reward,s)
assert s==self.state, str(s)+str(self.state)
return
def set_state(self, state):
self.state = state
self.index = self._index(self.state)
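For reference, the symmetric-sampling loop above performs one TD update per symmetric view of the board, alternating transpose and reverse from logic.py. A standalone sketch of the same enumeration (assuming transpose returns the matrix transpose and reverse flips each row, as in logic.py):

from logic import transpose, reverse  # the same helpers used in the diff

def symmetric_views(board):
    # Alternating a transpose and a row-reversal four times enumerates all
    # eight rotations/reflections of the 4x4 board, ending back at the
    # original orientation, which is what the assert in update() checks.
    views = []
    b = board
    for _ in range(4):
        b = transpose(b)
        views.append(b)
        b = reverse(b)
        views.append(b)
    return views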
......
# import matplotlib
# matplotlib.use("TkAgg")
# import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
from tkinter import *
from logic import *
@@ -42,7 +42,7 @@ class GameGrid(Frame):
self.train = args["train"]
args.pop("train")
else:
self.train = 10000
self.train = TRAIN
self.DISPLAY = False
if self.DISPLAY:
Frame.__init__(self)
@@ -131,6 +131,8 @@ class GameGrid(Frame):
# self.grid_cells[1][1].configure(text="You",bg=BACKGROUND_COLOR_CELL_EMPTY)
# self.grid_cells[1][2].configure(text="Win!",bg=BACKGROUND_COLOR_CELL_EMPTY)
if game_state(self.matrix)=='lose':
if self.agent.explore>0:
print("explore: "+ str(self.agent.explore))
# reward = -10
# reward = np.log(np.max(self.matrix))
# self.grid_cells[1][1].configure(text="You",bg=BACKGROUND_COLOR_CELL_EMPTY)
@@ -142,10 +144,10 @@ class GameGrid(Frame):
if (game_state(self.matrix)=='win' ) or (game_state(self.matrix)=='lose'):
# print(self.agent.W)
if (self.count == self.train):
f = open("train_" +str(self.agent.alpha) +"_"+str(self.agent.TD_lambda)+ " _result_after_"+str(self.count)+".txt",'wb')
f = open("train_" +str(self.agent.alpha) +"_"+str(self.agent.TD_lambda)+"_"+str(self.agent.symmetric)+"_result_after_"+str(self.count)+".txt",'wb')
pickle.dump(self.agent.W ,f)
f.close()
f = open("train_" +str(self.agent.alpha) +"_"+str(self.agent.TD_lambda)+ "_history_after_"+str(self.count)+".txt",'wb')
f = open("train_" +str(self.agent.alpha) +"_"+str(self.agent.TD_lambda)+"_"+str(self.agent.symmetric)+"_history_after_"+str(self.count)+".txt",'wb')
np.savetxt(f, self.history)
f.close()
self.history += [np.max(self.matrix)]
@@ -160,7 +162,7 @@ class GameGrid(Frame):
if (self.DISPLAY):
# Tell Tkinter to wait DELTA_TIME seconds before next iteration
self.after(100, self.key_down)
self.after(20, self.key_down)
def generate_next(self):
index = (self.gen(), self.gen())
@@ -173,6 +175,8 @@ if __name__ == '__main__':
parser.add_option("-g", "--TD", dest="TD_lambda", help ="TD_lambda the forget coefficient")
parser.add_option("-a", "--alpha", dest="alpha", help ="alpha the learning rate")
parser.add_option("-t", "--train", dest="train", help ="training episodes")
parser.add_option("-s", "--symmetric", dest="symmetric", help ="symmetric sampling")
parser.add_option("-e", "--epsilon", dest="epsilon", help ="epsilon the exploration")
(options,args)= parser.parse_args()
print(vars(options))
start_time = time.time()
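One caveat for the new flags: optparse returns every option value as a string, so a conversion step along these lines (hypothetical, not shown in the diff) is needed before the values reach the agent constructor:

# Hypothetical glue; attribute names match the dest= values in the parser above.
kwargs = {}
if options.alpha is not None:
    kwargs["alpha"] = float(options.alpha)
if options.TD_lambda is not None:
    kwargs["TD_lambda"] = float(options.TD_lambda)
if options.epsilon is not None:
    kwargs["epsilon"] = float(options.epsilon)
if options.symmetric is not None:
    # "-s False" would otherwise be truthy, since it is a non-empty string
    kwargs["symmetric"] = options.symmetric == "True"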
......
@@ -2,6 +2,8 @@ from tkinter import *
from logic import *
from random import *
from agent import *
from agent_afterstate import *
import numpy as np
import pickle
import time
@@ -38,8 +40,8 @@ class GameGrid(Frame):
self.reset()
self.history = []
self.count = 0
self.agent = qLearningAgent(self.matrix)
f = open("train_result_after_"+str(TRAIN)+".txt",'rb')
self.agent = afterstateAgent(self.matrix)
f = open("train_0.0025_0.0_result_after_2000.txt",'rb')
self.agent.W = pickle.load(f)
f.close()
print(self.agent.W[0])
@@ -101,11 +103,10 @@ class GameGrid(Frame):
self.update_grid_cells()
if done!=1:
reward = done
else:
reward = -10
# else:
# reward = -10
if game_state(self.matrix)=='win':
reward = 1024
print("win")
if game_state(self.matrix)=='lose':
print(np.max(self.matrix))
......