Commit aa6fabe4 authored by Wen Yao Jin

update

parent c6f3d772
The MIT License (MIT)
Copyright (c) 2014 Tay Yang Shun
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
\ No newline at end of file
@@ -14,6 +14,8 @@ class RandomAgent():
"""
Initialize your internal state
"""
self.W = 0
self.reset()
pass
def act(self):
@@ -22,22 +24,33 @@ class RandomAgent():
"""
return Action(np.random.randint(4)+1)
def reset(self):
self.count=0
def update(self, next_state, reward):
"""
Update your internal state
"""
self.count+=1
pass
def set_state(self, state):
pass
class qLearningAgent:
def __init__(self, mat, TD_lambda = 0.95, alpha = 0.05, gamma = 0.95, epsilon = 0.0, verbose= True):
def __init__(self, mat, TD_lambda = 0.95, alpha = 0.05, gamma = 0.95, epsilon = 0.0, verbose= True, tuple = 2):
self.state_per_tile = 12
self.alpha = alpha
self.gamma = gamma
self.epsilon = epsilon # e-greedy
# self.TD_lambda = 1-epsilon # TD(lambda)
self.TD_lambda = TD_lambda
self.tuple = self._tuple()
if tuple==0:
self.tuple = self._tuple()
elif tuple == 1:
self.tuple = self._tuple_restricted()
elif tuple == 2:
self.tuple = self._tuple_advance()
else:
self.tuple = self._tuple_advance_plus()
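# the tuple argument selects which n-tuple layout to use: 0 -> _tuple, 1 -> _tuple_restricted,
# 2 -> _tuple_advance (the default), anything else -> _tuple_advance_plus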
if verbose:
print(len(self.tuple))
self.W = self._generate_dict()
@@ -51,11 +64,6 @@ class qLearningAgent:
self.verbose = verbose
self.reset()
# [[(0,0),(1,0),(2,0),(3,0)],\
# [(0,1),(1,1),(2,1),(3,1)],\
# [(0,1),(1,1),(2,1),(0,2),(1,2),(2,2)],\
# [(0,2),(1,2),(2,2),(0,3),(1,3),(2,3)]]
def _tuple(self):
list=[]
for i in range(4):
@@ -74,6 +82,40 @@ class qLearningAgent:
list += [l]
print(list)
return list
def _tuple_restricted(self):
list=[]
for i in range(4):
l = []
for j in range(4):
l+=[(i,j)]
list+=[l]
for i in range(4):
l = []
for j in range(4):
l+=[(j,i)]
list+=[l]
for i in range(3):
for j in range(3):
if abs(i-j) == 1:
continue
l = [(i,j),(i,j+1),(i+1,j),(i+1,j+1)]
list += [l]
print(list)
return list
def _tuple_advance(self):
return [[(0,0),(1,0),(2,0),(3,0)],\
[(0,1),(1,1),(2,1),(3,1)],\
[(0,1),(1,1),(2,1),(0,2),(1,2),(2,2)],\
[(0,2),(1,2),(2,2),(0,3),(1,3),(2,3)]]
def _tuple_advance_plus(self):
return [[(0,0),(1,0),(2,0),(3,0),(3,1),(2,1)],\
[(0,1),(1,1),(2,1),(3,1),(3,2),(2,2)],\
[(0,1),(1,1),(2,1),(0,2),(1,2),(2,2)],\
[(0,2),(1,2),(2,2),(0,3),(1,3),(2,3)]]
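# For reference: each tuple above is a list of board coordinates, and the value function keeps
# one small lookup table per tuple. The entry is determined by the exponents of the tiles the
# tuple covers; the commented-out array-based agent further down packs those exponents into a
# single base-state_per_tile integer, roughly as in this standalone sketch (illustrative names,
# not a method of this class):
import numpy as np

STATE_PER_TILE = 12  # mirrors self.state_per_tile above

def tuple_index(board, cells):
    # read the tile exponents covered by `cells` as digits of one base-STATE_PER_TILE number
    idx = 0
    for r, c in cells:
        exponent = int(np.log2(board[r][c])) if board[r][c] != 0 else 0
        idx = idx * STATE_PER_TILE + exponent
    return idx

# e.g. the column tuple [(0,0),(1,0),(2,0),(3,0)] on a board whose first column holds 2,4,8,0
# maps to 1*12**3 + 2*12**2 + 3*12 + 0 = 2052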
def reset(self):
self._reset_trace() #eligibility trace
@@ -140,7 +182,7 @@ class qLearningAgent:
for d in self.trace:
l = list(d.items())
for k,v in l:
upd = v*self.forget*self.gamma
upd = v*self.forget
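# trace entries that have decayed below 0.01 are pruned so the eligibility dict stays small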
if np.all(upd < 0.01):
d.pop(k)
else:
@@ -154,11 +196,12 @@ class qLearningAgent:
tr[ind] = v
# print(self.trace[0])
# print(np.sum(self.trace,axis=1))
self.count+= 1
pass
def _target(self,next_state,reward):
#q learning target function
return reward + self.gamma * np.max(self._phi(next_state)) - self._phi()[self._action_index]
return reward + np.max(self._phi(next_state)) - self._phi()[self._action_index]
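# The value returned above is the Q-learning TD error: reward + max_a' Q(s', a') - Q(s, a),
# with the discount effectively set to 1 in this revision. A tiny numeric illustration with
# made-up action values (not this class's phi):
import numpy as np

q_current = np.array([1.0, 2.5, 0.3, 1.1])  # stands in for self._phi() of the current state
q_next = np.array([0.7, 3.0, 2.2, 0.9])     # stands in for self._phi(next_state)
action_index = 1                            # the action that was actually taken
reward = 4.0

td_error = reward + np.max(q_next) - q_current[action_index]
print(td_error)  # 4.0 + 3.0 - 2.5 = 4.5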
def update(self, next_state, reward):
# print(next_state)
@@ -174,7 +217,7 @@ class qLearningAgent:
index = np.where(n[k]!=0)# can't divide by zeros :/
# print(n[k])
w[k][index] += target*tr[k][index]/n[k][index]
w[k][index] += target*tr[k][index]/np.sqrt(n[k][index])
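# dividing by sqrt(n) rather than n lets the per-feature step size shrink more gently as the count n[k] grows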
# w[k] += target*tr[k]
# print(w[k])
if self.verbose:
@@ -199,111 +242,3 @@ class qLearningAgent:
# print(self.index)
return
# class qLearningAgent2:
# def __init__(self, mat, TD_lambda = 0.0, alpha = 0.5, gamma = 0.8, epsilon = 0.01):
# self.state_per_tile = 10
# self.alpha = alpha
# self.gamma = gamma
# self.epsilon = epsilon # e-greedy
# self.TD_lambda = TD_lambda # TD(lambda)
# self.tuple = [[(0,0),(1,0),(2,0),(3,0)],\
# [(0,1),(1,1),(2,1),(3,1)],\
# [(0,1),(1,1),(2,1),(0,2),(1,2),(2,2)],\
# [(0,2),(1,2),(2,2),(0,3),(1,3),(2,3)]]
# self.feature_size = sum([self.state_per_tile**len(k) for k in self.tuple])
# self.W = np.zeros((self.feature_size,len(Action))) #weight(Theta)
# self.set_state(mat)
# print(self.feature_size)
# self.reset()
# def reset(self):
# self.trace = np.zeros((self.feature_size,len(Action))) #eligibility trace
# self.first_step = True# used to avoid update the first time
# pass
# def _index(self, state):
# #value function
# sum = 0
# list_index = []
# for t in self.tuple:
# index = self._calculate_index(state,t)
# # assert sum+index < self.feature_size, "bad calculation of feature index"
# list_index += [sum+index]
# sum += self.state_per_tile**len(t)
# return list_index
# def _phi(self, state = None):
# #value function
# if state == None:
# return np.sum(self.W[self.index,:],axis=0)
# else:
# # print(self.W[self._index(state),:])
# return np.sum(self.W[self._index(state),:],axis=0)
# def _phi_gradient(self):
# #value function
# res = np.zeros(self.feature_size)
# res[self.index] = 1
# return res
# def _calculate_index(self, state, tuple):
# sum = 0
# for r,l in tuple:
# if state[r][l] != 0:
# sum += int(np.log2(state[r][l]))
# sum *= self.state_per_tile
# sum /= self.state_per_tile
# return int(sum)
# def _size(self, mat):
# return len(mat)*len(mat)
# def act(self):
# i = np.random.rand();
# if i > self.epsilon:
# #e-greedy
# #exploitation
# self.forget = self.TD_lambda
# action_index = np.argmax(self._phi())
# # print(self._phi())
# else:
# # exploration
# self.forget = 0.0
# action_index = np.random.randint(0, len(Action))
# self._action_index = action_index
# return Action(action_index+1)
# def _update_trace(self):
# self.trace *= self.forget*self.gamma
# self.trace[:,self._action_index] += self._phi_gradient()
# # print(np.sum(self.trace,axis=1))
# pass
# def _target(self,next_state,reward):
# #q learning target function
# return reward + self.gamma * np.max(self._phi(next_state))
# def update(self, next_state, reward):
# # print(next_state)
# if self.first_step == True:
# #don't update the first time
# self.first_step = False
# pass
# self._update_trace()
# self.W += self.alpha * (self._target(next_state,reward) \
# - self._phi()[self._action_index])\
# * self.trace
# # print(self._target(next_state,reward) \
# # - self._phi()[self._action_index])
# # #game stops, reset the agent
# # self._reset()
# pass
# def set_state(self, state):
# self.state = state
# # print(self.state)
# self.index = self._index(self.state)
# # assert len(self.phi) ==4,"wrong calculation of phi"
# # print(self.index)
# pass
@@ -26,11 +26,11 @@ class afterstateAgent:
if tuple==0:
self.tuple = self._tuple()
elif tuple == 1:
self.tuple = self._tuple_advance()
self.tuple = self._tuple_restricted()
elif tuple == 2:
self.tuple = self._tuple_advance_plus()
self.tuple = self._tuple_advance()
else:
self.tuple = self._tuple_restricted()
self.tuple = self._tuple_advance_plus()
if verbose:
print(len(self.tuple))
self.W = self._generate_dict()
@@ -197,7 +197,7 @@ class afterstateAgent:
for i,w,ind in zip(range(len(self.W)),self.W, self._index(afterstate)):
if self.mono > 0:
if i in [0,3,4,7]:
phi_array[i] += self._bonus(ind)
phi_array[i] += self._bonus_mono(ind)
# in the act phase, explore when there is 0
if 0 in phi_array:
@@ -223,7 +223,7 @@ class afterstateAgent:
t = target
# if self.mono > 0:
# if i in [0,3,4,7]:
# t += self.mono*self._bonus(k)
# t += self.mono*self._bonus_mono(k)
w[k] += t*tr[k]/np.sqrt(n[k])
if self.verbose:
print("reward: "+str(reward))
@@ -233,11 +233,13 @@ class afterstateAgent:
self.count+= 1
return
def _bonus(self,t):
def _bonus_mono(self,t):
dx = np.diff(list(t))
yes = int(np.all(dx <= 0) or np.all(dx >= 0))
return yes*(2**sum(list(t)))
# def _bonus_en
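# A quick illustration of _bonus_mono above, assuming t is the tuple of tile exponents covered
# by one n-tuple:
#   t = (1, 2, 3, 5) -> np.diff gives [1, 1, 2], all >= 0, so the bonus is 2**(1+2+3+5) = 2048
#   t = (1, 3, 2, 5) -> not monotone, so the bonus is 0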
def update(self, next_state, reward):
if self.first_step == True:
#don't update the first time
......
@@ -31,33 +31,14 @@ CELL_COLOR_DICT = { 2:"#776e65", 4:"#776e65", 8:"#f9f6f2", 16:"#f9f6f2", \
FONT = ("Verdana", 40, "bold")
class GameGrid(Frame):
def __init__(self,args=None):
def __init__(self,options):
if args["continue"] != None:
continue_filename = args["continue"]
if options.display:
self.DISPLAY = True
else:
continue_filename = None
args.pop("continue")
if args["display"] != None:
self.DISPLAY = int(args["display"])
else:
self.DISPLAY = 0
args.pop("display")
for k in list(args.keys()):
if args[k] == None:
args.pop(k)
else :
args[k] = float(args[k])
if "train" in args.keys():
self.train = args["train"]
args.pop("train")
else:
self.train = TRAIN
self.DISPLAY = False
self.train = options.train
if self.DISPLAY > 0:
Frame.__init__(self)
@@ -73,10 +54,17 @@ class GameGrid(Frame):
self.reset()
self.history = []
self.count = 0
# self.agent = RandomAgent()
self.agent = afterstateAgent(self.matrix,**args)
if continue_filename != None:
f = open(continue_filename,'rb')
if options.policy == 0:
self.agent = RandomAgent()
elif options.policy == 1:
self.agent = qLearningAgent(self.matrix, TD_lambda = options.TD_lambda,
alpha = options.alpha, epsilon = options.epsilon, verbose= options.verbose, tuple = options.tuple)
else:
self.agent = afterstateAgent(self.matrix, TD_lambda = options.TD_lambda,
alpha = options.alpha, epsilon = options.epsilon, verbose= options.verbose,
symmetric=options.symmetric, tuple = options.tuple, mono = options.mono)
if options.continues != None:
f = open(options.continues,'rb')
self.agent.W = pickle.load(f)
f.close()
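# when -c/--continue points at a previous run's weight file (result.txt by default),
# the pickled weight table is loaded back so training resumes from it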
@@ -136,40 +124,34 @@ class GameGrid(Frame):
key = self.agent.act()
self.matrix,done = self.commands[key](self.matrix)
reward = 0
if done:
self.matrix = add_two(self.matrix)
if self.DISPLAY:
self.update_grid_cells()
if done!=1:
reward += done
# print(reward)
# else:
# reward = -0.5
if game_state(self.matrix)=='win':
print("win")
# self.grid_cells[1][1].configure(text="You",bg=BACKGROUND_COLOR_CELL_EMPTY)
# self.grid_cells[1][2].configure(text="Win!",bg=BACKGROUND_COLOR_CELL_EMPTY)
if game_state(self.matrix)=='lose':
if self.agent.explore>0:
print("explore: "+ str(self.agent.explore))
# reward = -10
# reward = np.log(np.max(self.matrix))
# self.grid_cells[1][1].configure(text="You",bg=BACKGROUND_COLOR_CELL_EMPTY)
# self.grid_cells[1][2].configure(text="Lose!",bg=BACKGROUND_COLOR_CELL_EMPTY)
# if self.agent.explore>0:
# print("explore: "+ str(self.agent.explore))
print(str(self.count) + " : " + str(np.max(self.matrix)))
if options.policy == 1:
if done==0:
reward = -10
self.agent.update(self.matrix, reward)
if (game_state(self.matrix)=='win' ) or (game_state(self.matrix)=='lose'):
# print(self.agent.W)
if (self.count == self.train):
f = open("train_" +str(self.agent.alpha) +"_"+str(self.agent.TD_lambda)+"_"+str(self.agent.symmetric)+"_result_after_"+str(self.count)+".txt",'wb')
f = open("result.txt",'wb')
pickle.dump(self.agent.W ,f)
f.close()
f = open("train_" +str(self.agent.alpha) +"_"+str(self.agent.TD_lambda)+"_"+str(self.agent.symmetric)+"_history_after_"+str(self.count)+".txt",'wb')
f = open("history.txt",'wb')
np.savetxt(f, self.history)
f.close()
self.history += [np.max(self.matrix)]
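# history.txt written above holds the max tile reached in each finished game (via np.savetxt).
# A standalone snippet (separate from this method) to inspect a finished run might look like:
import numpy as np
import matplotlib.pyplot as plt

history = np.loadtxt("history.txt")   # one entry per episode
plt.plot(history)
plt.xlabel("episode")
plt.ylabel("max tile reached")
plt.show()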
@@ -194,17 +176,20 @@ class GameGrid(Frame):
if __name__ == '__main__':
parser = OptionParser()
parser.add_option("-g", "--TD", dest="TD_lambda", help ="TD_lambda the forget coefficient")
parser.add_option("-a", "--alpha", dest="alpha", help ="alpha the learning rate")
parser.add_option("-t", "--train", dest="train", help ="training episodes")
parser.add_option("-s", "--symmetric", dest="symmetric", help ="symmetric sampling")
parser.add_option("-e", "--epsilon", dest="epsilon", help ="epsilon the exploration")
parser.add_option("-u", "--tuple", dest="tuple", help ="the tuple to use")
parser.add_option("-c", "--continue", dest="continue", help ="the file to continue training")
parser.add_option("-d", "--display", dest="display", help ="display result")
parser.add_option("-m", "--mono", dest="mono", help ="bonus for monotonicity")
parser.add_option("-p", "--policy", dest="policy", type = "int", help ="0 for random, 1 for qlearning, 2 for afterstate [default: %default]" ,default="2")
parser.add_option("-g", "--TD", dest="TD_lambda", type = "float", help ="TD_lambda the forget coefficient [default: %default]" ,default="0")
parser.add_option("-a", "--alpha", dest="alpha", type = "float", help ="alpha the learning rate [default: %default]" ,default="0.0025")
parser.add_option("-t", "--train", dest="train", type = "float", help ="number of training episodes [default: %default]" ,default="2000" )
parser.add_option("-s", "--symmetric", dest="symmetric", help ="symmetric sampling [default: %default]" , action = "store_true", default=False)
parser.add_option("-e", "--epsilon", dest="epsilon", type = "float", help ="epsilon the exploration rate [default: %default]" ,default="0")
parser.add_option("-u", "--tuple", dest="tuple", type = "int", help ="the tuple to use [default: %default]", default="0")
parser.add_option("-c", "--continue", dest="continues", type = "string", help ="the file to continue training [default: %default]", default='result.txt')
parser.add_option("-d", "--display", dest="display", help ="display result", action = "store_true", default=False)
parser.add_option("-m", "--mono", dest="mono", type = "float", help ="bonus for monotonicity [default: %default]", default = "0" )
parser.add_option("-v", "--verbose", dest="verbose", help ="print training steps for the first episode", action = "store_false", default=True)
(options,args)= parser.parse_args()
print(vars(options))
print(options)
start_time = time.time()
gamegrid = GameGrid(vars(options))
gamegrid = GameGrid(options)
print("--- %s seconds ---" % (time.time() - start_time))
import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
from tkinter import *
from logic import *
from random import *
from agent import *
from agent_afterstate import *
import numpy as np
import pickle
import time
import sys
from optparse import OptionParser
import os
TRAIN = 2000
SIZE = 500
GRID_LEN = 4
GRID_PADDING = 10
BACKGROUND_COLOR_GAME = "#92877d"
BACKGROUND_COLOR_CELL_EMPTY = "#9e948a"
BACKGROUND_COLOR_DICT = { 2:"#eee4da", 4:"#ede0c8", 8:"#f2b179", 16:"#f59563", \
32:"#f67c5f", 64:"#f65e3b", 128:"#edcf72", 256:"#edcc61", \
512:"#edc850", 1024:"#edc53f", 2048:"#edc22e" }
CELL_COLOR_DICT = { 2:"#776e65", 4:"#776e65", 8:"#f9f6f2", 16:"#f9f6f2", \
32:"#f9f6f2", 64:"#f9f6f2", 128:"#f9f6f2", 256:"#f9f6f2", \
512:"#f9f6f2", 1024:"#f9f6f2", 2048:"#f9f6f2" }
FONT = ("Verdana", 40, "bold")
class GameGrid(Frame):
def __init__(self,args=None):
for k in list(args.keys()):
if args[k] == None:
args.pop(k)
else :
args[k] = float(args[k])
if "train" in args.keys():
self.train = args["train"]
args.pop("train")
else:
self.train = TRAIN
self.DISPLAY = True
if self.DISPLAY:
Frame.__init__(self)
self.commands = { Action.UP: up, Action.DOWN: down, Action.LEFT: left, Action.RIGHT: right}
self.grid_cells = []
if self.DISPLAY:
self.grid()
self.master.title('2048')
self.init_grid()
self.reset()
self.history = []
self.count = 0
# self.agent = RandomAgent()
self.agent = afterstateAgent(self.matrix,**args)
f = open("train_0.0025_0.5_0.0_result_after_2000.txt",'rb')
self.agent.W = pickle.load(f)
if self.DISPLAY:
self.key_down()
self.mainloop()
else:
while self.count<=self.train:
self.key_down()
def reset(self):
self.init_matrix()
if self.DISPLAY:
self.update_grid_cells()
def init_grid(self):
background = Frame(self, bg=BACKGROUND_COLOR_GAME, width=SIZE, height=SIZE)
background.grid()
for i in range(GRID_LEN):
grid_row = []
for j in range(GRID_LEN):
cell = Frame(background, bg=BACKGROUND_COLOR_CELL_EMPTY, width=SIZE/GRID_LEN, height=SIZE/GRID_LEN)
cell.grid(row=i, column=j, padx=GRID_PADDING, pady=GRID_PADDING)
# font = Font(size=FONT_SIZE, family=FONT_FAMILY, weight=FONT_WEIGHT)
t = Label(master=cell, text="", bg=BACKGROUND_COLOR_CELL_EMPTY, justify=CENTER, font=FONT, width=4, height=2)
t.grid()
grid_row.append(t)
self.grid_cells.append(grid_row)
def gen(self):
return randint(0, GRID_LEN - 1)
def init_matrix(self):
self.matrix = new_game(4)
self.matrix=add_two(self.matrix)
self.matrix=add_two(self.matrix)
def update_grid_cells(self):
for i in range(GRID_LEN):
for j in range(GRID_LEN):
new_number = self.matrix[i][j]
if new_number == 0:
self.grid_cells[i][j].configure(text="", bg=BACKGROUND_COLOR_CELL_EMPTY)
else:
self.grid_cells[i][j].configure(text=str(new_number), bg=BACKGROUND_COLOR_DICT[new_number], fg=CELL_COLOR_DICT[new_number])
self.update_idletasks()
def key_down(self):
if self.count>=1:
self.agent.verbose = False
if self.agent.count >10000:
self.agent.verbose = True
self.agent.set_state(self.matrix)
key = self.agent.act()
self.matrix,done = self.commands[key](self.matrix)
reward = 0
if done:
self.matrix = add_two(self.matrix)
if self.DISPLAY:
self.update_grid_cells()
if done!=1:
reward += done
# print(reward)
# else:
# reward = -0.5
if game_state(self.matrix)=='win':
print("win")
# self.grid_cells[1][1].configure(text="You",bg=BACKGROUND_COLOR_CELL_EMPTY)
# self.grid_cells[1][2].configure(text="Win!",bg=BACKGROUND_COLOR_CELL_EMPTY)
if game_state(self.matrix)=='lose':
if self.agent.explore>0:
print("explore: "+ str(self.agent.explore))
# reward = -10
# reward = np.log(np.max(self.matrix))
# self.grid_cells[1][1].configure(text="You",bg=BACKGROUND_COLOR_CELL_EMPTY)
# self.grid_cells[1][2].configure(text="Lose!",bg=BACKGROUND_COLOR_CELL_EMPTY)
print(str(self.count) + " : " + str(np.max(self.matrix)))
# self.agent.update(self.matrix, reward)