# Pratham.py
AGENT_NAME = "Pratham"
TEAM_NAME = "Epoch"
# These are algorithm parameters. You need to change these.
DELTA = -0.0005  # how much epsilon shifts after every game
LR = 0.05  # learning rate; a high learning rate means faster learning
GAMMA = 0.9  # discount factor; a high gamma means more focus on future rewards
STARTING_EPSILON = 1
# These are the reward parameters. You need to change these. Refer to the guide to know what each one means in detail.
DESTROY_SOFTBLOCK = 0.5
DO_DAMAGE = 10
TAKE_DAMAGE = -1
EARN_AMMO = 0.5
COLLIDE_WITH_WALLS = -0.25
DAMAGE_ITSELF = -1
NO_DESTRUCTION = -0.1
NO_KILL = -0.5
PLACING_BOMBS_WITH_NO_AMMO = -1
BOMB_IN_RANGE = -0.5
PLACING_BOMB_ON_TOP_OF_BOMB = -0.5
EXIST_PENALTY = -1
RL_ALGORITHM_TYPE = "Q_LEARNING"  # one of "SARSA", "Q_LEARNING", "DEEPQ"
RADIUS = 2
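
# A quick sanity check on the schedule above (illustrative arithmetic only):
# starting from STARTING_EPSILON = 1 and shifting by DELTA = -0.0005 per game,
# epsilon reaches 0 after 1 / 0.0005 = 2000 games, after which the agent
# always exploits (and epsilon keeps drifting negative unless clamped).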
from coachPangolin import Agent as coachPango
from coachLizard import agent as coachLizzie
COACH = coachPango()  # choose between coachPango, coachLizzie, coachSiss
GUIDANCE = 0.7  # 1 means the agent copies the coach exactly; 0 means the agent makes fully random moves. Should be between 0 and 1.
Q_NAME = AGENT_NAME + "_" + "QTable" + "_" + TEAM_NAME
EPISODE_NAME = AGENT_NAME + "_" + "G_Number" + "_" + TEAM_NAME
EPSILON_NAME = AGENT_NAME + "_" + "epsilon" + "_" + TEAM_NAME
# Modules required. These provide extra functions so that we can do more operations.
import random  # to generate random numbers
import numpy as np  # to do fast and efficient computation on data

# Import coaches
# import coachSnake
# import coachHornbill
from load_save import *  # to load and save Python data in files
from state import *
from reward import *
class Agent:
    """
    This class defines the agent that plays your game.
    """

    def __init__(self):
        '''
        Place any initialization code for your agent here (if any)
        '''
        self.cumulative_reward = 0  # stores the total reward accumulated over the current game
        # These attributes store the agent's previous state and action. These values are important for the QTable.
        self.old_state = None
        self.old_action = None
        self.scores = []
        # The following attributes contain data that the agent should remember between moves
        self.episode_number = setup_episode(EPISODE_NAME=EPISODE_NAME)  # match number
        self.Q_Table = setup_QTable(Q_NAME=Q_NAME)  # Q Table
        self.epsilon = setup_epsilon(EPSILON_NAME=EPSILON_NAME, STARTING_EPSILON=STARTING_EPSILON)  # epsilon

    def next_move(self, game_state, player_state):
        '''
        This is the main method. The game calls it at every tick to ask what the agent's next move is.
        '''
        if game_state.tick_number == 0:
            self.old_state = get_training_state_for_Q(game_state, player_state, radius=RADIUS)
            ac = self.get_action(self.old_state, game_state, player_state)
            self.old_action = ac.index(1)
            return ['', 'u', 'd', 'l', 'r', 'p'][self.old_action]

        # ---------------------------------------------------------------- LEARNING -----------------------------------------------------------
        # reward collected from the previous action
        reward = get_reward_for_agent(game_state._occurred_event, DESTROY_SOFTBLOCK,
                                      DO_DAMAGE,
                                      TAKE_DAMAGE,
                                      EARN_AMMO,
                                      COLLIDE_WITH_WALLS,
                                      DAMAGE_ITSELF,
                                      NO_DESTRUCTION,
                                      NO_KILL,
                                      PLACING_BOMBS_WITH_NO_AMMO,
                                      PLACING_BOMB_ON_TOP_OF_BOMB,
                                      BOMB_IN_RANGE,
                                      EXIST_PENALTY,
                                      player_state,
                                      game_state)
        # new state the agent transitioned into after taking the previous action
        new_state = get_training_state_for_Q(game_state, player_state, radius=RADIUS)
        new_action = self.get_action(new_state, game_state, player_state).index(1)
        self.cumulative_reward += reward  # update the running sum used for the cumulative reward
        # learn from the observed transition
        self.learn(state=self.old_state, state2=new_state, reward=reward, action=self.old_action, action2=new_action)
        # -------------------------------------------------------------------------------------------------------------------------------------

        # Check whether the game is over
        done = game_state.is_over or (player_state.hp == 0) or (game_state.tick_number == 1800)
        if done:
            print("----------------\t\t" + RL_ALGORITHM_TYPE + "\t\t------------------------------")
            if player_state.hp == 0:
                print("LOST")
            else:
                print("WIN")
            self.increment_episode()  # increase the game number
            self.shift_epsilon(DELTA)  # change the epsilon value by DELTA
            # store the agent's progress
            self.scores.append(player_state.reward)
            store_progress(self.Q_Table, self.episode_number, self.epsilon, Q_NAME=Q_NAME, EPISODE_NAME=EPISODE_NAME, EPSILON_NAME=EPSILON_NAME)
            self.display_agent_result()
            self.cumulative_reward = 0

        # Store new_action as old_action and new_state as old_state
        self.old_action = new_action
        self.old_state = new_state
        # Return the new action
        return ['', 'u', 'd', 'l', 'r', 'p'][new_action]
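
    # Per-tick bookkeeping in next_move (as implemented above): the reward
    # computed at tick t pays off old_action, which was chosen at tick t-1;
    # new_state is the state observed at tick t; and learn() consumes the full
    # (old_state, old_action, reward, new_state, new_action) transition.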

    def learn(self, state, state2, reward, action, action2):
        """
        This method defines how the QTable is updated.
        """
        ls_check = ["SARSA", "Q_LEARNING", "DEEPQ"]
        if RL_ALGORITHM_TYPE not in ls_check:
            print("Error in the RL_ALGORITHM_TYPE entered")
            return
        if RL_ALGORITHM_TYPE == "SARSA":
            # action is the old action, action2 is the new action.
            # If the agent is revisiting an already explored state, the entry is already in the QTable;
            # otherwise, we need to create a new entry in the QTable.
            if state not in self.Q_Table: self.Q_Table[state] = np.zeros(6)
            if state2 not in self.Q_Table: self.Q_Table[state2] = np.zeros(6)
            # SARSA update: Q(s, a) <- Q(s, a) + LR * (reward + GAMMA * Q(s', a') - Q(s, a))
            predict = self.Q_Table[state][action].item()  # current value for this state-action pair
            target = reward + GAMMA * self.Q_Table[state2][action2].item()  # bootstrap with the action actually taken next
            self.Q_Table[state][action] = predict + LR * (target - predict)
        elif RL_ALGORITHM_TYPE == "Q_LEARNING":
            # If the state is already in the Q table, that particular state has been explored before;
            # otherwise, a new entry is added to the table.
            if state not in self.Q_Table:
                self.Q_Table[state] = np.random.uniform(0, 1, 6)
            if state2 not in self.Q_Table:
                self.Q_Table[state2] = np.random.uniform(0, 1, 6)
            # Q-learning update: Q(s, a) <- Q(s, a) + LR * (reward + GAMMA * max_a' Q(s', a') - Q(s, a))
            predict = self.Q_Table[state][action].item()  # current value for this state-action pair
            target = reward + GAMMA * np.max(self.Q_Table[state2]).item()  # bootstrap with the value of the best next action
            self.Q_Table[state][action] = predict + LR * (target - predict)
        elif RL_ALGORITHM_TYPE == "DEEPQ":
            pass
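
    # Worked example of the Q-learning update above (made-up numbers, purely
    # illustrative): with LR = 0.05, GAMMA = 0.9, Q[s][a] = 0.2, reward = 10
    # and max(Q[s2]) = 0.5, the target is 10 + 0.9 * 0.5 = 10.45, so the new
    # value is 0.2 + 0.05 * (10.45 - 0.2) = 0.7125.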

    def increment_episode(self):
        '''
        Called when the match is over to increase the game number
        '''
        self.episode_number += 1

    def shift_epsilon(self, delta):
        '''
        Change epsilon as per the participant's function
        '''
        self.epsilon += delta

    def get_action(self, state, game_state, player_state):
        '''
        This method implements the epsilon-greedy strategy for choosing the agent's action:
        a random number is generated between 0 and 1;
        if it is smaller than epsilon, a random (exploratory) move is made,
        otherwise the action is taken from the Q table.
        '''
        final_move = [0, 0, 0, 0, 0, 0]
        if self.choose_action():
            move = self.explore(state, game_state, player_state)
        else:
            move = self.exploit(state)
        final_move[move] = 1
        return final_move
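
    # For example, final_move = [0, 0, 0, 1, 0, 0] encodes action index 3,
    # which next_move() maps to 'l' via the list ['', 'u', 'd', 'l', 'r', 'p'].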

    def choose_action(self):
        '''
        This method decides whether the agent should explore or exploit.
        True = exploration
        False = exploitation
        '''
        return np.random.uniform(0, 1) < self.epsilon
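
    # For instance, with epsilon = 0.3 the agent explores on roughly 30% of
    # ticks and exploits the QTable on the remaining 70%.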

    def explore(self, state, game_state, player_state):
        '''
        This method returns an exploratory move. With probability GUIDANCE the move is copied
        from the coach; otherwise a random move is returned, with a reduced probability of
        placing a bomb so that the agent does not kill itself too often.
        '''
        y = random.random()
        if y < GUIDANCE:
            guided_move = COACH.next_move(game_state, player_state)
            return ['', 'u', 'd', 'l', 'r', 'p'].index(guided_move)
        else:
            x = random.random()
            if x > 0.99:
                return 5  # place a bomb
            else:
                return random.randint(0, 4)  # stay, up, down, left or right
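
    # In the unguided branch, the bomb action (index 5) is taken only when
    # x > 0.99, i.e. about 1% of the time; the other ~99% is split uniformly
    # over the five non-bomb actions (indices 0-4).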

    def exploit(self, state):
        '''
        This method chooses the best move learned so far, as given by the QTable.
        '''
        if state not in self.Q_Table:
            return random.randint(0, 5)
        return np.argmax(self.Q_Table[state])

    def display_agent_result(self):
        print("Game Number \t \t : \t", self.episode_number)
        print("Reward Earned \t \t : \t", self.cumulative_reward)
        print("Epsilon \t \t : \t", self.epsilon)
        print("_____________________________________________________________________________________________________")