I’m working on a DQN implementation in TensorFlow to solve the LunarLander-v2 environment, but my agent isn’t learning as expected: training looks promising for the first few thousand episodes, and then performance drops off sharply. Here is my code:
import numpy as np
import gym
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import optimizers
def should_explore(exploration_rate):
    """Return True with probability exploration_rate (epsilon-greedy)."""
    return np.random.uniform() < exploration_rate

def estimate_q_value(state, action_idx):
    """Predict Q(state, action): the action is one-hot encoded and
    concatenated onto the state before being fed to the network."""
    state_action = np.concatenate((state, action_encoding[action_idx]), axis=0)
    prediction = neural_net.predict(state_action.reshape(1, -1), verbose=0)
    return prediction[0][0]
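# NOTE: with this state-action input layout, every greedy action selection
# costs action_count separate predict() calls. A common alternative layout
# (sketch only, not what the code below uses) takes just the state as input
# and emits one Q-value per action, so a single forward pass scores them all:
#   q_net = Sequential([
#       Dense(256, activation='relu', input_dim=state_size),
#       Dense(action_count),
#   ])
#   q_values = q_net.predict(state.reshape(1, -1))[0]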
# Hyperparameters
scores = []
exploration_prob = 0.1      # epsilon for epsilon-greedy action selection
learning_rate = 0.001
discount_factor = 0.95      # gamma for discounted returns
state_size = 8              # LunarLander-v2 observation dimension
action_count = 4            # LunarLander-v2 discrete action count
warmup_episodes = 20        # episodes of purely random actions before the net is used
memory_capacity = 50000     # maximum transitions kept in the experience buffer
training_epochs = 2
total_runs = 10000

# One-hot encoding for each of the four discrete actions
action_encoding = np.eye(action_count)
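# e.g. action_encoding[2] == array([0., 0., 1., 0.])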
environment = gym.make('LunarLander-v2')

# The network maps a concatenated (state, one-hot action) vector to a single
# scalar Q-value estimate
neural_net = Sequential()
neural_net.add(Dense(256, activation='relu', input_dim=state_size + action_count))
neural_net.add(Dense(1))
optimizer = optimizers.Adam(learning_rate=learning_rate)
neural_net.compile(loss='mse', optimizer=optimizer)
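# The training targets below are the observed discounted Monte Carlo returns
# for each (state, action) pair, not bootstrapped one-step TD targets.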
# Experience buffer: rows of (state ++ one-hot action) with their return targets
experience_states = np.zeros(shape=(0, state_size + action_count))
experience_rewards = np.zeros(shape=(0, 1))
max_episode_steps = 1000
for episode_num in range(total_runs):
    episode_states = np.zeros(shape=(0, state_size + action_count))
    episode_rewards = np.zeros(shape=(0, 1))
    current_state = environment.reset()
    total_reward = 0
    for step in range(max_episode_steps):
        # Purely random actions during warmup, epsilon-greedy afterwards
        if episode_num < warmup_episodes or should_explore(exploration_prob):
            chosen_action = environment.action_space.sample()
        else:
            q_values = np.array([estimate_q_value(current_state, j)
                                 for j in range(action_count)])
            chosen_action = np.argmax(q_values)
        state_action_pair = np.concatenate((current_state, action_encoding[chosen_action]), axis=0)
        next_state, reward, done, info = environment.step(chosen_action)
        total_reward += reward
        # Record this step's (state, action) pair and its immediate reward
        episode_states = np.vstack((episode_states, state_action_pair))
        episode_rewards = np.vstack((episode_rewards, np.array([reward])))
        if done:
            # Convert per-step rewards into discounted returns, working
            # backwards from the terminal step
            for k in range(1, episode_rewards.shape[0]):
                episode_rewards[-(k + 1)][0] += discount_factor * episode_rewards[-k][0]
            experience_states = np.concatenate((experience_states, episode_states), axis=0)
            experience_rewards = np.concatenate((experience_rewards, episode_rewards), axis=0)
            # Evict the oldest transitions once the buffer exceeds its capacity
            if len(experience_states) > memory_capacity:
                overflow = len(experience_states) - memory_capacity
                experience_states = experience_states[overflow:]
                experience_rewards = experience_rewards[overflow:]
        current_state = next_state
        if done:
            # Refit on the whole buffer every 15th episode once warmup is over
            if episode_num >= warmup_episodes and episode_num % 15 == 0:
                neural_net.fit(experience_states, experience_rewards,
                               batch_size=64, epochs=training_epochs, verbose=0)
            scores.append(total_reward)
            break
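To make the reward post-processing concrete, here is the backward pass in isolation on a made-up three-step episode (the reward values are illustrative only):

import numpy as np

rewards = np.array([[1.0], [2.0], [3.0]])   # hypothetical per-step rewards
gamma = 0.95
for k in range(1, rewards.shape[0]):
    rewards[-(k + 1)][0] += gamma * rewards[-k][0]
# rewards is now [[5.6075], [4.85], [3.0]]:
#   G2 = 3.0
#   G1 = 2.0 + 0.95 * 3.0  = 4.85
#   G0 = 1.0 + 0.95 * 4.85 = 5.6075
print(rewards)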
The agent improves at first, but its performance collapses after a few thousand episodes. I have tried adjusting the hyperparameters, but I haven’t managed to get stable learning. Any suggestions on what might be causing this instability?