I’m working on a reinforcement learning project that uses Keras to train an agent to play the LunarLander environment from OpenAI Gym. The network starts showing progress (better rewards) around episode 5000, but then performance suddenly drops and keeps getting worse instead of improving.
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers
def should_explore(exploration_rate):
    """Epsilon-greedy coin flip: True with probability `exploration_rate`."""
    sample = np.random.random()
    return sample < exploration_rate
def predict_q_value(state, action):
    """Return Q(state, action): the network's scalar output for the
    concatenation of `state` with the one-hot encoding of `action`.

    Relies on module-level globals: `network`, `action_encoding`,
    `state_size`, and `action_count`.
    """
    features = np.concatenate((state, action_encoding[action]), axis=0)
    # Route through a float64 batch row, matching the original zeros-buffer
    # staging, then predict on a (1, state_size + action_count) batch.
    batch = np.zeros(shape=(1, state_size + action_count))
    batch[0] = features
    return network.predict(batch)[0][0]
# Training parameters
exploration_prob = 0.05          # epsilon for epsilon-greedy action selection
learning_rate = 0.003
# NOTE(review): 0.3 is an unusually short-sighted discount for LunarLander
# (0.99 is typical) — the targets barely see future reward. Kept as-is to
# preserve behavior, but this is a prime suspect for the instability.
discount_factor = 0.3
state_size = 8                   # LunarLander-v2 observation vector length
action_count = 4                 # LunarLander-v2 discrete action count
warmup_episodes = 15             # episodes of pure random exploration
memory_size = 100000             # replay-memory row cap
training_epochs = 3              # epochs per network.fit call
total_training_episodes = 15000
# Create action encoding matrix: row i is the one-hot encoding of action i.
actions_available = np.arange(0, action_count)
action_encoding = np.zeros((action_count, action_count))
action_encoding[np.arange(action_count), actions_available] = 1
# Initialize environment
env = gym.make('LunarLander-v2')
env.reset()
# Build neural network: Q(s, a) regressor over [state ++ one-hot action].
network = Sequential()
network.add(Dense(512, activation='relu', input_dim=state_size + action_count))
network.add(Dense(1))  # single scalar Q-value output
# Fixed: `optimizers.adam` is a removed lowercase alias and `lr` is the
# deprecated keyword; current Keras uses the Adam class with `learning_rate`.
optimizer = optimizers.Adam(learning_rate=learning_rate)
# Fixed: dropped metrics=['accuracy'] — accuracy is meaningless for scalar
# MSE regression and only added noise to the training logs.
network.compile(loss='mse', optimizer=optimizer)
# Training loop
# Total undiscounted reward per finished episode, for monitoring progress.
episode_scores = []
# Replay memory starts as a single all-zero placeholder row; it is replaced
# by real data when the first episode finishes (see the shape[0] == 1 check).
memory_states = np.zeros(shape=(1, state_size + action_count))
memory_targets = np.zeros(shape=(1, 1))
max_episode_steps = 40000
for episode_num in range(total_training_episodes):
    # Per-episode buffers, also seeded with one placeholder row.
    episode_states = np.zeros(shape=(1, state_size + action_count))
    episode_rewards = np.zeros(shape=(1, 1))
    current_state = env.reset()
    total_reward = 0
    for step in range(max_episode_steps):
        if episode_num < warmup_episodes:
            # Warm-up: act uniformly at random to seed the replay memory.
            chosen_action = env.action_space.sample()
        else:
            if should_explore(exploration_prob):
                # Epsilon-greedy exploration branch.
                chosen_action = env.action_space.sample()
            else:
                # Greedy branch: evaluate Q(s, a) once per discrete action.
                q_values = np.zeros(shape=action_count)
                for action_idx in range(action_count):
                    q_values[action_idx] = predict_q_value(current_state, action_idx)
                chosen_action = np.argmax(q_values)
        # Network input row: state concatenated with the one-hot action.
        state_action_pair = np.concatenate((current_state, action_encoding[chosen_action]), axis=0)
        next_state, reward, done, info = env.step(chosen_action)
        total_reward += reward
        if step == 0:
            # Overwrite the placeholder rows with the first transition.
            episode_states[0] = state_action_pair
            episode_rewards[0] = np.array([reward])
            memory_states[0] = state_action_pair
            memory_targets[0] = np.array([reward])
        # NOTE(review): this append also runs on step 0, so the first
        # transition appears TWICE in the episode buffers — looks like a bug;
        # confirm whether these two lines were meant to be in an `else`.
        episode_states = np.vstack((episode_states, state_action_pair))
        episode_rewards = np.vstack((episode_rewards, np.array([reward])))
        if done:
            # Apply discounted rewards
            # Backward pass turning immediate rewards into discounted
            # Monte-Carlo returns: G[t] = r[t] + discount_factor * G[t+1].
            # NOTE(review): these full-episode returns are used directly as
            # regression targets — there is no bootstrapped
            # r + gamma * max_a Q(s', a) target, so this is Monte-Carlo
            # regression rather than Q-learning; combined with the very low
            # discount_factor this is a plausible cause of the reported
            # training instability.
            for i in range(episode_rewards.shape[0]):
                if i == 0:
                    # Terminal step: return equals the final reward (no-op).
                    episode_rewards[-(i+1)][0] = episode_rewards[-(i+1)][0]
                else:
                    episode_rewards[-(i+1)][0] = episode_rewards[-(i+1)][0] + discount_factor * episode_rewards[-i][0]
            # Update memory buffer
            if memory_states.shape[0] == 1:
                # First finished episode: replace the placeholder memory.
                memory_states = episode_states
                memory_targets = episode_rewards
            else:
                memory_states = np.concatenate((memory_states, episode_states), axis=0)
                memory_targets = np.concatenate((memory_targets, episode_rewards), axis=0)
            # Limit memory size
            # FIFO eviction of the oldest rows. NOTE(review): np.delete
            # copies the whole array per removed row — O(rows) per deletion;
            # a single slice would do this in one pass.
            if len(memory_states) >= memory_size:
                for j in range(len(episode_states)):
                    memory_states = np.delete(memory_states, 0, axis=0)
                    memory_targets = np.delete(memory_targets, 0, axis=0)
            break
        current_state = next_state
    # Train network
    # Refit on the entire replay memory every 10th post-warm-up episode.
    # NOTE(review): `done` here is the flag left over from the step loop; if
    # the episode hit max_episode_steps without terminating, it is neither
    # trained on nor scored — confirm that is intentional.
    if done and episode_num >= warmup_episodes and episode_num % 10 == 0:
        network.fit(memory_states, memory_targets, batch_size=32, epochs=training_epochs, verbose=0)
    if done:
        episode_scores.append(total_reward)
I’ve been running this for thousands of episodes but can’t get stable convergence: the agent improves at first, then performance degrades. I’ve tried adjusting the hyperparameters, but nothing has worked. Any ideas what might be causing this instability?