I’m working on a reinforcement learning project where I built a DQN agent to master Breakout using the RAM observations. The agent runs through episodes but doesn’t seem to get better at the game even after training for many episodes.
I’ve tried adjusting different hyperparameters like batch size and replay buffer size. I also modified the reward system to penalize the agent when it loses a life. However, the learning progress is still not happening.
Here’s my implementation:
import gym
import numpy as np
import random
import tensorflow as tf
from collections import deque
class DQNAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size
self.action_size = action_size
self.replay_buffer = deque(maxlen=50000)
self.discount_factor = 0.95
self.exploration_rate = 1.0
self.min_exploration = 0.01
self.exploration_decay = 0.995
# Build neural network
self.q_network = self.build_network()
def build_network(self):
network = tf.keras.Sequential([
tf.keras.layers.Dense(512, activation='relu', input_shape=(self.state_size,)),
tf.keras.layers.Dense(256, activation='relu'),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Dense(self.action_size, activation='linear')
])
network.compile(optimizer='adam', loss='mse')
return network
def store_experience(self, state, action, reward, next_state, terminal):
self.replay_buffer.append((state, action, reward, next_state, terminal))
def choose_action(self, state):
if np.random.random() <= self.exploration_rate:
return random.choice(range(self.action_size))
q_values = self.q_network.predict(state.reshape(1, -1), verbose=0)
return np.argmax(q_values[0])
def train_model(self, batch_size=64):
if len(self.replay_buffer) < batch_size:
return
batch = random.sample(self.replay_buffer, batch_size)
states = np.array([experience[0] for experience in batch])
actions = np.array([experience[1] for experience in batch])
rewards = np.array([experience[2] for experience in batch])
next_states = np.array([experience[3] for experience in batch])
terminals = np.array([experience[4] for experience in batch])
current_q = self.q_network.predict(states, verbose=0)
next_q = self.q_network.predict(next_states, verbose=0)
for i in range(batch_size):
if terminals[i]:
current_q[i][actions[i]] = rewards[i]
else:
current_q[i][actions[i]] = rewards[i] + self.discount_factor * np.max(next_q[i])
self.q_network.fit(states, current_q, verbose=0, epochs=1)
if self.exploration_rate > self.min_exploration:
self.exploration_rate *= self.exploration_decay
def run_training():
environment = gym.make('Breakout-ram-v4')
agent = DQNAgent(128, environment.action_space.n)
episodes = 1000
for episode in range(episodes):
current_state = environment.reset()
total_reward = 0
lives = 5
while True:
action = agent.choose_action(current_state)
new_state, reward, done, info = environment.step(action)
# Penalty for losing life
if info['ale.lives'] < lives:
lives = info['ale.lives']
reward = -10
agent.store_experience(current_state/255.0, action, reward, new_state/255.0, done)
current_state = new_state
total_reward += reward
if done:
break
agent.train_model()
print(f"Episode: {episode}, Score: {total_reward}, Epsilon: {agent.exploration_rate:.3f}")
if __name__ == '__main__':
run_training()
The agent runs without errors but shows no improvement in performance. What could be preventing the learning process from working properly?