Deep Q-Learning agent fails to improve on Atari Breakout RAM environment

I’m working on a reinforcement learning project where I built a DQN agent to master Breakout using the RAM observations. The agent runs through episodes but doesn’t seem to get better at the game even after training for many episodes.

I’ve tried adjusting different hyperparameters like batch size and replay buffer size. I also modified the reward system to penalize the agent when it loses a life. However, the learning progress is still not happening.

Here’s my implementation:

import gym
import numpy as np
import random
import tensorflow as tf
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.replay_buffer = deque(maxlen=50000)
        self.discount_factor = 0.95
        self.exploration_rate = 1.0
        self.min_exploration = 0.01
        self.exploration_decay = 0.995
        
        # Build neural network
        self.q_network = self.build_network()
        
    def build_network(self):
        network = tf.keras.Sequential([
            tf.keras.layers.Dense(512, activation='relu', input_shape=(self.state_size,)),
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        network.compile(optimizer='adam', loss='mse')
        return network
    
    def store_experience(self, state, action, reward, next_state, terminal):
        self.replay_buffer.append((state, action, reward, next_state, terminal))
    
    def choose_action(self, state):
        if np.random.random() <= self.exploration_rate:
            return random.choice(range(self.action_size))
        
        q_values = self.q_network.predict(state.reshape(1, -1), verbose=0)
        return np.argmax(q_values[0])
    
    def train_model(self, batch_size=64):
        if len(self.replay_buffer) < batch_size:
            return
            
        batch = random.sample(self.replay_buffer, batch_size)
        states = np.array([experience[0] for experience in batch])
        actions = np.array([experience[1] for experience in batch])
        rewards = np.array([experience[2] for experience in batch])
        next_states = np.array([experience[3] for experience in batch])
        terminals = np.array([experience[4] for experience in batch])
        
        current_q = self.q_network.predict(states, verbose=0)
        next_q = self.q_network.predict(next_states, verbose=0)
        
        for i in range(batch_size):
            if terminals[i]:
                current_q[i][actions[i]] = rewards[i]
            else:
                current_q[i][actions[i]] = rewards[i] + self.discount_factor * np.max(next_q[i])
        
        self.q_network.fit(states, current_q, verbose=0, epochs=1)
        
        if self.exploration_rate > self.min_exploration:
            self.exploration_rate *= self.exploration_decay

def run_training():
    environment = gym.make('Breakout-ram-v4')
    agent = DQNAgent(128, environment.action_space.n)
    
    episodes = 1000
    
    for episode in range(episodes):
        current_state = environment.reset()
        total_reward = 0
        lives = 5
        
        while True:
            action = agent.choose_action(current_state)
            new_state, reward, done, info = environment.step(action)
            
            # Penalty for losing life
            if info['ale.lives'] < lives:
                lives = info['ale.lives']
                reward = -10
            
            agent.store_experience(current_state/255.0, action, reward, new_state/255.0, done)
            current_state = new_state
            total_reward += reward
            
            if done:
                break
        
        agent.train_model()
        print(f"Episode: {episode}, Score: {total_reward}, Epsilon: {agent.exploration_rate:.3f}")

if __name__ == '__main__':
    run_training()

The agent runs without errors but shows no improvement in performance. What could be preventing the learning process from working properly?

you’re missing a target network! dqn needs a separate target network that updates less frequently - that’s what stabilizes training. also, drop your learning rate and train more often. try every 4 steps instead of waiting for the episode to end.