CUDA Memory Management for Deep Learning#
Efficient GPU memory management is critical for deep learning applications. Memory mismanagement leads to out-of-memory errors, performance degradation, and wasted resources. This guide covers the fundamentals of CUDA memory and best practices for deep learning workloads.
GPU Memory Hierarchy#
Understanding the memory hierarchy is fundamental to optimization:
┌─────────────────────────────────────────┐
│ Registers (per-thread) │ ~1 cycle
│ - Fastest, smallest │
│ - ~256 KB per SM │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ Shared Memory / L1 Cache │ ~30 cycles
│ - Shared within thread block │
│ - ~100 KB per SM (configurable) │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ L2 Cache │ ~200 cycles
│ - Shared across all SMs │
│ - 40-60 MB (GPU dependent) │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ High-Bandwidth Memory (HBM) │ ~400-600 cycles
│ - Main GPU memory │
│ - 40-80 GB (A100/H100) │
│ - Bandwidth: 2-3 TB/s │
└─────────────────────────────────────────┘
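The practical takeaway: data a thread block reuses should be staged on-chip instead of re-read from HBM. As a minimal, illustrative sketch (not from the original text; it assumes a block size of 256 threads), here is a block-level reduction that works entirely out of shared memory after a single read from HBM:
__global__ void blockSum(const float* in, float* out, int N) {
    // One tile per block, staged in fast on-chip shared memory
    __shared__ float tile[256];                 // assumes blockDim.x == 256
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = (idx < N) ? in[idx] : 0.0f;
    __syncthreads();                            // tile now visible to the whole block
    // Tree reduction entirely in shared memory (no extra HBM traffic)
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            tile[threadIdx.x] += tile[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        out[blockIdx.x] = tile[0];              // one result per block
    }
}
Memory Allocation Basics#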
cudaMalloc and cudaFree#
// Basic allocation
float* d_data;
cudaMalloc(&d_data, size * sizeof(float));
// Always check for errors
cudaError_t err = cudaMalloc(&d_data, size * sizeof(float));
if (err != cudaSuccess) {
printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
}
// Free when done
cudaFree(d_data);
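Checking every call by hand gets verbose. A common pattern is a small error-checking macro wrapped around each runtime call; this is a sketch, not a CUDA-provided API:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err__ = (call);                                 \
        if (err__ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err__), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
Pinned (Page-Locked) Host Memory#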
float* h_data;
cudaMallocHost(&h_data, size * sizeof(float)); // Pinned memory
// Benefits:
// - Faster host-device transfers (can use DMA)
// - Required for async transfers
// - 2-3x faster than pageable memory
cudaFreeHost(h_data);
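Pinned buffers pair naturally with asynchronous copies. A minimal sketch (the stream and buffer names are illustrative, following the snippet above):
cudaStream_t stream;
cudaStreamCreate(&stream);

// Pinned host buffer + device buffer
float *h_data, *d_data;
cudaMallocHost(&h_data, size * sizeof(float));
cudaMalloc(&d_data, size * sizeof(float));

// The copy can overlap with work queued on other streams
cudaMemcpyAsync(d_data, h_data, size * sizeof(float),
                cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);   // wait only for this stream's work

cudaFree(d_data);
cudaFreeHost(h_data);
cudaStreamDestroy(stream);
Memory Pools for Deep Learning#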
The Problem with Naive Allocation#
# BAD: Allocating inside the training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        temp = torch.zeros(batch_size, hidden_dim).cuda()  # Allocation every iteration!
        output = model(batch)
        # ... training code
Each cudaMalloc call has overhead (often hundreds of microseconds to a millisecond, and it can force a device-wide synchronization). In a tight loop, this adds up.
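One CUDA-level mitigation, separate from the pool shown next, is the stream-ordered allocator available since CUDA 11.2 (cudaMallocAsync/cudaFreeAsync), which serves requests from a driver-managed pool. A minimal sketch, with the kernel and launch configuration as placeholders:
// Allocation and free are ordered on the stream and served from a pool,
// so repeated alloc/free in a loop avoids most cudaMalloc overhead.
float* d_tmp;
cudaMallocAsync((void**)&d_tmp, size * sizeof(float), stream);
myKernel<<<blocks, threads, 0, stream>>>(d_tmp, N);
cudaFreeAsync(d_tmp, stream);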
Memory Pool Solution#
class MemoryPool {
private:
struct Block {
void* ptr;
size_t size;
bool in_use;
};
std::vector<Block> blocks;
public:
void* allocate(size_t size) {
// Try to find existing block
for (auto& block : blocks) {
if (!block.in_use && block.size >= size) {
block.in_use = true;
return block.ptr;
}
}
// Allocate new block
void* ptr;
cudaMalloc(&ptr, size);
blocks.push_back({ptr, size, true});
return ptr;
}
void deallocate(void* ptr) {
for (auto& block : blocks) {
if (block.ptr == ptr) {
block.in_use = false;
return;
}
}
}
};
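A usage sketch for the pool above (num_steps, batch_size, and hidden_dim are illustrative): the first iteration allocates, later iterations reuse the same block.
MemoryPool pool;
for (int step = 0; step < num_steps; ++step) {
    // First iteration hits cudaMalloc; subsequent iterations reuse the block
    float* d_scratch = static_cast<float*>(
        pool.allocate(batch_size * hidden_dim * sizeof(float)));
    // ... launch kernels that use d_scratch ...
    pool.deallocate(d_scratch);
}
PyTorch's caching allocator does this automatically: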
# Check PyTorch's memory management
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
# Clear cache if needed
torch.cuda.empty_cache()
Memory Coalescing#
Why Coalescing Matters#
Memory accesses are most efficient when consecutive threads access consecutive memory addresses:
// GOOD: Coalesced access - threads 0-31 access consecutive elements
__global__ void coalesced(float* data, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
float val = data[idx]; // Threads access data[0], data[1], ...
}
}
// BAD: Strided access - threads 0-31 access every 32nd element
__global__ void strided(float* data, int N, int stride) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx * stride < N) {
float val = data[idx * stride]; // Threads access data[0], data[32], ...
}
}
Structure of Arrays vs Array of Structures#
// Array of Structures (AoS) - BAD for GPU
struct Particle {
float x, y, z;
float vx, vy, vz;
};
Particle particles[N];
// Structure of Arrays (SoA) - GOOD for GPU
struct ParticlesSoA {
float *x, *y, *z;
float *vx, *vy, *vz;
};
SoA enables coalesced access when processing one property across all particles.
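For example, a kernel that advances particle positions reads only the arrays it needs, and consecutive threads touch consecutive floats in each array. A sketch using the SoA layout above (dt is an illustrative timestep):
__global__ void integrate(ParticlesSoA p, float dt, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        // Consecutive threads read consecutive elements of each array: coalesced
        p.x[i] += p.vx[i] * dt;
        p.y[i] += p.vy[i] * dt;
        p.z[i] += p.vz[i] * dt;
    }
}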
Unified Memory#
When to Use#
float* data;
cudaMallocManaged(&data, N * sizeof(float));
// CPU can access it
for (int i = 0; i < N; i++) {
data[i] = static_cast<float>(i);
}
// GPU can access it
myKernel<<<blocks, threads>>>(data, N);
cudaDeviceSynchronize();
cudaFree(data);
Use Unified Memory for:
- Rapid prototyping
- Large datasets that don't fit in GPU memory
- Sparse access patterns
Avoid for:
- High-frequency small transfers
- Training loops (page fault overhead)
- Predictable access patterns (explicit transfers are faster)
Prefetching#
// Prefetch to GPU before kernel launch
cudaMemPrefetchAsync(data, size, device_id, stream);
// Advise the driver about access patterns
cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, device_id);
cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, device_id);
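A typical round trip, sketched with the same data, size, and device_id as above (the kernel and launch configuration are placeholders): prefetch to the GPU before the launch, then back to the host before the CPU reads the results, so neither side pays page-fault costs.
// Move pages to the GPU ahead of the kernel launch
cudaMemPrefetchAsync(data, size, device_id, stream);
myKernel<<<blocks, threads, 0, stream>>>(data, N);

// Bring results back to host memory before the CPU touches them
cudaMemPrefetchAsync(data, size, cudaCpuDeviceId, stream);
cudaStreamSynchronize(stream);
Memory Profiling#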
PyTorch Profiling#
import torch
from torch.profiler import profile, ProfilerActivity
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    profile_memory=True
) as prof:
    output = model(input_data)
    loss = criterion(output, target)
    loss.backward()
print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=10))
Memory Tracking#
def monitor_memory():
    torch.cuda.reset_peak_memory_stats()
    # Training code here
    print(f"Peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
    print(f"Current memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"Cached memory: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
CUDA Tools#
# Nsight Systems for timeline profiling
nsys profile --stats=true ./my_training_app
# Nsight Compute for kernel-level metrics (DRAM throughput and L1/tex hit rate)
ncu --metrics dram__throughput.avg.pct_of_peak_sustained_elapsed,l1tex__t_sector_hit_rate.pct ./my_app
# Memory leak detection (compute-sanitizer supersedes the deprecated cuda-memcheck)
compute-sanitizer --leak-check full ./my_app
Common Pitfalls#
Memory Leaks#
// LEAK: Forgetting to free
void memoryLeakExample() {
for (int i = 0; i < 1000; i++) {
float* d_temp;
cudaMalloc(&d_temp, 1024 * sizeof(float));
processData<<<...>>>(d_temp);
// Missing cudaFree(d_temp)!
}
}
// FIX: RAII wrapper
template<typename T>
class CudaMemory {
T* ptr = nullptr;
public:
CudaMemory(size_t n) { cudaMalloc(&ptr, n * sizeof(T)); }
~CudaMemory() { if (ptr) cudaFree(ptr); }
// Non-copyable: copying the wrapper would lead to a double free
CudaMemory(const CudaMemory&) = delete;
CudaMemory& operator=(const CudaMemory&) = delete;
T* get() { return ptr; }
};
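With the wrapper, the leaky loop above frees automatically when each iteration's object goes out of scope (a sketch; the launch configuration is illustrative):
void noLeakExample() {
    for (int i = 0; i < 1000; i++) {
        CudaMemory<float> d_temp(1024);               // cudaMalloc in constructor
        processData<<<blocks, threads>>>(d_temp.get());
        // ~CudaMemory() calls cudaFree at the end of each iteration
    }
}
Memory Fragmentation#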
# Fragmentation from allocate/free cycles
# Solution: Pre-allocate workspace
workspace = torch.zeros(max_batch_size, hidden_dim, device='cuda')
for batch in dataloader:
    # Reuse workspace instead of allocating
    batch_workspace = workspace[:batch.size(0)]
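Framework caching allocators keep freed blocks around for the same reason. If you manage memory directly with CUDA's stream-ordered allocator instead, its pool can be told to retain freed memory rather than return it to the OS; a sketch (device_id is illustrative):
// Keep freed blocks in the pool so later cudaMallocAsync calls reuse them
cudaMemPool_t pool;
cudaDeviceGetDefaultMemPool(&pool, device_id);
uint64_t threshold = UINT64_MAX;   // never trim the pool automatically
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
Out-of-Memory Errors#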
# Solution 1: Reduce batch size
# Solution 2: Gradient accumulation
optimizer.zero_grad()
for i, batch in enumerate(dataloader):
    loss = model(batch) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Solution 3: Gradient checkpointing
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedModel(nn.Module):
    def forward(self, x):
        # self.layer1 / self.layer2 are defined in __init__ (omitted here)
        x = checkpoint(self.layer1, x, use_reentrant=False)
        x = checkpoint(self.layer2, x, use_reentrant=False)
        return x
Production Best Practices#
Pre-allocate Workspaces#
class TrainingSession {
    float* d_weights;
    float* d_gradients;
    float* d_activations;
public:
    TrainingSession(size_t num_params, size_t max_batch_size, size_t hidden) {
        cudaMalloc(&d_weights, num_params * sizeof(float));
        cudaMalloc(&d_gradients, num_params * sizeof(float));
        cudaMalloc(&d_activations, max_batch_size * hidden * sizeof(float));
    }
    ~TrainingSession() {
        // Free everything exactly once when the session ends
        cudaFree(d_weights);
        cudaFree(d_gradients);
        cudaFree(d_activations);
    }
    void trainEpoch(DataLoader& loader) {
        for (auto& batch : loader) {
            // No allocations in the hot path!
            forward(batch, d_weights, d_activations);
            backward(d_weights, d_gradients, d_activations);
            update(d_weights, d_gradients);
        }
    }
};
Memory Budget Management#
class MemoryBudget:
    def __init__(self, fraction=0.9):
        free, total = torch.cuda.mem_get_info()
        self.budget = int(total * fraction)
        self.allocated = 0

    def can_allocate(self, bytes):
        return self.allocated + bytes <= self.budget

    def allocate(self, bytes):
        if not self.can_allocate(bytes):
            raise RuntimeError("Memory budget exceeded")
        self.allocated += bytes
Asynchronous Memory Operations#
// Overlap computation with data transfer
void pipelinedTraining(int num_batches) {
cudaStream_t streams[3];
for (int i = 0; i < 3; i++) {
cudaStreamCreate(&streams[i]);
}
for (int batch = 0; batch < num_batches; batch++) {
int stream_idx = batch % 3;
cudaStream_t stream = streams[stream_idx];
// Transfer batch to device (async); in practice use a separate pinned
// h_batch/d_batch buffer per stream so copies and kernels actually overlap
cudaMemcpyAsync(d_batch, h_batch, size,
cudaMemcpyHostToDevice, stream);
// Process batch (async)
processKernel<<<blocks, threads, 0, stream>>>(d_batch);
}
// Wait for all streams
cudaDeviceSynchronize();
}
Conclusion#
Effective CUDA memory management requires understanding:
- Memory hierarchy: Registers → Shared → L2 → HBM
- Allocation patterns: Pre-allocate, use pools, avoid in-loop allocation
- Access patterns: Coalesced access, SoA layout
- Profiling: Use PyTorch profiler, Nsight tools
- Common pitfalls: Leaks, fragmentation, OOM
Master these fundamentals and you'll avoid the majority of GPU memory issues in deep learning applications.