CUDA Memory Management for Deep Learning
Efficient GPU memory management is critical for deep learning applications. Memory mismanagement leads to out-of-memory errors, performance degradation, and wasted resources. This guide covers the fundamentals of CUDA memory and best practices for deep learning workloads.
GPU Memory Hierarchy
Understanding the memory hierarchy is fundamental to optimization:
┌─────────────────────────────────────────┐
│ Registers (per-thread) │ ~1 cycle
│ - Fastest, smallest │
│ - ~256 KB per SM │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ Shared Memory / L1 Cache │ ~30 cycles
│ - Shared within thread block │
│ - ~100 KB per SM (configurable) │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ L2 Cache │ ~200 cycles
│ - Shared across all SMs │
│ - 40-60 MB (GPU dependent) │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ High-Bandwidth Memory (HBM) │ ~400-600 cycles
│ - Main GPU memory │
│ - 40-80 GB (A100/H100) │
│ - Bandwidth: 2-3 TB/s │
└─────────────────────────────────────────┘
Memory Allocation Basics
cudaMalloc and cudaFree
// Basic allocation
float* d_data;
cudaMalloc(&d_data, size * sizeof(float));
// Better: always check for errors (an alternative to the bare call above —
// don't run both back to back, or the first allocation is leaked)
cudaError_t err = cudaMalloc(&d_data, size * sizeof(float));
if (err != cudaSuccess) {
printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
}
// Free when done
cudaFree(d_data);
Pinned (Page-Locked) Host Memory
float* h_data;
cudaMallocHost(&h_data, size * sizeof(float)); // Pinned memory
// Benefits:
// - Faster host-device transfers (can use DMA)
// - Required for async transfers
// - 2-3x faster than pageable memory
cudaFreeHost(h_data);
Memory Pools for Deep Learning
The Problem with Naive Allocation
# BAD: Allocating inside training loop
for epoch in range(num_epochs):
for batch in dataloader:
temp = torch.zeros(batch_size, hidden_dim).cuda() # Allocation!
output = model(batch)
# ... training code
Each cudaMalloc call has real overhead (typically tens of microseconds up to ~1 ms, and it can implicitly synchronize the device). Inside a tight training loop, this adds up quickly.
Memory Pool Solution
class MemoryPool {
private:
struct Block {
void* ptr;
size_t size;
bool in_use;
};
std::vector<Block> blocks;
public:
void* allocate(size_t size) {
// Try to find existing block
for (auto& block : blocks) {
if (!block.in_use && block.size >= size) {
block.in_use = true;
return block.ptr;
}
}
// Allocate new block
void* ptr;
cudaMalloc(&ptr, size);
blocks.push_back({ptr, size, true});
return ptr;
}
void deallocate(void* ptr) {
for (auto& block : blocks) {
if (block.ptr == ptr) {
block.in_use = false;
return;
}
}
}
};
PyTorch's caching allocator does this automatically:
# Check PyTorch's memory management
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
# Clear cache if needed
torch.cuda.empty_cache()
Memory Coalescing
Why Coalescing Matters
Memory accesses are most efficient when consecutive threads access consecutive memory addresses:
// GOOD: Coalesced access — the 32 threads of a warp read 32 consecutive
// floats, so their loads merge into one (or very few) memory transactions.
__global__ void coalesced(float* data, int N) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        float val = data[i];  // warp 0 touches data[0..31], warp 1 data[32..63], ...
        (void)val;            // example only: the access pattern is the point
    }
}
// BAD: Strided access — adjacent threads read addresses `stride` floats
// apart, so a single warp can touch up to 32 separate memory segments
// and each transaction wastes most of its bytes.
__global__ void strided(float* data, int N, int stride) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    const int offset = i * stride;
    if (offset < N) {
        float val = data[offset];  // e.g. stride=32: data[0], data[32], data[64], ...
        (void)val;                 // example only: the access pattern is the point
    }
}
Structure of Arrays vs Array of Structures
// Array of Structures (AoS) - BAD for GPU: when a warp reads one field
// (e.g. particles[i].x for 32 consecutive i), the loads land 24 bytes
// apart, so most of each fetched cache line is wasted.
struct Particle {
float x, y, z;
float vx, vy, vz;
};
Particle particles[N]; // N assumed to be a visible constant — TODO confirm
// Structure of Arrays (SoA) - GOOD for GPU: x[i..i+31] are contiguous,
// so a warp processing one property coalesces into few transactions.
struct ParticlesSoA {
float *x, *y, *z;   // position components, one contiguous array each
float *vx, *vy, *vz; // velocity components, likewise
};
SoA enables coalesced access when processing one property across all particles.
Unified Memory
When to Use
// Unified (managed) memory: one pointer usable from both host and device;
// the driver migrates data on demand. `size`, `N`, `myKernel`, `blocks`,
// and `threads` are assumed defined by surrounding code — TODO confirm.
float* data;
cudaMallocManaged(&data, size * sizeof(float));
// CPU can access it directly through the same pointer
for (int i = 0; i < N; i++) {
data[i] = static_cast<float>(i);
}
// GPU can access it too (migration/page-fault cost applies)
myKernel<<<blocks, threads>>>(data, N);
cudaDeviceSynchronize(); // kernel launch is async; sync before host reads results
cudaFree(data); // cudaFree also releases managed allocations
Use Unified Memory for:
- Rapid prototyping
- Large datasets that don't fit in GPU memory
- Sparse access patterns
Avoid for:
- High-frequency small transfers
- Training loops (page fault overhead)
- Predictable access patterns (explicit transfers are faster)
Prefetching
// Prefetch to GPU before kernel launch
cudaMemPrefetchAsync(data, size, device_id, stream);
// Advise the driver about access patterns
cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, device_id);
cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, device_id);
Memory Profiling
PyTorch Profiling
import torch
from torch.profiler import profile, ProfilerActivity
with profile(
activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
profile_memory=True
) as prof:
output = model(input_data)
loss = criterion(output, target)
loss.backward()
print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=10))
Memory Tracking
def monitor_memory():
    """Print peak, current, and cached CUDA memory for the training code run here."""
    # Clear the peak counter so the reported peak reflects only what follows.
    torch.cuda.reset_peak_memory_stats()
    # Training code here
    peak_gb = torch.cuda.max_memory_allocated() / 1e9
    current_gb = torch.cuda.memory_allocated() / 1e9
    cached_gb = torch.cuda.memory_reserved() / 1e9
    print(f"Peak memory: {peak_gb:.2f} GB")
    print(f"Current memory: {current_gb:.2f} GB")
    print(f"Cached memory: {cached_gb:.2f} GB")
CUDA Tools
# Nsight Systems for timeline profiling
nsys profile --stats=true ./my_training_app
# Nsight Compute for kernel-level metrics
ncu --metrics dram__throughput.avg.pct_of_peak_sustained_elapsed,l1tex__t_sector_hit_rate.pct ./my_app
# Memory leak detection (cuda-memcheck is deprecated; use Compute Sanitizer)
compute-sanitizer --tool memcheck --leak-check full ./my_app
Common Pitfalls
Memory Leaks
// LEAK (intentional anti-example — do NOT copy): each iteration allocates
// 4 KB of device memory that is never released, so this loop leaks ~4 MB
// and a long-running process eventually hits out-of-memory.
void memoryLeakExample() {
for (int i = 0; i < 1000; i++) {
float* d_temp;
cudaMalloc(&d_temp, 1024 * sizeof(float));
processData<<<...>>>(d_temp);
// Missing cudaFree(d_temp)! The pointer goes out of scope, but the
// device allocation survives — cudaMalloc has no garbage collection.
}
}
// FIX: RAII wrapper
template<typename T>
class CudaMemory {
T* ptr = nullptr;
public:
CudaMemory(size_t n) { cudaMalloc(&ptr, n * sizeof(T)); }
~CudaMemory() { if (ptr) cudaFree(ptr); }
T* get() { return ptr; }
};
Memory Fragmentation
# Fragmentation from allocate/free cycles
# Solution: Pre-allocate workspace
workspace = torch.zeros(max_batch_size, hidden_dim, device='cuda')
for batch in dataloader:
# Reuse workspace instead of allocating
batch_workspace = workspace[:batch.size(0)]
Out-of-Memory Errors
# Solution 1: Reduce batch size
# Solution 2: Gradient accumulation
optimizer.zero_grad()
for i, batch in enumerate(dataloader):
loss = model(batch) / accumulation_steps
loss.backward()
if (i + 1) % accumulation_steps == 0:
optimizer.step()
optimizer.zero_grad()
# Solution 3: Gradient checkpointing
from torch.utils.checkpoint import checkpoint
class CheckpointedModel(nn.Module):
    """Example module whose layers are gradient-checkpointed.

    Activations inside checkpointed layers are discarded during forward and
    recomputed during backward, trading compute for memory. Assumes
    self.layer1 / self.layer2 are assigned elsewhere (e.g. in __init__,
    not shown here) — TODO confirm against the full example.
    """
    def forward(self, x):
        # use_reentrant=False selects the recommended non-reentrant
        # checkpoint implementation; omitting the argument is deprecated
        # and emits a warning in current PyTorch.
        x = checkpoint(self.layer1, x, use_reentrant=False)
        x = checkpoint(self.layer2, x, use_reentrant=False)
        return x
Production Best Practices
Pre-allocate Workspaces
class TrainingSession {
float* d_weights;
float* d_gradients;
float* d_activations;
public:
TrainingSession(size_t num_params, size_t max_batch_size) {
cudaMalloc(&d_weights, num_params * sizeof(float));
cudaMalloc(&d_gradients, num_params * sizeof(float));
cudaMalloc(&d_activations, max_batch_size * hidden * sizeof(float));
}
void trainEpoch(DataLoader& loader) {
for (auto& batch : loader) {
// No allocations in hot path!
forward(batch, d_weights, d_activations);
backward(d_weights, d_gradients, d_activations);
update(d_weights, d_gradients);
}
}
};
Memory Budget Management
class MemoryBudget:
    """Soft cap on GPU memory, tracked from caller-reported allocations.

    The budget is `fraction` of the device's total memory. This does not
    intercept real allocations — callers must report usage through
    allocate()/release(). The original version had no way to return
    budget, so the tally could only grow until it was exhausted.
    """

    def __init__(self, fraction=0.9):
        # mem_get_info() returns (free_bytes, total_bytes) for the current device.
        free, total = torch.cuda.mem_get_info()
        self.budget = int(total * fraction)
        self.allocated = 0

    def can_allocate(self, bytes):
        """Return True if `bytes` additional bytes would still fit in the budget."""
        return self.allocated + bytes <= self.budget

    def allocate(self, bytes):
        """Record an allocation of `bytes`; raise RuntimeError if over budget."""
        if not self.can_allocate(bytes):
            raise RuntimeError("Memory budget exceeded")
        self.allocated += bytes

    def release(self, bytes):
        """Record that `bytes` previously reported via allocate() were freed."""
        # Clamp at zero so mismatched reports cannot drive the tally negative.
        self.allocated = max(0, self.allocated - bytes)
Asynchronous Memory Operations
// Overlap computation with data transfer
void pipelinedTraining(int num_batches) {
cudaStream_t streams[3];
for (int i = 0; i < 3; i++) {
cudaStreamCreate(&streams[i]);
}
for (int batch = 0; batch < num_batches; batch++) {
int stream_idx = batch % 3;
cudaStream_t stream = streams[stream_idx];
// Transfer batch to device (async)
cudaMemcpyAsync(d_batch, h_batch, size,
cudaMemcpyHostToDevice, stream);
// Process batch (async)
processKernel<<<blocks, threads, 0, stream>>>(d_batch);
}
// Wait for all streams
cudaDeviceSynchronize();
}
Conclusion
Effective CUDA memory management requires understanding:
- Memory hierarchy: Registers → Shared → L2 → HBM
- Allocation patterns: Pre-allocate, use pools, avoid in-loop allocation
- Access patterns: Coalesced access, SoA layout
- Profiling: Use PyTorch profiler, Nsight tools
- Common pitfalls: Leaks, fragmentation, OOM
Master these fundamentals and you'll avoid the majority of GPU memory issues in deep learning applications.