CUDA Memory Management for Deep Learning#
Efficient GPU memory management is critical for deep learning applications. Memory mismanagement leads to out-of-memory errors, performance degradation, and wasted resources. This guide covers the fundamentals of CUDA memory and best practices for deep learning workloads.
GPU Memory Hierarchy#
Understanding the memory hierarchy is fundamental to optimization:
┌─────────────────────────────────────────┐
│ Registers (per-thread) │ ~1 cycle
│ - Fastest, smallest │
│ - ~256 KB per SM │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ Shared Memory / L1 Cache │ ~30 cycles
│ - Shared within thread block │
│ - ~100 KB per SM (configurable) │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ L2 Cache │ ~200 cycles
│ - Shared across all SMs │
│ - 40-60 MB (GPU dependent) │
└─────────────────────────────────────────┘
↓
┌─────────────────────────────────────────┐
│ High-Bandwidth Memory (HBM) │ ~400-600 cycles
│ - Main GPU memory │
│ - 40-80 GB (A100/H100) │
│ - Bandwidth: 2-3 TB/s │
└─────────────────────────────────────────┘
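The practical takeaway: data a thread block reuses should be staged on-chip instead of re-read from HBM. As a minimal, illustrative sketch (not from the original text; it assumes a block size of 256 threads), here is a block-level reduction that works entirely out of shared memory after a single read from HBM:
__global__ void blockSum(const float* in, float* out, int N) {
    // One tile per block, staged in fast on-chip shared memory
    __shared__ float tile[256];                 // assumes blockDim.x == 256
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = (idx < N) ? in[idx] : 0.0f;
    __syncthreads();                            // tile now visible to the whole block
    // Tree reduction entirely in shared memory (no extra HBM traffic)
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
        if (threadIdx.x < stride) {
            tile[threadIdx.x] += tile[threadIdx.x + stride];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        out[blockIdx.x] = tile[0];              // one result per block
    }
}
Memory Allocation Basics#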
cudaMalloc and cudaFree#
// Basic allocation
float* d_data;
cudaMalloc(&d_data, size * sizeof(float));
// Always check for errors
cudaError_t err = cudaMalloc(&d_data, size * sizeof(float));
if (err != cudaSuccess) {
printf("cudaMalloc failed: %s\n", cudaGetErrorString(err));
}
// Free when done
cudaFree(d_data);
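Checking every call by hand gets verbose. A common pattern is a small error-checking macro wrapped around each runtime call; this is a sketch, not a CUDA-provided API:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message if a CUDA runtime call fails
#define CUDA_CHECK(call)                                            \
    do {                                                            \
        cudaError_t err__ = (call);                                 \
        if (err__ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                    cudaGetErrorString(err__), __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                     \
        }                                                           \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMalloc(&d_data, size * sizeof(float)));
Pinned (Page-Locked) Host Memory#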
float* h_data;
cudaMallocHost(&h_data, size * sizeof(float)); // Pinned memory
// Benefits:
// - Faster host-device transfers (can use DMA)
// - Required for async transfers
// - 2-3x faster than pageable memory
cudaFreeHost(h_data);
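Pinned buffers pair naturally with asynchronous copies. A minimal sketch (the stream and buffer names are illustrative, following the snippet above):
cudaStream_t stream;
cudaStreamCreate(&stream);

// Pinned host buffer + device buffer
float *h_data, *d_data;
cudaMallocHost(&h_data, size * sizeof(float));
cudaMalloc(&d_data, size * sizeof(float));

// The copy can overlap with work queued on other streams
cudaMemcpyAsync(d_data, h_data, size * sizeof(float),
                cudaMemcpyHostToDevice, stream);
cudaStreamSynchronize(stream);   // wait only for this stream's work

cudaFree(d_data);
cudaFreeHost(h_data);
cudaStreamDestroy(stream);
Memory Pools for Deep Learning#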
The Problem with Naive Allocation#
# BAD: Allocating inside the training loop
for epoch in range(num_epochs):
    for batch in dataloader:
        temp = torch.zeros(batch_size, hidden_dim).cuda()  # Allocation every iteration!
        output = model(batch)
        # ... training code
Each cudaMalloc call has overhead (often hundreds of microseconds to a millisecond, and it can force a device-wide synchronization). In a tight loop, this adds up.
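One CUDA-level mitigation, separate from the pool shown next, is the stream-ordered allocator available since CUDA 11.2 (cudaMallocAsync/cudaFreeAsync), which serves requests from a driver-managed pool. A minimal sketch, with the kernel and launch configuration as placeholders:
// Allocation and free are ordered on the stream and served from a pool,
// so repeated alloc/free in a loop avoids most cudaMalloc overhead.
float* d_tmp;
cudaMallocAsync((void**)&d_tmp, size * sizeof(float), stream);
myKernel<<<blocks, threads, 0, stream>>>(d_tmp, N);
cudaFreeAsync(d_tmp, stream);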
Memory Pool Solution#
class MemoryPool {
private:
struct Block {
void* ptr;
size_t size;
bool in_use;
};
std::vector<Block> blocks;
public:
void* allocate(size_t size) {
// Try to find existing block
for (auto& block : blocks) {
if (!block.in_use && block.size >= size) {
block.in_use = true;
return block.ptr;
}
}
// Allocate new block
void* ptr;
cudaMalloc(&ptr, size);
blocks.push_back({ptr, size, true});
return ptr;
}
void deallocate(void* ptr) {
for (auto& block : blocks) {
if (block.ptr == ptr) {
block.in_use = false;
return;
}
}
}
};
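A usage sketch for the pool above (num_steps, batch_size, and hidden_dim are illustrative): the first iteration allocates, later iterations reuse the same block.
MemoryPool pool;
for (int step = 0; step < num_steps; ++step) {
    // First iteration hits cudaMalloc; subsequent iterations reuse the block
    float* d_scratch = static_cast<float*>(
        pool.allocate(batch_size * hidden_dim * sizeof(float)));
    // ... launch kernels that use d_scratch ...
    pool.deallocate(d_scratch);
}
PyTorch's caching allocator does this automatically: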
# Check PyTorch's memory management
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
# Clear cache if needed
torch.cuda.empty_cache()
Memory Coalescing#
Why Coalescing Matters#
Memory accesses are most efficient when consecutive threads access consecutive memory addresses:
// GOOD: Coalesced access - threads 0-31 access consecutive elements
__global__ void coalesced(float* data, int N) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) {
float val = data[idx]; // Threads access data[0], data[1], ...
}
}
// BAD: Strided access - threads 0-31 access every 32nd element
__global__ void strided(float* data, int N, int stride) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx * stride < N) {
float val = data[idx * stride]; // Threads access data[0], data[32], ...
}
}
Structure of Arrays vs Array of Structures#
// Array of Structures (AoS) - BAD for GPU
struct Particle {
float x, y, z;
float vx, vy, vz;
};
Particle particles[N];
// Structure of Arrays (SoA) - GOOD for GPU
struct ParticlesSoA {
float *x, *y, *z;
float *vx, *vy, *vz;
};
SoA enables coalesced access when processing one property across all particles.
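For example, a kernel that advances particle positions reads only the arrays it needs, and consecutive threads touch consecutive floats in each array. A sketch using the SoA layout above (dt is an illustrative timestep):
__global__ void integrate(ParticlesSoA p, float dt, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        // Consecutive threads read consecutive elements of each array: coalesced
        p.x[i] += p.vx[i] * dt;
        p.y[i] += p.vy[i] * dt;
        p.z[i] += p.vz[i] * dt;
    }
}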
Unified Memory#
When to Use#
float* data;
cudaMallocManaged(&data, N * sizeof(float));
// CPU can access it
for (int i = 0; i < N; i++) {
data[i] = static_cast<float>(i);
}
// GPU can access it
myKernel<<<blocks, threads>>>(data, N);
cudaDeviceSynchronize();
cudaFree(data);
Use Unified Memory for:
- Rapid prototyping
- Large datasets that don't fit in GPU memory
- Sparse access patterns
Avoid for:
- High-frequency small transfers
- Training loops (page fault overhead)
- Predictable access patterns (explicit transfers are faster)
Prefetching#
// Prefetch to GPU before kernel launch
cudaMemPrefetchAsync(data, size, device_id, stream);
// Advise the driver about access patterns
cudaMemAdvise(data, size, cudaMemAdviseSetReadMostly, device_id);
cudaMemAdvise(data, size, cudaMemAdviseSetPreferredLocation, device_id);
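A typical round trip, sketched with the same data, size, and device_id as above (the kernel and launch configuration are placeholders): prefetch to the GPU before the launch, then back to the host before the CPU reads the results, so neither side pays page-fault costs.
// Move pages to the GPU ahead of the kernel launch
cudaMemPrefetchAsync(data, size, device_id, stream);
myKernel<<<blocks, threads, 0, stream>>>(data, N);

// Bring results back to host memory before the CPU touches them
cudaMemPrefetchAsync(data, size, cudaCpuDeviceId, stream);
cudaStreamSynchronize(stream);
Memory Profiling#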
PyTorch Profiling#
import torch
from torch.profiler import profile, ProfilerActivity
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    profile_memory=True
) as prof:
    output = model(input_data)
    loss = criterion(output, target)
    loss.backward()
print(prof.key_averages().table(sort_by="cuda_memory_usage", row_limit=10))
Memory Tracking#
def monitor_memory():
    torch.cuda.reset_peak_memory_stats()
    # Training code here
    print(f"Peak memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
    print(f"Current memory: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"Cached memory: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
CUDA Tools#
# Nsight Systems for timeline profiling
nsys profile --stats=true ./my_training_app
# Nsight Compute for kernel-level metrics (DRAM throughput and L1/tex hit rate)
ncu --metrics dram__throughput.avg.pct_of_peak_sustained_elapsed,l1tex__t_sector_hit_rate.pct ./my_app
# Memory leak detection (compute-sanitizer supersedes the deprecated cuda-memcheck)
compute-sanitizer --leak-check full ./my_app
Common Pitfalls#
Memory Leaks#
// LEAK: Forgetting to free
void memoryLeakExample() {
for (int i = 0; i < 1000; i++) {
float* d_temp;
cudaMalloc(&d_temp, 1024 * sizeof(float));
processData<<<...>>>(d_temp);
// Missing cudaFree(d_temp)!
}
}
// FIX: RAII wrapper
template<typename T>
class CudaMemory {
T* ptr = nullptr;
public:
CudaMemory(size_t n) { cudaMalloc(&ptr, n * sizeof(T)); }
~CudaMemory() { if (ptr) cudaFree(ptr); }
// Non-copyable: copying the wrapper would lead to a double free
CudaMemory(const CudaMemory&) = delete;
CudaMemory& operator=(const CudaMemory&) = delete;
T* get() { return ptr; }
};
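With the wrapper, the leaky loop above frees automatically when each iteration's object goes out of scope (a sketch; the launch configuration is illustrative):
void noLeakExample() {
    for (int i = 0; i < 1000; i++) {
        CudaMemory<float> d_temp(1024);               // cudaMalloc in constructor
        processData<<<blocks, threads>>>(d_temp.get());
        // ~CudaMemory() calls cudaFree at the end of each iteration
    }
}
Memory Fragmentation#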
# Fragmentation from allocate/free cycles
# Solution: Pre-allocate workspace
workspace = torch.zeros(max_batch_size, hidden_dim, device='cuda')
for batch in dataloader:
    # Reuse workspace instead of allocating
    batch_workspace = workspace[:batch.size(0)]
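Framework caching allocators keep freed blocks around for the same reason. If you manage memory directly with CUDA's stream-ordered allocator instead, its pool can be told to retain freed memory rather than return it to the OS; a sketch (device_id is illustrative):
// Keep freed blocks in the pool so later cudaMallocAsync calls reuse them
cudaMemPool_t pool;
cudaDeviceGetDefaultMemPool(&pool, device_id);
uint64_t threshold = UINT64_MAX;   // never trim the pool automatically
cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
Out-of-Memory Errors#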
# Solution 1: Reduce batch size
# Solution 2: Gradient accumulation
optimizer.zero_grad()
for i, batch in enumerate(dataloader):
    loss = model(batch) / accumulation_steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
# Solution 3: Gradient checkpointing
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class CheckpointedModel(nn.Module):
    def forward(self, x):
        # self.layer1 / self.layer2 are defined in __init__ (omitted here)
        x = checkpoint(self.layer1, x, use_reentrant=False)
        x = checkpoint(self.layer2, x, use_reentrant=False)
        return x
Production Best Practices#
Pre-allocate Workspaces#
class TrainingSession {
    float* d_weights;
    float* d_gradients;
    float* d_activations;
public:
    TrainingSession(size_t num_params, size_t max_batch_size, size_t hidden) {
        cudaMalloc(&d_weights, num_params * sizeof(float));
        cudaMalloc(&d_gradients, num_params * sizeof(float));
        cudaMalloc(&d_activations, max_batch_size * hidden * sizeof(float));
    }
    ~TrainingSession() {
        // Free everything exactly once when the session ends
        cudaFree(d_weights);
        cudaFree(d_gradients);
        cudaFree(d_activations);
    }
    void trainEpoch(DataLoader& loader) {
        for (auto& batch : loader) {
            // No allocations in the hot path!
            forward(batch, d_weights, d_activations);
            backward(d_weights, d_gradients, d_activations);
            update(d_weights, d_gradients);
        }
    }
};
Memory Budget Management#
class MemoryBudget:
    def __init__(self, fraction=0.9):
        free, total = torch.cuda.mem_get_info()
        self.budget = int(total * fraction)
        self.allocated = 0

    def can_allocate(self, bytes):
        return self.allocated + bytes <= self.budget

    def allocate(self, bytes):
        if not self.can_allocate(bytes):
            raise RuntimeError("Memory budget exceeded")
        self.allocated += bytes
Asynchronous Memory Operations#
// Overlap computation with data transfer
void pipelinedTraining(int num_batches) {
cudaStream_t streams[3];
for (int i = 0; i < 3; i++) {
cudaStreamCreate(&streams[i]);
}
for (int batch = 0; batch < num_batches; batch++) {
int stream_idx = batch % 3;
cudaStream_t stream = streams[stream_idx];
// Transfer batch to device (async); in practice use a separate pinned
// h_batch/d_batch buffer per stream so copies and kernels actually overlap
cudaMemcpyAsync(d_batch, h_batch, size,
cudaMemcpyHostToDevice, stream);
// Process batch (async)
processKernel<<<blocks, threads, 0, stream>>>(d_batch);
}
// Wait for all streams
cudaDeviceSynchronize();
}
Conclusion#
Effective CUDA memory management requires understanding:
- Memory hierarchy: Registers → Shared → L2 → HBM
- Allocation patterns: Pre-allocate, use pools, avoid in-loop allocation
- Access patterns: Coalesced access, SoA layout
- Profiling: Use PyTorch profiler, Nsight tools
- Common pitfalls: Leaks, fragmentation, OOM
Master these fundamentals and you'll avoid the majority of GPU memory issues in deep learning applications.