MPSHeapAllocatorImpl Class — PyTorch Architecture
Architecture documentation for the MPSHeapAllocatorImpl class, an Objective-C++ heap-backed GPU allocator declared in MPSAllocator.h in the PyTorch codebase.
Entity Profile
Source Code
aten/src/ATen/mps/MPSAllocator.h lines 264–435
// Heap-backed GPU memory allocator for PyTorch's MPS (Metal) backend.
// Hands out id<MTLBuffer> allocations, caches freed buffers in per-kind
// pools (m_pools) for reuse, and tracks allocation totals against low/high
// watermark limits derived from the device's recommendedMaxWorkingSetSize.
// NOTE(review): most member functions are only declared here; their bodies
// live in the companion implementation file (.mm), so comments here describe
// the declared contract, not the mechanics.
class MPSHeapAllocatorImpl {
public:
// Captures the default MPS device/stream and the shared event pool, then
// runs init_allocator() (defined out of view) to set up pools and limits.
explicit MPSHeapAllocatorImpl()
: m_device(at::mps::MPSDevice::getInstance()->device()),
m_max_buffer_size([m_device maxBufferLength]),
m_stream(getDefaultMPSStream()),
m_event_pool(getMPSEventPool()) {
init_allocator();
}
// releases all cached buffers and heaps before the members are torn down
~MPSHeapAllocatorImpl() {
emptyCache();
}
// interface exposed to at::Allocator
id<MTLBuffer> malloc(size_t size, uint32_t usage);
// frees a buffer and returns it into buffer pool
void free(void* ptr);
// releases all the cached buffers and their associated heaps
void emptyCache();
// free inactive buffers that are pending to be freed
void freeInactiveBuffers();
// returns true if buffer was allocated from the shared pool
bool isSharedBuffer(const void* ptr);
// get the requested unaligned size of an MTLBuffer
// (signed return presumably allows a negative/sentinel value for unknown
// pointers — TODO confirm against the implementation)
ssize_t getUnalignedBufferSize(const void* ptr);
// set the shape of a base tensor from a view tensor
void setBufferShape(const void* ptr, const IntArrayRef& shape);
// retrieve the shape of a base tensor from a view tensor
IntArrayRef getBufferShape(const void* ptr);
// get the unique ID of the buffer
id_t getBufferId(const void* ptr);
// allocate a buffer from a specialized pool to import CPU scalars into GPU
id<MTLBuffer> allocScalarBufferWithValue(void* value, size_t size);
// returns a CPU-mapping of the input buffer and its retainCount,
// only if it has Shared storage-mode and was allocated on MPSAllocator
std::pair<const void*, uint32_t> getSharedBufferPtr(const void* buffer);
// records events for a list of MTLBuffers (a list is used so the mutex is locked once)
// returns true if any event was recorded (i.e., the passed buffers exist and are shared-storage)
bool recordEvents(c10::ArrayRef<const void*> buffers);
// waits for the event to signal the completion of GPU execution
// on the passed shared buffers (a list is used so the mutex is locked once)
// returns true if actually waited on any event
bool waitForEvents(c10::ArrayRef<const void*> buffers);
// this indicates how far (in Megabytes) the current total allocations are from the
// low watermark limit which is used to detect if we're under memory pressure
// This returns zero if we've reached the low watermark limit
ssize_t getLowWatermarkValue();
// (see m_low_watermark_ratio for description)
void setLowWatermarkRatio(double ratio);
// (see m_high_watermark_ratio for description)
void setHighWatermarkRatio(double ratio);
// (see m_low_watermark_limit for description)
size_t getLowWatermarkLimit() const {
return m_low_watermark_limit;
}
// (see m_max_total_allowed_size for description)
size_t getHighWatermarkLimit() const {
return m_max_total_allowed_size;
}
// (see m_total_allocated_memory for description)
size_t getTotalAllocatedMemory() const {
return m_total_allocated_memory;
}
// (see m_current_allocated_memory for description)
size_t getCurrentAllocatedMemory() const {
return m_current_allocated_memory;
}
// total GPU memory allocated in the process by Metal driver; including
// implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
size_t getDriverAllocatedMemory() const {
return current_allocated_size();
}
// recommended Max memory for Metal (device.recommendedMaxWorkingSetSize)
size_t getRecommendedMaxMemory() const {
return max_device_size();
}
// (see enum DebugVerbosity for description)
uint32_t getDebugVerbosity() const {
return m_debug_verbosity;
}
// returns the device that we allocate from
inline id<MTLDevice> Device() const {
return m_device;
}
// human-readable formatting of a byte count (defined out of view)
inline std::string format_size(uint64_t size) const;
private:
// (see m_high_watermark_ratio for description)
constexpr static double default_high_watermark_ratio = 1.7;
// we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize.
constexpr static double default_high_watermark_upper_bound = 2.0;
// (see m_low_watermark_ratio for description)
// on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize
constexpr static double default_low_watermark_ratio_unified = 1.4;
constexpr static double default_low_watermark_ratio_discrete = 1.0;
// Metal device we allocate from; immutable for the allocator's lifetime
const id<MTLDevice> m_device;
// guards the allocator state below; recursive, presumably because locked
// paths can re-enter the allocator — TODO confirm in the implementation
std::recursive_mutex m_mutex;
// allocated buffers by device pointer
ska::flat_hash_map<const void*, BufferBlock*> m_allocated_buffers;
// using a container for pools to simplify iterating them
ska::flat_hash_map<BufferPool::Kind, std::unique_ptr<BufferPool>> m_pools;
// total memory allocated by HeapAllocator (including blocks in pools)
size_t m_total_allocated_memory = 0;
// currently active memory allocations in use (i.e., blocks not in pools)
size_t m_current_allocated_memory = 0;
// max buffer size allowed by Metal
// (the in-class 0 default is overwritten by the ctor's mem-initializer,
// which queries [m_device maxBufferLength])
size_t m_max_buffer_size = 0;
// maximum total size allowed to be allocated
size_t m_max_total_allowed_size = 0;
// high watermark ratio is a hard limit for the total allowed allocations
// 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs)
// 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize)
// >1.: allows limits beyond the device.recommendedMaxWorkingSetSize
// e.g., value 0.95 means we allocate up to 95% of recommended maximum
// allocation size; beyond that, the allocations would fail with OOM error.
double m_high_watermark_ratio;
// low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark
// level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit).
// Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection)
// e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum
// allocation size.
double m_low_watermark_ratio;
// low watermark size limit (in Bytes) at the time we initialize the allocator
size_t m_low_watermark_limit;
// use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity
uint32_t m_debug_verbosity;
// default MPS stream
MPSStream* m_stream;
// we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator
std::shared_ptr<MPSEventPool> m_event_pool;
// ---- internals (all defined out of view in the .mm file) ----
void init_allocator();
void init_buffer_pools();
HeapBlock* get_free_heap(AllocParams& params);
bool get_free_buffer(AllocParams& params);
BufferBlock* get_allocated_buffer_block(const void* ptr);
BufferBlock* alloc_buffer_block(size_t size, uint32_t usage);
bool alloc_buffer(AllocParams& params);
void free_buffer(BufferBlock* buffer_block);
// returns true if the container heap is also released
bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true);
void release_buffers(BufferPool& pool);
bool release_available_cached_buffers(AllocParams& params);
bool release_cached_buffers();
// free unused cached blocks to reclaim GPU memory if memory pressure is high
void garbage_collect_cached_buffers(AllocParams& params);
// returns the suitable buffer pool type for the usage or
// requested/allocated sizes
BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage);
// returns the aligned allocation size that is optimized
// for the buffers to get reused frequently
size_t get_allocation_size(size_t size, uint32_t usage) const;
// maximum size of device memory available for allocation in current process
// Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory.
size_t max_device_size() const {
return [m_device recommendedMaxWorkingSetSize];
}
// there are implicit allocations from MPS backend, so we need to query the 'device' for
// total allocated size instead of manually tracking in MPSAllocator
size_t current_allocated_size() const {
return [m_device currentAllocatedSize];
}
// notifies every registered IMpsAllocatorCallback about 'event' for the
// given block's buffer (nullptr when no block); unconditionally returns
// true so callers can use it inside boolean expressions
bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const {
for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) {
MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(
buffer_block ? buffer_block->buffer : nullptr, event);
}
return true;
}
};
Source
Analyze Your Own Codebase
Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.
Try Supermodel Free