GemmParams Class — PyTorch Architecture
Architecture documentation for the GemmParams class, defined in GemmCommon.h in the PyTorch codebase.
Entity Profile
GemmParams<T> bundles the operands and metadata for a single GEMM call: transpose flags, problem dimensions, leading dimensions, alpha/beta scaling factors, and device pointers for A, B, and C. It is part of PyTorch's TunableOp infrastructure, which uses it to generate kernel-selection signatures, duplicate buffers for benchmarking, and compare results across candidate GEMM implementations.
Source Code
aten/src/ATen/cuda/tunable/GemmCommon.h, lines 277–375
template <typename T>
struct GemmParams : OpParams {
  GemmParams() = default;
  GemmParams(const GemmParams&) = default;
  GemmParams& operator=(const GemmParams&) = default;
  ~GemmParams() override = default;

  std::string BLASSignature() const override {
    std::string alpha_str = to_string_opmath<T>(alpha);
    std::string beta_str = to_string_opmath<T>(beta);
    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
        "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: 1, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, bias_type: %s, compute_type: %s }",
        m, n, k, lda, ldb, ldc, ldc, alpha_str, beta_str, transa, transb,
        BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>(), ComputeTypeFor<T>());
  }

  std::string Signature() const override {
    return fmt::sprintf("%c%c_%ld_%ld_%ld_ld_%ld_%ld_%ld", transa, transb, m, n, k, lda, ldb, ldc);
  }

  size_t GetSizeA() const {
    size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m);
    size_t size_dense = m * k;
    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
  }

  size_t GetSizeB() const {
    size_t size_stride = ldb * ((transb == 'n' || transb == 'N') ? n : k);
    size_t size_dense = k * n;
    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
  }

  size_t GetSizeC() const {
    size_t size_stride = ldc * n;
    size_t size_dense = m * n;
    return sizeof(T) * (size_stride > size_dense ? size_stride : size_dense);
  }

  size_t GetSize(bool duplicate_inputs) const {
    size_t size = GetSizeC();
    if (duplicate_inputs) {
      size += GetSizeA();
      size += GetSizeB();
    }
    return size;
  }

  GemmParams* DeepCopy(bool duplicate_inputs) const {
    GemmParams* copy = new GemmParams(*this);
    c10::DeviceIndex device = 0;
    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
    size_t c_size = GetSizeC();
    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
    if (duplicate_inputs) {
      size_t a_size = GetSizeA();
      size_t b_size = GetSizeB();
      copy->a = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(a_size));
      copy->b = static_cast<const T*>(c10::cuda::CUDACachingAllocator::raw_alloc(b_size));
      copy->duplicate_inputs_ = true;
    }
    return copy;
  }

  // only call on object returned by DeepCopy
  void Delete() {
    c10::cuda::CUDACachingAllocator::raw_delete(c);
    if (duplicate_inputs_) {
      // NOLINTNEXTLINE(*const-cast*)
      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(a));
      // NOLINTNEXTLINE(*const-cast*)
      c10::cuda::CUDACachingAllocator::raw_delete(const_cast<T*>(b));
    }
  }

  TuningStatus NumericalCheck(GemmParams<T> *other) {
    auto* ctx = getTuningContext();
    auto cfg = ctx->GetNumericalCheckConfig();
    auto c_dtype = c10::CppTypeToScalarType<T>::value;
    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T), cfg) ? OK : FAIL;
  }

  char transa{};
  char transb{};
  int64_t m{};
  int64_t n{};
  int64_t k{};
  at::opmath_type<T> alpha;
  const T* a{};
  int64_t lda{};
  const T* b{};
  int64_t ldb{};
  at::opmath_type<T> beta;
  T* c{};
  int64_t ldc{};

 private:
  bool duplicate_inputs_{false};
};
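Usage Notes
The three size accessors account for padded leading dimensions: each returns the larger of the stride-based extent (lda, ldb, or ldc times the column count for the given transpose mode) and the dense footprint (m*k, k*n, or m*n elements), so allocations cover the buffer as it is actually laid out in memory. The following is a minimal standalone sketch of that logic, assuming column-major storage as in cuBLAS ('n'/'N' meaning not transposed); the function name and the example dimensions are illustrative, not part of the PyTorch API.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirrors GetSizeA: the allocation spans lda * k elements when A is not
// transposed and lda * m elements when it is; the dense m * k footprint
// serves as a lower bound.
std::size_t size_a_bytes(char transa, std::int64_t m, std::int64_t k,
                         std::int64_t lda, std::size_t elem_size) {
  std::size_t size_stride = lda * ((transa == 'n' || transa == 'N') ? k : m);
  std::size_t size_dense = m * k;
  return elem_size * std::max(size_stride, size_dense);
}

int main() {
  // A 128x64 float matrix A with its leading dimension padded to 160:
  // max(160*64, 128*64) * 4 = 10240 * 4 = 40960 bytes.
  std::printf("%zu\n", size_a_bytes('n', 128, 64, 160, sizeof(float)));
  return 0;
}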
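DeepCopy and Delete form a manual lifecycle: DeepCopy heap-allocates a new GemmParams, always clones the output buffer c through the CUDA caching allocator, and with duplicate_inputs set also allocates fresh (uncopied) a and b buffers; Delete frees only those raw allocations, leaving the struct itself to the caller. A hedged usage sketch, assuming a CUDA build of PyTorch and a caller-supplied, fully populated params value (setup not shown):

#include <ATen/cuda/tunable/GemmCommon.h>

// `params` is assumed to be a populated GemmParams<float> whose a/b/c
// pointers reference live device memory; this helper is hypothetical.
void benchmark_with_scratch(const at::cuda::tunable::GemmParams<float>& params) {
  // DeepCopy clones the output buffer c and, with duplicate_inputs=true,
  // also allocates fresh (uninitialized) a and b buffers.
  auto* scratch = params.DeepCopy(/*duplicate_inputs=*/true);
  // ... run and time a candidate GEMM implementation against scratch ...
  scratch->Delete();  // frees the c, a, and b allocations DeepCopy made
  delete scratch;     // DeepCopy heap-allocates the struct itself
}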