convertToBf16Impl Function Template — pytorch Architecture

Architecture documentation for the convertToBf16Impl function template in vec128_convert.h from the pytorch codebase.

Class c

Entity Profile

Source Code

aten/src/ATen/cpu/vec/vec128/vec128_convert.h lines 231–267

template <typename from_type>
inline void convertToBf16Impl(
    const from_type* __restrict src,
    c10::BFloat16* __restrict dst,
    uint64_t n) {
  bfloat16_t* dstPtr = reinterpret_cast<bfloat16_t*>(dst);
  uint64_t loopBound = n - (n % 16);
  uint64_t i = 0;
  for (; i < loopBound; i += 16) {
    float32x4_t a, b, c, d;
    a[0] = static_cast<float>(src[i]);
    a[1] = static_cast<float>(src[i + 1]);
    a[2] = static_cast<float>(src[i + 2]);
    a[3] = static_cast<float>(src[i + 3]);
    b[0] = static_cast<float>(src[i + 4]);
    b[1] = static_cast<float>(src[i + 5]);
    b[2] = static_cast<float>(src[i + 6]);
    b[3] = static_cast<float>(src[i + 7]);
    c[0] = static_cast<float>(src[i + 8]);
    c[1] = static_cast<float>(src[i + 9]);
    c[2] = static_cast<float>(src[i + 10]);
    c[3] = static_cast<float>(src[i + 11]);
    d[0] = static_cast<float>(src[i + 12]);
    d[1] = static_cast<float>(src[i + 13]);
    d[2] = static_cast<float>(src[i + 14]);
    d[3] = static_cast<float>(src[i + 15]);

    vst1q_bf16(dstPtr + i, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(a), b));
    vst1q_bf16(dstPtr + i + 8, vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(c), d));
  }

#pragma clang loop vectorize(disable) interleave(disable) unroll(disable)
  for (; i < n; i++) {
    float a = static_cast<float>(src[i]);
    dstPtr[i] = vcvth_bf16_f32(a);
  }
}

Source

View on GitHub

Analyze Your Own Codebase

Get architecture documentation, dependency graphs, and domain analysis for your codebase in minutes.

Try Supermodel Free