Vectorized&lt;float&gt; (SVE) Class — PyTorch Architecture
Architecture documentation for the SVE specialization of the Vectorized&lt;float&gt; class in vec_float.h from the PyTorch codebase. This specialization wraps one SVE register's worth of 32-bit floats (vls_float32_t) and exposes constructors, predicated loads/stores, elementwise math routines, and comparison operators; VECTOR_WIDTH is the vector width in bytes used to compute the lane count.
Entity Profile
Source Code
aten/src/ATen/cpu/vec/sve/vec_float.h lines 30–523
template <>
class Vectorized<float> {
private:
vls_float32_t values;
public:
using value_type = float;
using size_type = int;
static constexpr size_type size() {
return VECTOR_WIDTH / sizeof(float);
}
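// Illustrative: with VECTOR_WIDTH == 32 bytes (a 256-bit SVE build),
// size() == 32 / sizeof(float) == 8 lanes.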
Vectorized() {
values = svdup_n_f32(0);
}
Vectorized(svfloat32_t v) : values(v) {}
Vectorized(float val) {
values = svdup_n_f32(val);
}
template <
typename... Args,
typename = std::enable_if_t<(sizeof...(Args) == size())>>
Vectorized(Args... vals) {
__at_align__ float buffer[size()] = {vals...};
values = svld1_f32(ptrue, buffer);
}
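// Illustrative (assuming size() == 8):
//   Vectorized<float> v(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f);
// The enable_if above rejects any other argument count at compile time.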
operator svfloat32_t() const {
return values;
}
template <uint64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,
const Vectorized<float>& b) {
// Build an array of flags: each element is 1 if the corresponding bit in
// 'mask' is set, 0 otherwise.
__at_align__ int32_t flag_arr[size()];
for (int i = 0; i < size(); i++) {
flag_arr[i] = (mask & (1ULL << i)) ? 1 : 0;
}
// Load the flag array into an SVE int32 vector.
svint32_t int_mask = svld1_s32(svptrue_b32(), flag_arr);
// Compare each lane of int_mask to 0; returns an svbool_t predicate where
// true indicates a nonzero flag.
svbool_t blend_mask = svcmpne_n_s32(svptrue_b32(), int_mask, 0);
// Use svsel to select elements from b where the predicate is true, else
// from a.
svfloat32_t result = svsel_f32(blend_mask, b.values, a.values);
return Vectorized<float>(result);
}
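// Illustrative: with mask = 0b0101, lanes 0 and 2 come from b and the
// remaining lanes from a:
//   auto r = Vectorized<float>::blend<0b0101>(a, b);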
static Vectorized<float> blendv(
const Vectorized<float>& a,
const Vectorized<float>& b,
const Vectorized<float>& mask_) {
svbool_t mask =
svcmpeq_s32(ptrue, svreinterpret_s32_f32(mask_), ALL_S32_TRUE_MASK);
return svsel_f32(mask, b, a);
}
template <typename step_t>
static Vectorized<float> arange(
float base = 0.f,
step_t step = static_cast<step_t>(1)) {
__at_align__ float buffer[size()];
for (int64_t i = 0; i < size(); i++) {
buffer[i] = base + i * step;
}
return svld1_f32(ptrue, buffer);
}
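// Illustrative: arange(10.f, 2) yields {10, 12, 14, ...} across size() lanes.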
static Vectorized<float> set(
const Vectorized<float>& a,
const Vectorized<float>& b,
int64_t count = size()) {
if (count == 0) {
return a;
} else if (count < size()) {
return svsel_f32(svwhilelt_b32(0ull, count), b, a);
}
return b;
}
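// Illustrative: set(a, b, 2) keeps b in lanes 0..1 and a elsewhere:
// {b[0], b[1], a[2], a[3], ...}.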
static Vectorized<float> loadu(const void* ptr, int64_t count = size()) {
if (count == size())
return svld1_f32(ptrue, reinterpret_cast<const float*>(ptr));
svbool_t pg = svwhilelt_b32(0ull, count);
return svld1_f32(pg, reinterpret_cast<const float*>(ptr));
}
void store(void* ptr, int64_t count = size()) const {
if (count == size()) {
svst1_f32(ptrue, reinterpret_cast<float*>(ptr), values);
} else {
svbool_t pg = svwhilelt_b32(0ull, count);
svst1_f32(pg, reinterpret_cast<float*>(ptr), values);
}
}
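// Illustrative tail handling: loadu(ptr, 3) reads only three floats
// (inactive lanes of a predicated svld1 are zeroed), and store(ptr, 3)
// writes only the first three lanes.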
const float& operator[](int idx) const = delete;
float& operator[](int idx) = delete;
int64_t zero_mask() const {
// returns an integer mask where all zero elements are translated to 1-bit
// and others are translated to 0-bit
int64_t mask = 0;
__at_align__ int32_t mask_array[size()];
svbool_t svbool_mask = svcmpeq_f32(ptrue, values, ZERO_F32);
svst1_s32(
ptrue,
mask_array,
svsel_s32(svbool_mask, ALL_S32_TRUE_MASK, ALL_S32_FALSE_MASK));
for (int64_t i = 0; i < size(); ++i) {
if (mask_array[i])
mask |= (1ull << i);
}
return mask;
}
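// Illustrative: for lanes {0.f, 3.f, 0.f, 7.f, ...} the result has bits 0
// and 2 set (0b...0101), since bit i is set iff lane i equals zero.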
Vectorized<float> isnan() const {
// NaN check
svbool_t mask = svcmpuo_f32(ptrue, values, ZERO_F32);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
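// The x - x trick below: for finite x, x - x == 0 (ordered with zero); for
// +/-inf or NaN, x - x is NaN, so the unordered compare (svcmpuo) flags
// exactly the inf/NaN lanes and svptest_any reduces that to a single bool.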
bool has_inf_nan() const {
return svptest_any(
ptrue,
svcmpuo_f32(ptrue, svsub_f32_x(ptrue, values, values), ZERO_F32));
}
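// Scalar fallback used when no vector routine is available: spill to an
// aligned stack buffer, apply f lane by lane, then reload.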
Vectorized<float> map(float (*f)(float)) const {
__at_align__ float tmp[size()];
store(tmp);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = f(tmp[i]);
}
return loadu(tmp);
}
Vectorized<float> abs() const {
return svabs_f32_x(ptrue, values);
}
Vectorized<float> angle() const {
const auto nan_vec = svdup_n_f32(NAN);
const auto nan_mask = svcmpuo_f32(ptrue, values, ZERO_F32);
const auto pi = svdup_n_f32(c10::pi<float>);
const auto neg_mask = svcmplt_f32(ptrue, values, ZERO_F32);
auto angle = svsel_f32(neg_mask, pi, ZERO_F32);
angle = svsel_f32(nan_mask, nan_vec, angle);
return angle;
}
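// Illustrative: angle(-2.f) == pi, angle(3.f) == 0.f, and NaN inputs
// propagate as NaN, matching the real-input convention of torch.angle.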
Vectorized<float> real() const {
return values;
}
Vectorized<float> imag() const {
return Vectorized<float>(0.f);
}
Vectorized<float> conj() const {
return values;
}
Vectorized<float> acos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acosfx_u10sve(values)), map(std::acos));
}
Vectorized<float> acosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_acoshfx_u10sve(values)), map(std::acosh));
}
Vectorized<float> asin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinfx_u10sve(values)), map(std::asin));
}
Vectorized<float> asinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_asinhfx_u10sve(values)), map(std::asinh));
}
Vectorized<float> atan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanfx_u10sve(values)), map(std::atan));
}
Vectorized<float> atanh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_atanhfx_u10sve(values)), map(std::atanh));
}
Vectorized<float> atan2(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_atan2fx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::atan2(tmp[i], tmp_b[i]);
}
return loadu(tmp);
});
}
Vectorized<float> copysign(const Vectorized<float>& sign) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_copysignfx_sve(values, sign)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_sign[size()];
store(tmp);
sign.store(tmp_sign);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::copysign(tmp[i], tmp_sign[i]);
}
return loadu(tmp);
});
}
Vectorized<float> erf() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erffx_u10sve(values)), map(std::erf));
}
Vectorized<float> erfc() const {
return USE_SLEEF(
Vectorized<float>(Sleef_erfcfx_u15sve(values)), map(std::erfc));
}
Vectorized<float> erfinv() const {
return map(calc_erfinv);
}
Vectorized<float> exp() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expfx_u10sve(values)), map(std::exp));
}
Vectorized<float> exp2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_exp2fx_u10sve(values)), map(std::exp2));
}
Vectorized<float> expm1() const {
return USE_SLEEF(
Vectorized<float>(Sleef_expm1fx_u10sve(values)), map(std::expm1));
}
// Implementation copied from Arm Optimized Routines:
// https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/sve/expf.c
Vectorized<float> exp_u20() const {
// special case to handle special inputs that are too large or too small
// i.e. where there's at least one element x, s.t. |x| >= 87.3...
svbool_t is_special_case = svacgt(svptrue_b32(), values, 0x1.5d5e2ap+6f);
if (svptest_any(svptrue_b32(), is_special_case)) {
return exp();
}
const svfloat32_t ln2_hi = svdup_n_f32(0x1.62e4p-1f);
const svfloat32_t ln2_lo = svdup_n_f32(0x1.7f7d1cp-20f);
const svfloat32_t c1 = svdup_n_f32(0.5f);
const svfloat32_t inv_ln2 = svdup_n_f32(0x1.715476p+0f);
const float shift = 0x1.803f8p17f;
/* n = round(x/(ln2/N)). */
svfloat32_t z = svmad_x(svptrue_b32(), inv_ln2, values, shift);
svfloat32_t n = svsub_x(svptrue_b32(), z, shift);
/* r = x - n*ln2/N. */
svfloat32_t r = values;
r = svmls_x(svptrue_b32(), r, n, ln2_hi);
r = svmls_x(svptrue_b32(), r, n, ln2_lo);
/* scale = 2^(n/N). */
svfloat32_t scale = svexpa(svreinterpret_u32(z));
/* poly(r) = exp(r) - 1 ~= r + 0.5 r^2. */
svfloat32_t r2 = svmul_x(svptrue_b32(), r, r);
svfloat32_t poly = svmla_x(svptrue_b32(), r, r2, c1);
return svmla_x(svptrue_b32(), scale, scale, poly);
}
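// The _u20 suffix mirrors SLEEF-style accuracy naming (presumably a
// ~2.0 ULP error bound), trading accuracy for speed relative to exp().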
Vectorized<float> fexp_u20() const {
return exp_u20();
}
Vectorized<float> fmod(const Vectorized<float>& q) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_fmodfx_sve(values, q)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_q[size()];
store(tmp);
q.store(tmp_q);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::fmod(tmp[i], tmp_q[i]);
}
return loadu(tmp);
});
}
Vectorized<float> hypot(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_hypotfx_u05sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::hypot(tmp[i], tmp_b[i]);
}
return loadu(tmp);
});
}
Vectorized<float> i0() const {
return map(calc_i0);
}
Vectorized<float> i0e() const {
return map(calc_i0e);
}
Vectorized<float> digamma() const {
return map(calc_digamma);
}
Vectorized<float> igamma(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igamma(tmp[i], tmp_x[i]);
}
return loadu(tmp);
}
Vectorized<float> igammac(const Vectorized<float>& x) const {
__at_align__ float tmp[size()];
__at_align__ float tmp_x[size()];
store(tmp);
x.store(tmp_x);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = calc_igammac(tmp[i], tmp_x[i]);
}
return loadu(tmp);
}
Vectorized<float> nextafter(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_nextafterfx_sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); ++i) {
tmp[i] = std::nextafter(tmp[i], tmp_b[i]);
}
return loadu(tmp);
});
}
Vectorized<float> log() const {
return USE_SLEEF(
Vectorized<float>(Sleef_logfx_u10sve(values)), map(std::log));
}
Vectorized<float> log2() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log2fx_u10sve(values)), map(std::log2));
}
Vectorized<float> log10() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log10fx_u10sve(values)), map(std::log10));
}
Vectorized<float> log1p() const {
return USE_SLEEF(
Vectorized<float>(Sleef_log1pfx_u10sve(values)), map(std::log1p));
}
Vectorized<float> frac() const;
Vectorized<float> sin() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinfx_u10sve(values)), map(std::sin));
}
Vectorized<float> sinh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_sinhfx_u10sve(values)), map(std::sinh));
}
Vectorized<float> cos() const {
return USE_SLEEF(
Vectorized<float>(Sleef_cosfx_u10sve(values)), map(std::cos));
}
Vectorized<float> cosh() const {
return USE_SLEEF(
Vectorized<float>(Sleef_coshfx_u10sve(values)), map(std::cosh));
}
Vectorized<float> ceil() const {
return svrintp_f32_x(ptrue, values);
}
Vectorized<float> floor() const {
return svrintm_f32_x(ptrue, values);
}
Vectorized<float> neg() const {
return svneg_f32_x(ptrue, values);
}
Vectorized<float> round() const {
return svrinti_f32_x(ptrue, values);
}
Vectorized<float> tan() const {
return USE_SLEEF(
Vectorized<float>(Sleef_tanfx_u10sve(values)), map(std::tan));
}
// Implementation is picked from
// https://github.com/ARM-software/ComputeLibrary/blob/v25.01/src/core/NEON/SVEMath.inl#L179
Vectorized<float> tanh() const {
// Constants used for the tanh calculation.
const svfloat32_t CONST_1 =
svdup_n_f32(1.f); // Constant 1.0f for the tanh formula.
const svfloat32_t CONST_2 = svdup_n_f32(
2.f); // Constant 2.0f for the tanh formula (used in exp(2x)).
const svfloat32_t CONST_MIN_TANH = svdup_n_f32(
-10.f); // Minimum threshold for input values to prevent overflow.
const svfloat32_t CONST_MAX_TANH = svdup_n_f32(
10.f); // Maximum threshold for input values to prevent overflow.
// Step 1: Clamp the values within the range [-10, 10] to prevent overflow
// during exponentiation. The tanh function approaches ±1 rapidly as the
// input grows large, so we limit the input range to avoid numerical
// instability. svmax_f32_z ensures values are greater than -10, and
// svmin_f32_z ensures they are less than 10.
svfloat32_t x = svmin_f32_z(
ptrue, svmax_f32_z(ptrue, values, CONST_MIN_TANH), CONST_MAX_TANH);
// Step 2: Calculate exp(2 * x), where x is the clamped value.
// svmul_f32_z computes 2 * x, and exp_u20() computes the exponential of
// the result (via Vectorized<float>, then auto-converts back to
// svfloat32_t).
svfloat32_t exp2x =
Vectorized<float>(svmul_f32_z(ptrue, CONST_2, x)).exp_u20();
// Step 3: Calculate the numerator of the tanh function, which is exp(2x)
// - 1.
svfloat32_t num = svsub_f32_z(ptrue, exp2x, CONST_1);
// Step 4: Calculate the denominator of the tanh function, which is exp(2x)
// + 1.
svfloat32_t den = svadd_f32_z(ptrue, exp2x, CONST_1);
// Step 5: Calculate the tanh function as the ratio of the numerator and
// denominator: num / den.
svfloat32_t tanh = svdiv_f32_z(ptrue, num, den);
// Return the calculated tanh values.
return tanh;
}
Vectorized<float> trunc() const {
return svrintz_f32_x(ptrue, values);
}
Vectorized<float> lgamma() const {
return USE_SLEEF(
Vectorized<float>(Sleef_lgammafx_u10sve(values)), map(std::lgamma));
}
Vectorized<float> sqrt() const {
return svsqrt_f32_x(ptrue, values);
}
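// Note: svdivr is a reversed divide, svdivr(pg, a, b) == b / a, so the two
// routines below compute 1 / x and 1 / sqrt(x) respectively.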
Vectorized<float> reciprocal() const {
return svdivr_f32_x(ptrue, values, ONE_F32);
}
Vectorized<float> rsqrt() const {
return svdivr_f32_x(ptrue, svsqrt_f32_x(ptrue, values), ONE_F32);
}
Vectorized<float> pow(const Vectorized<float>& b) const {
USE_SLEEF(
{ return Vectorized<float>(Sleef_powfx_u10sve(values, b)); },
{
__at_align__ float tmp[size()];
__at_align__ float tmp_b[size()];
store(tmp);
b.store(tmp_b);
for (int64_t i = 0; i < size(); i++) {
tmp[i] = std::pow(tmp[i], tmp_b[i]);
}
return loadu(tmp);
});
}
// Comparisons follow the semantics of the x86 _CMP_**_OQ predicates:
// `O` (ordered): the compare yields false if either operand is NaN
// `Q` (quiet): no floating-point exception is raised for NaN operands
Vectorized<float> operator==(const Vectorized<float>& other) const {
svbool_t mask = svcmpeq_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator!=(const Vectorized<float>& other) const {
svbool_t mask = svcmpne_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator<(const Vectorized<float>& other) const {
svbool_t mask = svcmplt_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator<=(const Vectorized<float>& other) const {
svbool_t mask = svcmple_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator>(const Vectorized<float>& other) const {
svbool_t mask = svcmpgt_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
Vectorized<float> operator>=(const Vectorized<float>& other) const {
svbool_t mask = svcmpge_f32(ptrue, values, other);
return svsel_f32(mask, ALL_F32_TRUE_MASK, ALL_F32_FALSE_MASK);
}
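// The named comparisons below are defined out of line; by the usual
// PyTorch convention they return 0.0f / 1.0f per lane, unlike the
// operators above, which return all-bits true/false masks.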
Vectorized<float> eq(const Vectorized<float>& other) const;
Vectorized<float> ne(const Vectorized<float>& other) const;
Vectorized<float> gt(const Vectorized<float>& other) const;
Vectorized<float> ge(const Vectorized<float>& other) const;
Vectorized<float> lt(const Vectorized<float>& other) const;
Vectorized<float> le(const Vectorized<float>& other) const;
};
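Usage Example
A minimal sketch of driving this wrapper from a kernel-style loop. It is hypothetical: it assumes an SVE-enabled ATen build, and it uses operator+ and operator*, which are free-function overloads defined elsewhere in the vec headers rather than in the class above.
#include <ATen/cpu/vec/vec.h>
#include <cstdint>
using Vec = at::vec::Vectorized<float>;
// out[i] = a[i] + 0.5f * b[i] for float buffers of length n.
void scaled_add(const float* a, const float* b, float* out, int64_t n) {
  int64_t i = 0;
  // Bulk of the buffer: one full vector (Vec::size() lanes) per iteration.
  for (; i + Vec::size() <= n; i += Vec::size()) {
    Vec va = Vec::loadu(a + i);
    Vec vb = Vec::loadu(b + i);
    (va + vb * Vec(0.5f)).store(out + i);
  }
  // Tail: the explicit-count overloads predicate the load and store.
  if (i < n) {
    Vec va = Vec::loadu(a + i, n - i);
    Vec vb = Vec::loadu(b + i, n - i);
    (va + vb * Vec(0.5f)).store(out + i, n - i);
  }
}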