max_static_len Template Parameter — pytorch Architecture
Architecture documentation for the max_static_len template parameter of _sparse_binary_op_intersection_kernel_impl in SparseBinaryOpIntersectionCommon.h from the pytorch codebase.
Entity Profile
Source Code
aten/src/ATen/native/sparse/SparseBinaryOpIntersectionCommon.h lines 126–417
template <
template <typename func_t> class kernel_t,
typename value_selection_intersection_kernel_t,
typename index_t = int64_t,
int64_t max_static_len = 0>
void _sparse_binary_op_intersection_kernel_impl(
Tensor& res,
const Tensor& x_,
const Tensor& y_,
const std::vector<int64_t>& broadcasted_shape,
const std::optional<Tensor>& x_hash_opt_ = std::nullopt,
const std::optional<Tensor>& y_hash_opt_ = std::nullopt,
const bool accumulate_matches = true,
const bool distributive_with_sum = true
) {
// The common dtype check is relevant when op is done in-place.
// This is because binary_op_t produces new values and it could be that
// new_values.dtype != res.dtype. In such a case we should error out
// as soon as possible to avoid redundant kernel runs.
const auto common_dtype = at::result_type(x_, y_);
TORCH_CHECK(canCast(common_dtype, res.scalar_type()),
"Can't convert result type ", common_dtype,
" to output ", res.scalar_type());
using KernelLauncher = KernelLauncher<kernel_t>;
using OptTensor = std::optional<Tensor>;
// If the op and sum are not distributive, coalesce is required.
const auto coalesce_if_not_distributive = [distributive_with_sum](const Tensor& t, const OptTensor& t_hash_opt) -> auto {
// No need to coalesce in such a case.
if (distributive_with_sum) {
return std::make_tuple(t, t_hash_opt);
} else {
// Otherwise coalesce and force hash recompute.
return std::make_tuple(t.coalesce(), static_cast<OptTensor>(std::nullopt));
}
};
Tensor x, y;
OptTensor x_hash_opt, y_hash_opt;
std::tie(x, x_hash_opt) = coalesce_if_not_distributive(x_, x_hash_opt_);
std::tie(y, y_hash_opt) = coalesce_if_not_distributive(y_, y_hash_opt_);
// Given sparse tensors x and y we decide which one is source, and which one
// is probably_coalesced. The indices of both source and probably_coalesced are
// hashed and then the hash values of the source's indices are binary-searched
// into the hash values of the probably_coalesced's indices.
// If probably_coalesced is coalesced, by the property of the hashing method
// (see below), the hash values are already sorted and we can avoid any
// explicit sorting routines.
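// For example, if x is coalesced and y is not (Case 1 below), x becomes
// probably_coalesced and y becomes source: the hashes of y's indices are then
// binary-searched into x's already-sorted index hashes, one search per nnz of y.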
Tensor probably_coalesced, source;
OptTensor probably_coalesced_indices_hash_opt, source_indices_hash_opt;
std::tie(probably_coalesced, probably_coalesced_indices_hash_opt, source, source_indices_hash_opt) = [&]() -> auto {
// Case 1: either x or y is coalesced.
if ((x.is_coalesced() ^ y.is_coalesced())) {
return x.is_coalesced()
? std::make_tuple(x, x_hash_opt, y, y_hash_opt)
: std::make_tuple(y, y_hash_opt, x, x_hash_opt);
}
// Case 2: Both x and y are either coalesced or non-coalesced.
// If both are coalesced, search into the larger tensor is faster.
// Same holds when both are non-coalesced.
else {
Tensor larger, smaller;
OptTensor larger_hash_opt, smaller_hash_opt;
std::tie(larger, larger_hash_opt, smaller, smaller_hash_opt) = [&]() -> auto {
return x._nnz() >= y._nnz()
? std::make_tuple(x, x_hash_opt, y, y_hash_opt)
: std::make_tuple(y, y_hash_opt, x, x_hash_opt);
}();
// If under a uniform distribution it is likely to hit many elements in larger,
// it is best to coalesce it for better performance.
const auto larger_sizes = larger.sizes();
const auto sparse_dim_numel = std::accumulate(
larger_sizes.begin(),
larger_sizes.begin() + larger.sparse_dim(),
1,
std::multiplies<int64_t>());
// If nnz > prod(larger.shape[:sparse_dim]), by the pigeonhole principle,
// there is at least one bucket with nnz / prod(larger.shape[:sparse_dim]) elements.
// It provides a lower bound for the max count in the intersection.
// This condition is very conservative as we do not check whether such an event
// actually occurred, although it is very likely under a uniform distribution,
// the distribution with the highest uncertainty (maximizes entropy).
const auto max_count_lower_bound = larger._nnz() / sparse_dim_numel;
constexpr int64_t MAX_COPIES_PER_THREAD = 50;
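// For example, with larger.shape[:sparse_dim] == (4, 4) there are only 16
// distinct sparse indices; if larger._nnz() == 1600, some index must repeat at
// least 1600 / 16 == 100 > 50 times, so coalescing larger up front pays off.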
return max_count_lower_bound > MAX_COPIES_PER_THREAD
// coalesce invalidates hash values, so force-recompute
? std::make_tuple(larger.coalesce(), static_cast<OptTensor>(std::nullopt), smaller, smaller_hash_opt)
: std::make_tuple(larger, larger_hash_opt, smaller, smaller_hash_opt);
}
}();
// The employed hash function maps a d-dim index to a linear offset
// into a contiguous memory that is sufficient to fit a dense tensor
// of shape broadcasted_shape(x.shape, y.shape), i.e.
// idx -> \sum_{i = 0}^d idx[i] * hash_coeffs[i], where
// hash_coeffs are the strides of a contiguous tensor of shape
// broadcasted_shape(x.shape, y.shape).
// Assuming the following order on the dimensions, i.e. the right-most dim is the
// fastest-changing dim, and the left-most is the slowest-changing dim,
// which is implicit in the definition of hash_coeffs,
// it could be shown that the hash function is actually bijective and, hence,
// is a perfect hash function (no collisions ever).
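// For example, if broadcasted_shape[:sparse_dim] == (6, 5), then
// hash_coeffs == {5, 1} (the contiguous strides), and the index (2, 3)
// hashes to 2 * 5 + 3 == 13, a unique offset in [0, 30).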
// Need owning storage in case of the Tensor class.
const auto hash_coeffs_storage = [&]() -> auto {
const auto broadcasted_sparse_dim_shape = std::vector<int64_t>(
broadcasted_shape.begin(),
broadcasted_shape.begin() + probably_coalesced.sparse_dim()
);
auto strides = c10::contiguous_strides(broadcasted_sparse_dim_shape);
return at::sparse::TensorGeometryHolder<max_static_len>(strides, strides, probably_coalesced.options());
}();
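// max_static_len (the template parameter this page documents) selects the
// TensorGeometryHolder specialization used here: with the default of 0 the
// hash coefficients are kept in Tensor storage, while a positive value is
// intended to keep them in a fixed-size array of that length so kernels
// (e.g. device lambdas) can capture them by value.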
const auto hash_coeffs = std::get<0>(*hash_coeffs_storage);
const auto nnz_arange = at::arange(
std::max(probably_coalesced._nnz(), source._nnz()),
source._indices().options());
const auto probably_coalesced_nnz_arange = nnz_arange.narrow(-1, 0, probably_coalesced._nnz());
const auto sparse_dim = probably_coalesced.sparse_dim();
// Apply the hash function to probably_coalesced.indices
const auto probably_coalesced_indices_hash = [&]() -> Tensor {
// probably_coalesced is coalesced and hash provided? Reuse it!
if (probably_coalesced_indices_hash_opt.has_value()) {
return (*probably_coalesced_indices_hash_opt).contiguous();
}
const auto indices = probably_coalesced._indices();
const auto indices_dim_stride = indices.stride(0);
const auto indices_nnz_stride = indices.stride(1);
auto hash = at::empty({probably_coalesced._nnz()}, indices.options().dtype(kLong));
auto iter = TensorIteratorConfig()
.check_all_same_dtype(false)
.add_output(hash)
.add_input(probably_coalesced_nnz_arange)
.build();
{
const auto* RESTRICT ptr_indices = indices.const_data_ptr<index_t>();
KernelLauncher::launch(iter,
// NOTE: capture by value required by CUDA
[=] FUNCAPI (index_t nnz_idx) -> int64_t {
int64_t hash = 0;
if (!ptr_indices) {
return hash;
}
const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride;
for (int64_t dim = 0; dim < sparse_dim; ++dim) {
const auto dim_hash_coeff = hash_coeffs[dim];
const auto dim_index = ptr_indices_dim[dim * indices_dim_stride];
hash += dim_index * dim_hash_coeff;
}
return hash;
});
}
return hash;
}();
// Now that we have hash values of probably_coalesced.indices,
// we need to decide whether they need to get sorted.
// The sort is not required if probably_coalesced is coalesced.
Tensor sorted_hash, argsort_hash;
std::tie(sorted_hash, argsort_hash) = [&]() -> std::tuple<Tensor, Tensor> {
if (probably_coalesced.is_coalesced()) {
// NOTE: argsort.dtype == nnz_arange.dtype
const auto argsort = nnz_arange.narrow(-1, 0, probably_coalesced._nnz());
return std::make_tuple(probably_coalesced_indices_hash, argsort);
} else {
// NOTE: we want argsort.dtype == nnz_arange.dtype,
// but sort() produces indices of type int64_t,
// so we convert to nnz_arange.dtype to avoid issues
// with pointer types in the kernels below.
Tensor sorted, argsort;
std::tie(sorted, argsort) = probably_coalesced_indices_hash.sort();
return std::make_tuple(sorted, argsort.to(nnz_arange.scalar_type()));
}
}();
// Perform hash intersection.
// Let s_hash = hash(source.indices),
// pc_hash = hash(probably_coalesced.indices), then
// for i = 0, ..., len(s_hash) - 1:
// lb = <index of a value in pc_hash[argsort_hash] which is a lower bound for s_hash[i]>,
// up = <index of a value in pc_hash[argsort_hash] which is an upper bound for s_hash[i]>,
// intersection_count[i] = up - lb
// intersection_first_idx[i] = lb.
//
// intersection_count and intersection_first_idx are used to form indices at which
// intersection values are selected.
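// For example, if pc_hash[argsort_hash] == {3, 5, 5, 7} and s_hash[i] == 5,
// then lb == 1 and ub == 3, so intersection_count[i] == 2 and
// intersection_first_idx[i] == 1 (two matches starting at sorted position 1).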
auto [intersection_count, intersection_first_idx] = [&]() -> std::tuple<Tensor, Tensor> {
const auto source_nnz = source._nnz();
auto intersection_buffer = at::empty({2, source_nnz}, sorted_hash.options());
auto intersection_count = intersection_buffer.select(0, 0);
auto intersection_first_idx = intersection_buffer.select(0, 1);
const auto source_indices = source._indices();
const auto source_arange = nnz_arange.narrow(-1, 0, source_nnz);
// non-const because of gcc-5/clang-5 issues
auto indices_dim_stride = source_indices.stride(0);
auto indices_nnz_stride = source_indices.stride(1);
auto dummy = at::empty({1}, source_arange.options());
auto hash = source_indices_hash_opt.has_value()
? (*source_indices_hash_opt).contiguous()
: at::empty({0}, probably_coalesced._indices().options().dtype(kLong));
const auto* RESTRICT hash_ptr = source_indices_hash_opt.has_value()
? hash.data_ptr<int64_t>()
: nullptr;
auto iter = TensorIteratorConfig()
.set_check_mem_overlap(false)
.add_owned_output(dummy.expand_as(source_arange))
.add_input(source_arange)
.build();
{
const auto* RESTRICT ptr_indices = source_indices.const_data_ptr<index_t>();
const auto* RESTRICT ptr_sorted_hash = sorted_hash.const_data_ptr<int64_t>();
const auto sorted_hash_len = sorted_hash.numel();
auto* RESTRICT ptr_intersection_count = intersection_count.data_ptr<int64_t>();
auto* RESTRICT ptr_intersection_first_idx = intersection_first_idx.data_ptr<int64_t>();
// Fusing hash computation with hash intersection.
KernelLauncher::launch(iter,
// NOTE: capture by value required by CUDA
[=] FUNCAPI (index_t nnz_idx) -> index_t {
int64_t hash = 0;
if (hash_ptr) {
hash = hash_ptr[nnz_idx];
} else if (sparse_dim) {
// Compute hash value
const auto* RESTRICT ptr_indices_dim = ptr_indices + nnz_idx * indices_nnz_stride;
for (int64_t dim = 0; dim < sparse_dim; ++dim) {
const auto dim_hash_coeff = hash_coeffs[dim];
const auto dim_index = ptr_indices_dim[dim * indices_dim_stride];
hash += dim_index * dim_hash_coeff;
}
}
// Perform hash values intersection
const auto* RESTRICT lb = find_bound<const int64_t*, int64_t, /*is_lower=*/true>(
ptr_sorted_hash,
ptr_sorted_hash + sorted_hash_len,
hash
);
const auto* RESTRICT ub = find_bound<const int64_t*, int64_t, /*is_lower=*/false>(
ptr_sorted_hash,
ptr_sorted_hash + sorted_hash_len,
hash
);
ptr_intersection_count[nnz_idx] = ub - lb;
ptr_intersection_first_idx[nnz_idx] = lb - ptr_sorted_hash;
return 0;
});
}
return std::make_tuple(intersection_count, intersection_first_idx);
}();
const auto res_indices = source._indices().clone();
const auto binary_op_res_dtype = at::result_type(source._values(), probably_coalesced._values());
const auto res_values = value_selection_intersection_kernel_t::apply(
source._values().to(binary_op_res_dtype),
nnz_arange.narrow(-1, 0, source._nnz()),
probably_coalesced._values().to(binary_op_res_dtype),
intersection_first_idx.to(nnz_arange.scalar_type()),
intersection_count,
argsort_hash,
accumulate_matches).to(res.scalar_type());
const auto res_sparse_dim = source.sparse_dim();
const auto res_dense_dim = source.dense_dim();
const auto& res_shape = broadcasted_shape;
const auto res_nnz = source._nnz();
auto* res_sparse_impl = get_sparse_impl(res);
res_sparse_impl->raw_resize_(res_sparse_dim, res_dense_dim, res_shape);
res_sparse_impl->set_indices_and_values_unsafe(res_indices, res_values);
res_sparse_impl->set_nnz_and_narrow(res_nnz);
res._coalesced_(source.is_coalesced());
}
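The hash used throughout the kernel is simply the row-major linear offset of a sparse index into the broadcasted sparse shape, which is why it is collision-free and why coalesced (index-sorted) inputs yield already-sorted hash values. The following is a minimal standalone sketch of that mapping, not part of the PyTorch sources; hash_index and the example shape are illustrative only.

#include <cstdint>
#include <iostream>
#include <vector>

// Row-major "perfect hash": sum of index[dim] * hash_coeffs[dim], where
// hash_coeffs are the strides of a contiguous tensor with the broadcasted
// sparse shape.
int64_t hash_index(const std::vector<int64_t>& index,
                   const std::vector<int64_t>& hash_coeffs) {
  int64_t hash = 0;
  for (size_t dim = 0; dim < index.size(); ++dim) {
    hash += index[dim] * hash_coeffs[dim];
  }
  return hash;
}

int main() {
  // Broadcasted sparse shape (6, 5): contiguous strides are {5, 1}.
  const std::vector<int64_t> hash_coeffs = {5, 1};
  // The index (2, 3) maps to 2 * 5 + 3 = 13, a unique offset in [0, 30).
  std::cout << hash_index({2, 3}, hash_coeffs) << "\n"; // prints 13
  return 0;
}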