| //===- SparseTensorUtils.cpp - Sparse Tensor Utils for MLIR execution -----===// |
| // |
| // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements a light-weight runtime support library that is useful |
| // for sparse tensor manipulations. The functionality provided in this library |
| // is meant to simplify benchmarking, testing, and debugging MLIR code that |
| // operates on sparse tensors. The provided functionality is **not** part |
| // of core MLIR, however. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/ExecutionEngine/SparseTensorUtils.h" |
| |
| #ifdef MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS |
| |
| #include <algorithm> |
| #include <cassert> |
| #include <cctype> |
| #include <cstdio> |
| #include <cstdlib> |
| #include <cstring> |
| #include <fstream> |
| #include <functional> |
| #include <iostream> |
| #include <limits> |
| #include <numeric> |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // Internal support for storing and reading sparse tensors. |
| // |
| // The following memory-resident sparse storage schemes are supported: |
| // |
| // (a) A coordinate scheme for temporarily storing and lexicographically |
| // sorting a sparse tensor by index (SparseTensorCOO). |
| // |
| // (b) A "one-size-fits-all" sparse tensor storage scheme defined by |
| //     per-dimension sparse/dense annotations together with a dimension |
| // ordering used by MLIR compiler-generated code (SparseTensorStorage). |
| // |
| // The following external formats are supported: |
| // |
| // (1) Matrix Market Exchange (MME): *.mtx |
| // https://math.nist.gov/MatrixMarket/formats.html |
| // |
| // (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns |
| // http://frostt.io/tensors/file-formats.html |
| // |
| // Two public APIs are supported: |
| // |
| // (I) Methods operating on MLIR buffers (memrefs) to interact with sparse |
| // tensors. These methods should be used exclusively by MLIR |
| // compiler-generated code. |
| // |
| // (II) Methods that accept C-style data structures to interact with sparse |
| // tensors. These methods can be used by any external runtime that wants |
| // to interact with MLIR compiler-generated code. |
| // |
| // In both cases (I) and (II), the SparseTensorStorage format is only |
| // visible externally as an opaque pointer. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| namespace { |
| |
| static constexpr int kColWidth = 1025; |
| |
| /// A version of `operator*` on `uint64_t` which checks for overflows. |
| static inline uint64_t checkedMul(uint64_t lhs, uint64_t rhs) { |
| assert((lhs == 0 || rhs <= std::numeric_limits<uint64_t>::max() / lhs) && |
| "Integer overflow"); |
| return lhs * rhs; |
| } |
| |
| // This macro helps minimize repetition of the "report an error and exit" |
| // idiom, as well as ensuring we have some additional output indicating |
| // where the error is coming from. |
| // (Since `fprintf` doesn't provide a stacktrace, this helps make it easier |
| // to track down whether an error is coming from our code vs somewhere else |
| // in MLIR.) |
| #define FATAL(...) \ |
| do { \ |
| fprintf(stderr, "SparseTensorUtils: " __VA_ARGS__); \ |
| exit(1); \ |
| } while (0) |
| |
| // TODO: try to unify this with `SparseTensorFile::assertMatchesShape` |
| // which is used by `openSparseTensorCOO`. It's easy enough to resolve |
| // the `std::vector` vs pointer mismatch for `dimSizes`; but it's trickier |
| // to resolve the presence/absence of `perm` (without introducing extra |
| // overhead), so perhaps the code duplication is unavoidable. |
| // |
| /// Asserts that the `dimSizes` (in target-order) under the `perm` (mapping |
| /// semantic-order to target-order) are a refinement of the desired `shape` |
| /// (in semantic-order). |
| /// |
| /// Precondition: `perm` and `shape` must be valid for `rank`. |
| static inline void |
| assertPermutedSizesMatchShape(const std::vector<uint64_t> &dimSizes, |
| uint64_t rank, const uint64_t *perm, |
| const uint64_t *shape) { |
| assert(perm && shape); |
| assert(rank == dimSizes.size() && "Rank mismatch"); |
| for (uint64_t r = 0; r < rank; r++) |
| assert((shape[r] == 0 || shape[r] == dimSizes[perm[r]]) && |
| "Dimension size mismatch"); |
| } |
| |
| /// A sparse tensor element in coordinate scheme (value and indices). |
| /// For example, a rank-1 vector element would look like |
| /// ({i}, a[i]) |
| /// and a rank-5 tensor element like |
| /// ({i,j,k,l,m}, a[i,j,k,l,m]) |
| /// We use a pointer into a shared index pool rather than e.g. a direct |
| /// vector since that (1) reduces the per-element memory footprint, and |
| /// (2) centralizes the memory reservation and (re)allocation to one place. |
| template <typename V> |
| struct Element final { |
|   Element(uint64_t *ind, V val) : indices(ind), value(val) {} |
| uint64_t *indices; // pointer into shared index pool |
| V value; |
| }; |
| |
| /// The type of callback functions which receive an element. We avoid |
| /// packaging the coordinates and value together as an `Element` object |
| /// because this helps keep code somewhat cleaner. |
| template <typename V> |
| using ElementConsumer = |
| const std::function<void(const std::vector<uint64_t> &, V)> &; |
| |
| /// A memory-resident sparse tensor in coordinate scheme (collection of |
| /// elements). This data structure is used to read a sparse tensor from |
| /// any external format into memory and sort the elements lexicographically |
| /// by indices before passing it back to the client (most packed storage |
| /// formats require the elements to appear in lexicographic index order). |
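| /// |
| /// A minimal usage sketch (the sizes, indices, and values below are |
| /// illustrative only): |
| /// |
| ///   std::vector<uint64_t> sizes = {3, 4}; |
| ///   SparseTensorCOO<double> coo(sizes, /*capacity=*/2); |
| ///   coo.add({2, 1}, 2.0); |
| ///   coo.add({0, 3}, 1.0); |
| ///   coo.sort(); // elements are now ordered as (0,3) before (2,1) |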
| template <typename V> |
| struct SparseTensorCOO final { |
| public: |
| SparseTensorCOO(const std::vector<uint64_t> &dimSizes, uint64_t capacity) |
| : dimSizes(dimSizes) { |
| if (capacity) { |
| elements.reserve(capacity); |
| indices.reserve(capacity * getRank()); |
| } |
| } |
| |
|   /// Adds an element with the given indices and value. |
| void add(const std::vector<uint64_t> &ind, V val) { |
| assert(!iteratorLocked && "Attempt to add() after startIterator()"); |
| uint64_t *base = indices.data(); |
| uint64_t size = indices.size(); |
| uint64_t rank = getRank(); |
| assert(ind.size() == rank && "Element rank mismatch"); |
| for (uint64_t r = 0; r < rank; r++) { |
| assert(ind[r] < dimSizes[r] && "Index is too large for the dimension"); |
| indices.push_back(ind[r]); |
| } |
| // This base only changes if indices were reallocated. In that case, we |
| // need to correct all previous pointers into the vector. Note that this |
|     // only happens if the initial capacity was set too low, and then only |
|     // on each internal vector reallocation (which, under the usual doubling |
|     // growth strategy, incurs just an amortized linear overhead overall). |
| uint64_t *newBase = indices.data(); |
| if (newBase != base) { |
| for (uint64_t i = 0, n = elements.size(); i < n; i++) |
| elements[i].indices = newBase + (elements[i].indices - base); |
| base = newBase; |
| } |
| // Add element as (pointer into shared index pool, value) pair. |
| elements.emplace_back(base + size, val); |
| } |
| |
| /// Sorts elements lexicographically by index. |
| void sort() { |
| assert(!iteratorLocked && "Attempt to sort() after startIterator()"); |
| // TODO: we may want to cache an `isSorted` bit, to avoid |
| // unnecessary/redundant sorting. |
| uint64_t rank = getRank(); |
| std::sort(elements.begin(), elements.end(), |
| [rank](const Element<V> &e1, const Element<V> &e2) { |
| for (uint64_t r = 0; r < rank; r++) { |
| if (e1.indices[r] == e2.indices[r]) |
| continue; |
| return e1.indices[r] < e2.indices[r]; |
| } |
| return false; |
| }); |
| } |
| |
| /// Get the rank of the tensor. |
| uint64_t getRank() const { return dimSizes.size(); } |
| |
| /// Getter for the dimension-sizes array. |
| const std::vector<uint64_t> &getDimSizes() const { return dimSizes; } |
| |
| /// Getter for the elements array. |
| const std::vector<Element<V>> &getElements() const { return elements; } |
| |
| /// Switch into iterator mode. |
| void startIterator() { |
| iteratorLocked = true; |
| iteratorPos = 0; |
| } |
| |
| /// Get the next element. |
| const Element<V> *getNext() { |
| assert(iteratorLocked && "Attempt to getNext() before startIterator()"); |
| if (iteratorPos < elements.size()) |
| return &(elements[iteratorPos++]); |
| iteratorLocked = false; |
| return nullptr; |
| } |
| |
| /// Factory method. Permutes the original dimensions according to |
| /// the given ordering and expects subsequent add() calls to honor |
| /// that same ordering for the given indices. The result is a |
| /// fully permuted coordinate scheme. |
| /// |
| /// Precondition: `dimSizes` and `perm` must be valid for `rank`. |
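|   /// |
|   /// For example (an illustrative sketch only): with `rank == 2`, |
|   /// `dimSizes == {10, 20}`, and `perm == {1, 0}`, the resulting COO |
|   /// has `getDimSizes() == {20, 10}`, and every subsequent `add()` call |
|   /// must pass its indices in that permuted (target) order. |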
| static SparseTensorCOO<V> *newSparseTensorCOO(uint64_t rank, |
| const uint64_t *dimSizes, |
| const uint64_t *perm, |
| uint64_t capacity = 0) { |
| std::vector<uint64_t> permsz(rank); |
| for (uint64_t r = 0; r < rank; r++) { |
| assert(dimSizes[r] > 0 && "Dimension size zero has trivial storage"); |
| permsz[perm[r]] = dimSizes[r]; |
| } |
| return new SparseTensorCOO<V>(permsz, capacity); |
| } |
| |
| private: |
| const std::vector<uint64_t> dimSizes; // per-dimension sizes |
| std::vector<Element<V>> elements; // all COO elements |
| std::vector<uint64_t> indices; // shared index pool |
| bool iteratorLocked = false; |
|   uint64_t iteratorPos = 0; |
| }; |
| |
| // Forward. |
| template <typename V> |
| class SparseTensorEnumeratorBase; |
| |
| // Helper macro for generating error messages when some |
| // `SparseTensorStorage<P,I,V>` is cast to `SparseTensorStorageBase` |
| // and then the wrong "partial method specialization" is called. |
| #define FATAL_PIV(NAME) FATAL("<P,I,V> type mismatch for: " #NAME); |
| |
| /// Abstract base class for `SparseTensorStorage<P,I,V>`. This class |
| /// takes responsibility for all the `<P,I,V>`-independent aspects |
| /// of the tensor (e.g., shape, sparsity, permutation). In addition, |
| /// we use function overloading to implement "partial" method |
| /// specialization, which the C-API relies on to catch type errors |
| /// arising from our use of opaque pointers. |
| class SparseTensorStorageBase { |
| public: |
| /// Constructs a new storage object. The `perm` maps the tensor's |
| /// semantic-ordering of dimensions to this object's storage-order. |
| /// The `dimSizes` and `sparsity` arrays are already in storage-order. |
| /// |
| /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. |
| SparseTensorStorageBase(const std::vector<uint64_t> &dimSizes, |
| const uint64_t *perm, const DimLevelType *sparsity) |
| : dimSizes(dimSizes), rev(getRank()), |
| dimTypes(sparsity, sparsity + getRank()) { |
| assert(perm && sparsity); |
| const uint64_t rank = getRank(); |
| // Validate parameters. |
| assert(rank > 0 && "Trivial shape is unsupported"); |
| for (uint64_t r = 0; r < rank; r++) { |
| assert(dimSizes[r] > 0 && "Dimension size zero has trivial storage"); |
| assert((dimTypes[r] == DimLevelType::kDense || |
| dimTypes[r] == DimLevelType::kCompressed) && |
| "Unsupported DimLevelType"); |
| } |
| // Construct the "reverse" (i.e., inverse) permutation. |
| for (uint64_t r = 0; r < rank; r++) |
| rev[perm[r]] = r; |
| } |
| |
| virtual ~SparseTensorStorageBase() = default; |
| |
| /// Get the rank of the tensor. |
| uint64_t getRank() const { return dimSizes.size(); } |
| |
| /// Getter for the dimension-sizes array, in storage-order. |
| const std::vector<uint64_t> &getDimSizes() const { return dimSizes; } |
| |
|   /// Safely looks up the size of the given (storage-order) dimension. |
| uint64_t getDimSize(uint64_t d) const { |
| assert(d < getRank()); |
| return dimSizes[d]; |
| } |
| |
| /// Getter for the "reverse" permutation, which maps this object's |
| /// storage-order to the tensor's semantic-order. |
| const std::vector<uint64_t> &getRev() const { return rev; } |
| |
| /// Getter for the dimension-types array, in storage-order. |
| const std::vector<DimLevelType> &getDimTypes() const { return dimTypes; } |
| |
| /// Safely check if the (storage-order) dimension uses compressed storage. |
| bool isCompressedDim(uint64_t d) const { |
| assert(d < getRank()); |
| return (dimTypes[d] == DimLevelType::kCompressed); |
| } |
| |
| /// Allocate a new enumerator. |
| #define DECL_NEWENUMERATOR(VNAME, V) \ |
| virtual void newEnumerator(SparseTensorEnumeratorBase<V> **, uint64_t, \ |
| const uint64_t *) const { \ |
| FATAL_PIV("newEnumerator" #VNAME); \ |
| } |
| FOREVERY_V(DECL_NEWENUMERATOR) |
| #undef DECL_NEWENUMERATOR |
| |
| /// Overhead storage. |
| #define DECL_GETPOINTERS(PNAME, P) \ |
| virtual void getPointers(std::vector<P> **, uint64_t) { \ |
| FATAL_PIV("getPointers" #PNAME); \ |
| } |
| FOREVERY_FIXED_O(DECL_GETPOINTERS) |
| #undef DECL_GETPOINTERS |
| #define DECL_GETINDICES(INAME, I) \ |
| virtual void getIndices(std::vector<I> **, uint64_t) { \ |
| FATAL_PIV("getIndices" #INAME); \ |
| } |
| FOREVERY_FIXED_O(DECL_GETINDICES) |
| #undef DECL_GETINDICES |
| |
| /// Primary storage. |
| #define DECL_GETVALUES(VNAME, V) \ |
| virtual void getValues(std::vector<V> **) { FATAL_PIV("getValues" #VNAME); } |
| FOREVERY_V(DECL_GETVALUES) |
| #undef DECL_GETVALUES |
| |
| /// Element-wise insertion in lexicographic index order. |
| #define DECL_LEXINSERT(VNAME, V) \ |
| virtual void lexInsert(const uint64_t *, V) { FATAL_PIV("lexInsert" #VNAME); } |
| FOREVERY_V(DECL_LEXINSERT) |
| #undef DECL_LEXINSERT |
| |
| /// Expanded insertion. |
| #define DECL_EXPINSERT(VNAME, V) \ |
| virtual void expInsert(uint64_t *, V *, bool *, uint64_t *, uint64_t) { \ |
| FATAL_PIV("expInsert" #VNAME); \ |
| } |
| FOREVERY_V(DECL_EXPINSERT) |
| #undef DECL_EXPINSERT |
| |
| /// Finishes insertion. |
| virtual void endInsert() = 0; |
| |
| protected: |
| // Since this class is virtual, we must disallow public copying in |
| // order to avoid "slicing". Since this class has data members, |
| // that means making copying protected. |
| // <https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md#Rc-copy-virtual> |
| SparseTensorStorageBase(const SparseTensorStorageBase &) = default; |
| // Copy-assignment would be implicitly deleted (because `dimSizes` |
| // is const), so we explicitly delete it for clarity. |
| SparseTensorStorageBase &operator=(const SparseTensorStorageBase &) = delete; |
| |
| private: |
| const std::vector<uint64_t> dimSizes; |
| std::vector<uint64_t> rev; |
| const std::vector<DimLevelType> dimTypes; |
| }; |
| |
| #undef FATAL_PIV |
| |
| // Forward. |
| template <typename P, typename I, typename V> |
| class SparseTensorEnumerator; |
| |
| /// A memory-resident sparse tensor using a storage scheme based on |
| /// per-dimension sparse/dense annotations. This data structure provides a |
| /// bufferized form of a sparse tensor type. In contrast to generating setup |
| /// methods for each differently annotated sparse tensor, this method provides |
| /// a convenient "one-size-fits-all" solution that simply takes an input tensor |
| /// and annotations to implement all required setup in a general manner. |
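| /// |
| /// For example (an illustrative sketch only): the 4x4 matrix |
| /// |
| ///   [ 1 0 0 2 ] |
| ///   [ 0 0 0 0 ] |
| ///   [ 0 3 0 0 ] |
| ///   [ 0 0 4 5 ] |
| /// |
| /// with per-dimension annotations (dense, compressed) and the identity |
| /// permutation is stored in CSR form as |
| /// |
| ///   pointers[1] = [ 0, 2, 2, 3, 5 ] |
| ///   indices[1]  = [ 0, 3, 1, 2, 3 ] |
| ///   values      = [ 1, 2, 3, 4, 5 ] |
| /// |
| /// where the dense outermost dimension needs no overhead storage at all. |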
| template <typename P, typename I, typename V> |
| class SparseTensorStorage final : public SparseTensorStorageBase { |
| /// Private constructor to share code between the other constructors. |
|   /// Beware that the object is not guaranteed to be in a valid state |
|   /// after this constructor alone; e.g., `isCompressedDim(d)` |
| /// doesn't entail `!(pointers[d].empty())`. |
| /// |
| /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. |
| SparseTensorStorage(const std::vector<uint64_t> &dimSizes, |
| const uint64_t *perm, const DimLevelType *sparsity) |
| : SparseTensorStorageBase(dimSizes, perm, sparsity), pointers(getRank()), |
| indices(getRank()), idx(getRank()) {} |
| |
| public: |
| /// Constructs a sparse tensor storage scheme with the given dimensions, |
| /// permutation, and per-dimension dense/sparse annotations, using |
| /// the coordinate scheme tensor for the initial contents if provided. |
| /// |
| /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. |
| SparseTensorStorage(const std::vector<uint64_t> &dimSizes, |
| const uint64_t *perm, const DimLevelType *sparsity, |
| SparseTensorCOO<V> *coo) |
| : SparseTensorStorage(dimSizes, perm, sparsity) { |
| // Provide hints on capacity of pointers and indices. |
| // TODO: needs much fine-tuning based on actual sparsity; currently |
| // we reserve pointer/index space based on all previous dense |
| // dimensions, which works well up to first sparse dim; but |
| // we should really use nnz and dense/sparse distribution. |
| bool allDense = true; |
| uint64_t sz = 1; |
| for (uint64_t r = 0, rank = getRank(); r < rank; r++) { |
| if (isCompressedDim(r)) { |
| // TODO: Take a parameter between 1 and `dimSizes[r]`, and multiply |
| // `sz` by that before reserving. (For now we just use 1.) |
| pointers[r].reserve(sz + 1); |
| pointers[r].push_back(0); |
| indices[r].reserve(sz); |
| sz = 1; |
| allDense = false; |
| } else { // Dense dimension. |
| sz = checkedMul(sz, getDimSizes()[r]); |
| } |
| } |
| // Then assign contents from coordinate scheme tensor if provided. |
| if (coo) { |
| // Ensure both preconditions of `fromCOO`. |
| assert(coo->getDimSizes() == getDimSizes() && "Tensor size mismatch"); |
| coo->sort(); |
| // Now actually insert the `elements`. |
| const std::vector<Element<V>> &elements = coo->getElements(); |
| uint64_t nnz = elements.size(); |
| values.reserve(nnz); |
| fromCOO(elements, 0, nnz, 0); |
| } else if (allDense) { |
| values.resize(sz, 0); |
| } |
| } |
| |
| /// Constructs a sparse tensor storage scheme with the given dimensions, |
| /// permutation, and per-dimension dense/sparse annotations, using |
| /// the given sparse tensor for the initial contents. |
| /// |
| /// Preconditions: |
| /// * `perm` and `sparsity` must be valid for `dimSizes.size()`. |
| /// * The `tensor` must have the same value type `V`. |
| SparseTensorStorage(const std::vector<uint64_t> &dimSizes, |
| const uint64_t *perm, const DimLevelType *sparsity, |
| const SparseTensorStorageBase &tensor); |
| |
| ~SparseTensorStorage() final = default; |
| |
| /// Partially specialize these getter methods based on template types. |
| void getPointers(std::vector<P> **out, uint64_t d) final { |
| assert(d < getRank()); |
| *out = &pointers[d]; |
| } |
| void getIndices(std::vector<I> **out, uint64_t d) final { |
| assert(d < getRank()); |
| *out = &indices[d]; |
| } |
| void getValues(std::vector<V> **out) final { *out = &values; } |
| |
| /// Partially specialize lexicographical insertions based on template types. |
| void lexInsert(const uint64_t *cursor, V val) final { |
| // First, wrap up pending insertion path. |
| uint64_t diff = 0; |
| uint64_t top = 0; |
| if (!values.empty()) { |
| diff = lexDiff(cursor); |
| endPath(diff + 1); |
| top = idx[diff] + 1; |
| } |
| // Then continue with insertion path. |
| insPath(cursor, diff, top, val); |
| } |
| |
| /// Partially specialize expanded insertions based on template types. |
| /// Note that this method resets the values/filled-switch array back |
| /// to all-zero/false while only iterating over the nonzero elements. |
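|   /// |
|   /// For example (illustrative values only): with `count == 2`, |
|   /// `added == {7, 3}`, and `filled[3] == filled[7] == true`, this |
|   /// inserts `values[3]` and then `values[7]` at indices 3 and 7 of |
|   /// the innermost dimension (after sorting `added`), and resets those |
|   /// two entries of `values` and `filled` back to zero/false. |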
| void expInsert(uint64_t *cursor, V *values, bool *filled, uint64_t *added, |
| uint64_t count) final { |
| if (count == 0) |
| return; |
| // Sort. |
| std::sort(added, added + count); |
| // Restore insertion path for first insert. |
| const uint64_t lastDim = getRank() - 1; |
| uint64_t index = added[0]; |
| cursor[lastDim] = index; |
| lexInsert(cursor, values[index]); |
| assert(filled[index]); |
| values[index] = 0; |
| filled[index] = false; |
| // Subsequent insertions are quick. |
| for (uint64_t i = 1; i < count; i++) { |
| assert(index < added[i] && "non-lexicographic insertion"); |
| index = added[i]; |
| cursor[lastDim] = index; |
| insPath(cursor, lastDim, added[i - 1] + 1, values[index]); |
| assert(filled[index]); |
| values[index] = 0; |
| filled[index] = false; |
| } |
| } |
| |
| /// Finalizes lexicographic insertions. |
| void endInsert() final { |
| if (values.empty()) |
| finalizeSegment(0); |
| else |
| endPath(0); |
| } |
| |
| void newEnumerator(SparseTensorEnumeratorBase<V> **out, uint64_t rank, |
| const uint64_t *perm) const final { |
| *out = new SparseTensorEnumerator<P, I, V>(*this, rank, perm); |
| } |
| |
| /// Returns this sparse tensor storage scheme as a new memory-resident |
| /// sparse tensor in coordinate scheme with the given dimension order. |
| /// |
| /// Precondition: `perm` must be valid for `getRank()`. |
| SparseTensorCOO<V> *toCOO(const uint64_t *perm) const { |
| SparseTensorEnumeratorBase<V> *enumerator; |
| newEnumerator(&enumerator, getRank(), perm); |
| SparseTensorCOO<V> *coo = |
| new SparseTensorCOO<V>(enumerator->permutedSizes(), values.size()); |
| enumerator->forallElements([&coo](const std::vector<uint64_t> &ind, V val) { |
| coo->add(ind, val); |
| }); |
| // TODO: This assertion assumes there are no stored zeros, |
| // or if there are then that we don't filter them out. |
| // Cf., <https://github.com/llvm/llvm-project/issues/54179> |
| assert(coo->getElements().size() == values.size()); |
| delete enumerator; |
| return coo; |
| } |
| |
| /// Factory method. Constructs a sparse tensor storage scheme with the given |
| /// dimensions, permutation, and per-dimension dense/sparse annotations, |
| /// using the coordinate scheme tensor for the initial contents if provided. |
| /// In the latter case, the coordinate scheme must respect the same |
| /// permutation as is desired for the new sparse tensor storage. |
| /// |
| /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. |
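|   /// |
|   /// A minimal usage sketch (all sizes and values are illustrative only): |
|   /// |
|   ///   uint64_t shape[] = {4, 4}; |
|   ///   uint64_t perm[] = {0, 1}; |
|   ///   DimLevelType sparsity[] = {DimLevelType::kDense, |
|   ///                              DimLevelType::kCompressed}; |
|   ///   auto *coo = |
|   ///       SparseTensorCOO<double>::newSparseTensorCOO(2, shape, perm); |
|   ///   coo->add({0, 3}, 2.0); |
|   ///   auto *csr = SparseTensorStorage<uint64_t, uint64_t, double>:: |
|   ///       newSparseTensor(2, shape, perm, sparsity, coo); |
|   ///   delete coo; // the storage object does not take ownership |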
| static SparseTensorStorage<P, I, V> * |
| newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, |
| const DimLevelType *sparsity, SparseTensorCOO<V> *coo) { |
| SparseTensorStorage<P, I, V> *n = nullptr; |
| if (coo) { |
| const auto &coosz = coo->getDimSizes(); |
| assertPermutedSizesMatchShape(coosz, rank, perm, shape); |
| n = new SparseTensorStorage<P, I, V>(coosz, perm, sparsity, coo); |
| } else { |
| std::vector<uint64_t> permsz(rank); |
| for (uint64_t r = 0; r < rank; r++) { |
| assert(shape[r] > 0 && "Dimension size zero has trivial storage"); |
| permsz[perm[r]] = shape[r]; |
| } |
| // We pass the null `coo` to ensure we select the intended constructor. |
| n = new SparseTensorStorage<P, I, V>(permsz, perm, sparsity, coo); |
| } |
| return n; |
| } |
| |
| /// Factory method. Constructs a sparse tensor storage scheme with |
| /// the given dimensions, permutation, and per-dimension dense/sparse |
| /// annotations, using the sparse tensor for the initial contents. |
| /// |
| /// Preconditions: |
| /// * `shape`, `perm`, and `sparsity` must be valid for `rank`. |
| /// * The `tensor` must have the same value type `V`. |
| static SparseTensorStorage<P, I, V> * |
| newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, |
| const DimLevelType *sparsity, |
| const SparseTensorStorageBase *source) { |
| assert(source && "Got nullptr for source"); |
| SparseTensorEnumeratorBase<V> *enumerator; |
| source->newEnumerator(&enumerator, rank, perm); |
| const auto &permsz = enumerator->permutedSizes(); |
| assertPermutedSizesMatchShape(permsz, rank, perm, shape); |
| auto *tensor = |
| new SparseTensorStorage<P, I, V>(permsz, perm, sparsity, *source); |
| delete enumerator; |
| return tensor; |
| } |
| |
| private: |
| /// Appends an arbitrary new position to `pointers[d]`. This method |
| /// checks that `pos` is representable in the `P` type; however, it |
| /// does not check that `pos` is semantically valid (i.e., larger than |
| /// the previous position and smaller than `indices[d].capacity()`). |
| void appendPointer(uint64_t d, uint64_t pos, uint64_t count = 1) { |
| assert(isCompressedDim(d)); |
| assert(pos <= std::numeric_limits<P>::max() && |
| "Pointer value is too large for the P-type"); |
| pointers[d].insert(pointers[d].end(), count, static_cast<P>(pos)); |
| } |
| |
| /// Appends index `i` to dimension `d`, in the semantically general |
| /// sense. For non-dense dimensions, that means appending to the |
| /// `indices[d]` array, checking that `i` is representable in the `I` |
| /// type; however, we do not verify other semantic requirements (e.g., |
| /// that `i` is in bounds for `dimSizes[d]`, and not previously occurring |
| /// in the same segment). For dense dimensions, this method instead |
| /// appends the appropriate number of zeros to the `values` array, |
| /// where `full` is the number of "entries" already written to `values` |
| /// for this segment (aka one after the highest index previously appended). |
| void appendIndex(uint64_t d, uint64_t full, uint64_t i) { |
| if (isCompressedDim(d)) { |
| assert(i <= std::numeric_limits<I>::max() && |
| "Index value is too large for the I-type"); |
| indices[d].push_back(static_cast<I>(i)); |
| } else { // Dense dimension. |
| assert(i >= full && "Index was already filled"); |
| if (i == full) |
| return; // Short-circuit, since it'll be a nop. |
| if (d + 1 == getRank()) |
| values.insert(values.end(), i - full, 0); |
| else |
| finalizeSegment(d + 1, 0, i - full); |
| } |
| } |
| |
| /// Writes the given coordinate to `indices[d][pos]`. This method |
| /// checks that `i` is representable in the `I` type; however, it |
| /// does not check that `i` is semantically valid (i.e., in bounds |
| /// for `dimSizes[d]` and not elsewhere occurring in the same segment). |
| void writeIndex(uint64_t d, uint64_t pos, uint64_t i) { |
| assert(isCompressedDim(d)); |
| // Subscript assignment to `std::vector` requires that the `pos`-th |
| // entry has been initialized; thus we must be sure to check `size()` |
| // here, instead of `capacity()` as would be ideal. |
| assert(pos < indices[d].size() && "Index position is out of bounds"); |
| assert(i <= std::numeric_limits<I>::max() && |
| "Index value is too large for the I-type"); |
| indices[d][pos] = static_cast<I>(i); |
| } |
| |
| /// Computes the assembled-size associated with the `d`-th dimension, |
| /// given the assembled-size associated with the `(d-1)`-th dimension. |
| /// "Assembled-sizes" correspond to the (nominal) sizes of overhead |
| /// storage, as opposed to "dimension-sizes" which are the cardinality |
| /// of coordinates for that dimension. |
| /// |
| /// Precondition: the `pointers[d]` array must be fully initialized |
| /// before calling this method. |
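|   /// |
|   /// For example (using the illustrative CSR tensor from the class |
|   /// documentation): `assembledSize(1, 0) == 4` for the dense row |
|   /// dimension, and `assembledSize(4, 1) == pointers[1][4] == 5` for |
|   /// the compressed column dimension. |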
| uint64_t assembledSize(uint64_t parentSz, uint64_t d) const { |
| if (isCompressedDim(d)) |
| return pointers[d][parentSz]; |
| // else if dense: |
| return parentSz * getDimSizes()[d]; |
| } |
| |
| /// Initializes sparse tensor storage scheme from a memory-resident sparse |
| /// tensor in coordinate scheme. This method prepares the pointers and |
| /// indices arrays under the given per-dimension dense/sparse annotations. |
| /// |
| /// Preconditions: |
| /// (1) the `elements` must be lexicographically sorted. |
| /// (2) the indices of every element are valid for `dimSizes` (equal rank |
| /// and pointwise less-than). |
| void fromCOO(const std::vector<Element<V>> &elements, uint64_t lo, |
| uint64_t hi, uint64_t d) { |
| uint64_t rank = getRank(); |
| assert(d <= rank && hi <= elements.size()); |
| // Once dimensions are exhausted, insert the numerical values. |
| if (d == rank) { |
| assert(lo < hi); |
| values.push_back(elements[lo].value); |
| return; |
| } |
| // Visit all elements in this interval. |
| uint64_t full = 0; |
| while (lo < hi) { // If `hi` is unchanged, then `lo < elements.size()`. |
| // Find segment in interval with same index elements in this dimension. |
| uint64_t i = elements[lo].indices[d]; |
| uint64_t seg = lo + 1; |
| while (seg < hi && elements[seg].indices[d] == i) |
| seg++; |
| // Handle segment in interval for sparse or dense dimension. |
| appendIndex(d, full, i); |
| full = i + 1; |
| fromCOO(elements, lo, seg, d + 1); |
| // And move on to next segment in interval. |
| lo = seg; |
| } |
| // Finalize the sparse pointer structure at this dimension. |
| finalizeSegment(d, full); |
| } |
| |
| /// Finalize the sparse pointer structure at this dimension. |
| void finalizeSegment(uint64_t d, uint64_t full = 0, uint64_t count = 1) { |
| if (count == 0) |
| return; // Short-circuit, since it'll be a nop. |
| if (isCompressedDim(d)) { |
| appendPointer(d, indices[d].size(), count); |
| } else { // Dense dimension. |
| const uint64_t sz = getDimSizes()[d]; |
| assert(sz >= full && "Segment is overfull"); |
| count = checkedMul(count, sz - full); |
| // For dense storage we must enumerate all the remaining coordinates |
| // in this dimension (i.e., coordinates after the last non-zero |
| // element), and either fill in their zero values or else recurse |
| // to finalize some deeper dimension. |
| if (d + 1 == getRank()) |
| values.insert(values.end(), count, 0); |
| else |
| finalizeSegment(d + 1, 0, count); |
| } |
| } |
| |
| /// Wraps up a single insertion path, inner to outer. |
| void endPath(uint64_t diff) { |
| uint64_t rank = getRank(); |
| assert(diff <= rank); |
| for (uint64_t i = 0; i < rank - diff; i++) { |
| const uint64_t d = rank - i - 1; |
| finalizeSegment(d, idx[d] + 1); |
| } |
| } |
| |
| /// Continues a single insertion path, outer to inner. |
| void insPath(const uint64_t *cursor, uint64_t diff, uint64_t top, V val) { |
| uint64_t rank = getRank(); |
| assert(diff < rank); |
| for (uint64_t d = diff; d < rank; d++) { |
| uint64_t i = cursor[d]; |
| appendIndex(d, top, i); |
| top = 0; |
| idx[d] = i; |
| } |
| values.push_back(val); |
| } |
| |
|   /// Finds the lexicographically first dimension in which `cursor` |
|   /// differs from the last inserted index `idx`. |
| uint64_t lexDiff(const uint64_t *cursor) const { |
| for (uint64_t r = 0, rank = getRank(); r < rank; r++) |
| if (cursor[r] > idx[r]) |
| return r; |
| else |
| assert(cursor[r] == idx[r] && "non-lexicographic insertion"); |
|     assert(0 && "duplicate insertion"); |
| return -1u; |
| } |
| |
| // Allow `SparseTensorEnumerator` to access the data-members (to avoid |
| // the cost of virtual-function dispatch in inner loops), without |
| // making them public to other client code. |
| friend class SparseTensorEnumerator<P, I, V>; |
| |
| std::vector<std::vector<P>> pointers; |
| std::vector<std::vector<I>> indices; |
| std::vector<V> values; |
| std::vector<uint64_t> idx; // index cursor for lexicographic insertion. |
| }; |
| |
| /// A (higher-order) function object for enumerating the elements of some |
| /// `SparseTensorStorage` under a permutation. That is, the `forallElements` |
| /// method encapsulates the loop-nest for enumerating the elements of |
| /// the source tensor (in whatever order is best for the source tensor), |
| /// and applies a permutation to the coordinates/indices before handing |
| /// each element to the callback. A single enumerator object can be |
| /// freely reused for several calls to `forallElements`, as long as |
| /// those calls are not interleaved with one another. |
| /// |
| /// N.B., this class stores a reference to the `SparseTensorStorageBase` |
| /// passed to the constructor; thus, objects of this class must not |
| /// outlive the sparse tensor they depend on. |
| /// |
| /// Design Note: The reason we define this class instead of simply using |
| /// `SparseTensorEnumerator<P,I,V>` is because we need to hide/generalize |
| /// the `<P,I>` template parameters from MLIR client code (to simplify the |
| /// type parameters used for direct sparse-to-sparse conversion). And the |
| /// reason we define the `SparseTensorEnumerator<P,I,V>` subclasses rather |
| /// than simply using this class, is to avoid the cost of virtual-method |
| /// dispatch within the loop-nest. |
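| /// |
| /// A minimal usage sketch (assuming `tensor` is a rank-2 |
| /// `SparseTensorStorageBase` whose value type is `double`; all names |
| /// are illustrative only): |
| /// |
| ///   uint64_t perm[] = {0, 1}; |
| ///   SparseTensorEnumeratorBase<double> *enumerator; |
| ///   tensor.newEnumerator(&enumerator, 2, perm); |
| ///   enumerator->forallElements( |
| ///       [](const std::vector<uint64_t> &ind, double val) { |
| ///         // consume (ind, val) |
| ///       }); |
| ///   delete enumerator; |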
| template <typename V> |
| class SparseTensorEnumeratorBase { |
| public: |
| /// Constructs an enumerator with the given permutation for mapping |
| /// the semantic-ordering of dimensions to the desired target-ordering. |
| /// |
| /// Preconditions: |
| /// * the `tensor` must have the same `V` value type. |
| /// * `perm` must be valid for `rank`. |
| SparseTensorEnumeratorBase(const SparseTensorStorageBase &tensor, |
| uint64_t rank, const uint64_t *perm) |
| : src(tensor), permsz(src.getRev().size()), reord(getRank()), |
| cursor(getRank()) { |
| assert(perm && "Received nullptr for permutation"); |
| assert(rank == getRank() && "Permutation rank mismatch"); |
| const auto &rev = src.getRev(); // source-order -> semantic-order |
| const auto &dimSizes = src.getDimSizes(); // in source storage-order |
| for (uint64_t s = 0; s < rank; s++) { // `s` source storage-order |
| uint64_t t = perm[rev[s]]; // `t` target-order |
| reord[s] = t; |
| permsz[t] = dimSizes[s]; |
| } |
| } |
| |
| virtual ~SparseTensorEnumeratorBase() = default; |
| |
| // We disallow copying to help avoid leaking the `src` reference. |
| // (In addition to avoiding the problem of slicing.) |
| SparseTensorEnumeratorBase(const SparseTensorEnumeratorBase &) = delete; |
| SparseTensorEnumeratorBase & |
| operator=(const SparseTensorEnumeratorBase &) = delete; |
| |
| /// Returns the source/target tensor's rank. (The source-rank and |
| /// target-rank are always equal since we only support permutations. |
| /// Though once we add support for other dimension mappings, this |
| /// method will have to be split in two.) |
| uint64_t getRank() const { return permsz.size(); } |
| |
| /// Returns the target tensor's dimension sizes. |
| const std::vector<uint64_t> &permutedSizes() const { return permsz; } |
| |
| /// Enumerates all elements of the source tensor, permutes their |
| /// indices, and passes the permuted element to the callback. |
| /// The callback must not store the cursor reference directly, |
| /// since this function reuses the storage. Instead, the callback |
|   /// must copy the indices if it wants to keep them. |
| virtual void forallElements(ElementConsumer<V> yield) = 0; |
| |
| protected: |
| const SparseTensorStorageBase &src; |
| std::vector<uint64_t> permsz; // in target order. |
| std::vector<uint64_t> reord; // source storage-order -> target order. |
| std::vector<uint64_t> cursor; // in target order. |
| }; |
| |
| template <typename P, typename I, typename V> |
| class SparseTensorEnumerator final : public SparseTensorEnumeratorBase<V> { |
| using Base = SparseTensorEnumeratorBase<V>; |
| |
| public: |
| /// Constructs an enumerator with the given permutation for mapping |
| /// the semantic-ordering of dimensions to the desired target-ordering. |
| /// |
| /// Precondition: `perm` must be valid for `rank`. |
| SparseTensorEnumerator(const SparseTensorStorage<P, I, V> &tensor, |
| uint64_t rank, const uint64_t *perm) |
| : Base(tensor, rank, perm) {} |
| |
| ~SparseTensorEnumerator() final = default; |
| |
| void forallElements(ElementConsumer<V> yield) final { |
| forallElements(yield, 0, 0); |
| } |
| |
| private: |
| /// The recursive component of the public `forallElements`. |
| void forallElements(ElementConsumer<V> yield, uint64_t parentPos, |
| uint64_t d) { |
| // Recover the `<P,I,V>` type parameters of `src`. |
| const auto &src = |
| static_cast<const SparseTensorStorage<P, I, V> &>(this->src); |
| if (d == Base::getRank()) { |
| assert(parentPos < src.values.size() && |
| "Value position is out of bounds"); |
| // TODO: <https://github.com/llvm/llvm-project/issues/54179> |
| yield(this->cursor, src.values[parentPos]); |
| } else if (src.isCompressedDim(d)) { |
| // Look up the bounds of the `d`-level segment determined by the |
| // `d-1`-level position `parentPos`. |
| const std::vector<P> &pointersD = src.pointers[d]; |
| assert(parentPos + 1 < pointersD.size() && |
| "Parent pointer position is out of bounds"); |
| const uint64_t pstart = static_cast<uint64_t>(pointersD[parentPos]); |
| const uint64_t pstop = static_cast<uint64_t>(pointersD[parentPos + 1]); |
| // Loop-invariant code for looking up the `d`-level coordinates/indices. |
| const std::vector<I> &indicesD = src.indices[d]; |
| assert(pstop <= indicesD.size() && "Index position is out of bounds"); |
| uint64_t &cursorReordD = this->cursor[this->reord[d]]; |
| for (uint64_t pos = pstart; pos < pstop; pos++) { |
| cursorReordD = static_cast<uint64_t>(indicesD[pos]); |
| forallElements(yield, pos, d + 1); |
| } |
| } else { // Dense dimension. |
| const uint64_t sz = src.getDimSizes()[d]; |
| const uint64_t pstart = parentPos * sz; |
| uint64_t &cursorReordD = this->cursor[this->reord[d]]; |
| for (uint64_t i = 0; i < sz; i++) { |
| cursorReordD = i; |
| forallElements(yield, pstart + i, d + 1); |
| } |
| } |
| } |
| }; |
| |
| /// Statistics regarding the number of nonzero subtensors in |
| /// a source tensor, for direct sparse=>sparse conversion a la |
| /// <https://arxiv.org/abs/2001.02609>. |
| /// |
| /// N.B., this class stores references to the parameters passed to |
| /// the constructor; thus, objects of this class must not outlive |
| /// those parameters. |
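| /// |
| /// For example (using the illustrative 4x4 CSR tensor from the |
| /// `SparseTensorStorage` documentation): the statistics for the |
| /// compressed column dimension are `nnz[1] == {2, 0, 1, 2}`, i.e., |
| /// the number of stored columns within each row. |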
| class SparseTensorNNZ final { |
| public: |
| /// Allocate the statistics structure for the desired sizes and |
| /// sparsity (in the target tensor's storage-order). This constructor |
| /// does not actually populate the statistics, however; for that see |
| /// `initialize`. |
| /// |
| /// Precondition: `dimSizes` must not contain zeros. |
| SparseTensorNNZ(const std::vector<uint64_t> &dimSizes, |
| const std::vector<DimLevelType> &sparsity) |
| : dimSizes(dimSizes), dimTypes(sparsity), nnz(getRank()) { |
| assert(dimSizes.size() == dimTypes.size() && "Rank mismatch"); |
| bool uncompressed = true; |
| (void)uncompressed; |
|     uint64_t sz = 1; // the product of the sizes of all dimensions before `r`. |
| for (uint64_t rank = getRank(), r = 0; r < rank; r++) { |
| switch (dimTypes[r]) { |
| case DimLevelType::kCompressed: |
| assert(uncompressed && |
| "Multiple compressed layers not currently supported"); |
| uncompressed = false; |
| nnz[r].resize(sz, 0); // Both allocate and zero-initialize. |
| break; |
| case DimLevelType::kDense: |
| assert(uncompressed && |
| "Dense after compressed not currently supported"); |
| break; |
| case DimLevelType::kSingleton: |
| // Singleton after Compressed causes no problems for allocating |
| // `nnz` nor for the yieldPos loop. This remains true even |
| // when adding support for multiple compressed dimensions or |
| // for dense-after-compressed. |
| break; |
| } |
| sz = checkedMul(sz, dimSizes[r]); |
| } |
| } |
| |
| // We disallow copying to help avoid leaking the stored references. |
| SparseTensorNNZ(const SparseTensorNNZ &) = delete; |
| SparseTensorNNZ &operator=(const SparseTensorNNZ &) = delete; |
| |
| /// Returns the rank of the target tensor. |
| uint64_t getRank() const { return dimSizes.size(); } |
| |
| /// Enumerate the source tensor to fill in the statistics. The |
| /// enumerator should already incorporate the permutation (from |
| /// semantic-order to the target storage-order). |
| template <typename V> |
| void initialize(SparseTensorEnumeratorBase<V> &enumerator) { |
| assert(enumerator.getRank() == getRank() && "Tensor rank mismatch"); |
| assert(enumerator.permutedSizes() == dimSizes && "Tensor size mismatch"); |
| enumerator.forallElements( |
| [this](const std::vector<uint64_t> &ind, V) { add(ind); }); |
| } |
| |
| /// The type of callback functions which receive an nnz-statistic. |
| using NNZConsumer = const std::function<void(uint64_t)> &; |
| |
|   /// Lexicographically enumerates all indices for dimensions strictly |
| /// less than `stopDim`, and passes their nnz statistic to the callback. |
| /// Since our use-case only requires the statistic not the coordinates |
| /// themselves, we do not bother to construct those coordinates. |
| void forallIndices(uint64_t stopDim, NNZConsumer yield) const { |
| assert(stopDim < getRank() && "Stopping-dimension is out of bounds"); |
| assert(dimTypes[stopDim] == DimLevelType::kCompressed && |
| "Cannot look up non-compressed dimensions"); |
| forallIndices(yield, stopDim, 0, 0); |
| } |
| |
| private: |
|   /// Adds a new element (i.e., increments its statistics). We use |
| /// a method rather than inlining into the lambda in `initialize`, |
| /// to avoid spurious templating over `V`. And this method is private |
| /// to avoid needing to re-assert validity of `ind` (which is guaranteed |
| /// by `forallElements`). |
| void add(const std::vector<uint64_t> &ind) { |
| uint64_t parentPos = 0; |
| for (uint64_t rank = getRank(), r = 0; r < rank; r++) { |
| if (dimTypes[r] == DimLevelType::kCompressed) |
| nnz[r][parentPos]++; |
| parentPos = parentPos * dimSizes[r] + ind[r]; |
| } |
| } |
| |
| /// Recursive component of the public `forallIndices`. |
| void forallIndices(NNZConsumer yield, uint64_t stopDim, uint64_t parentPos, |
| uint64_t d) const { |
| assert(d <= stopDim); |
| if (d == stopDim) { |
| assert(parentPos < nnz[d].size() && "Cursor is out of range"); |
| yield(nnz[d][parentPos]); |
| } else { |
| const uint64_t sz = dimSizes[d]; |
| const uint64_t pstart = parentPos * sz; |
| for (uint64_t i = 0; i < sz; i++) |
| forallIndices(yield, stopDim, pstart + i, d + 1); |
| } |
| } |
| |
| // All of these are in the target storage-order. |
| const std::vector<uint64_t> &dimSizes; |
| const std::vector<DimLevelType> &dimTypes; |
| std::vector<std::vector<uint64_t>> nnz; |
| }; |
| |
| template <typename P, typename I, typename V> |
| SparseTensorStorage<P, I, V>::SparseTensorStorage( |
| const std::vector<uint64_t> &dimSizes, const uint64_t *perm, |
| const DimLevelType *sparsity, const SparseTensorStorageBase &tensor) |
| : SparseTensorStorage(dimSizes, perm, sparsity) { |
| SparseTensorEnumeratorBase<V> *enumerator; |
| tensor.newEnumerator(&enumerator, getRank(), perm); |
| { |
| // Initialize the statistics structure. |
| SparseTensorNNZ nnz(getDimSizes(), getDimTypes()); |
| nnz.initialize(*enumerator); |
| // Initialize "pointers" overhead (and allocate "indices", "values"). |
| uint64_t parentSz = 1; // assembled-size (not dimension-size) of `r-1`. |
| for (uint64_t rank = getRank(), r = 0; r < rank; r++) { |
| if (isCompressedDim(r)) { |
| pointers[r].reserve(parentSz + 1); |
| pointers[r].push_back(0); |
| uint64_t currentPos = 0; |
| nnz.forallIndices(r, [this, ¤tPos, r](uint64_t n) { |
| currentPos += n; |
| appendPointer(r, currentPos); |
| }); |
| assert(pointers[r].size() == parentSz + 1 && |
| "Final pointers size doesn't match allocated size"); |
| // That assertion entails `assembledSize(parentSz, r)` |
| // is now in a valid state. That is, `pointers[r][parentSz]` |
| // equals the present value of `currentPos`, which is the |
| // correct assembled-size for `indices[r]`. |
| } |
| // Update assembled-size for the next iteration. |
| parentSz = assembledSize(parentSz, r); |
| // Ideally we need only `indices[r].reserve(parentSz)`, however |
| // the `std::vector` implementation forces us to initialize it too. |
| // That is, in the yieldPos loop we need random-access assignment |
| // to `indices[r]`; however, `std::vector`'s subscript-assignment |
| // only allows assigning to already-initialized positions. |
| if (isCompressedDim(r)) |
| indices[r].resize(parentSz, 0); |
| } |
| values.resize(parentSz, 0); // Both allocate and zero-initialize. |
| } |
| // The yieldPos loop |
| enumerator->forallElements([this](const std::vector<uint64_t> &ind, V val) { |
| uint64_t parentSz = 1, parentPos = 0; |
| for (uint64_t rank = getRank(), r = 0; r < rank; r++) { |
| if (isCompressedDim(r)) { |
| // If `parentPos == parentSz` then it's valid as an array-lookup; |
| // however, it's semantically invalid here since that entry |
| // does not represent a segment of `indices[r]`. Moreover, that |
| // entry must be immutable for `assembledSize` to remain valid. |
| assert(parentPos < parentSz && "Pointers position is out of bounds"); |
| const uint64_t currentPos = pointers[r][parentPos]; |
| // This increment won't overflow the `P` type, since it can't |
| // exceed the original value of `pointers[r][parentPos+1]` |
| // which was already verified to be within bounds for `P` |
| // when it was written to the array. |
| pointers[r][parentPos]++; |
| writeIndex(r, currentPos, ind[r]); |
| parentPos = currentPos; |
| } else { // Dense dimension. |
| parentPos = parentPos * getDimSizes()[r] + ind[r]; |
| } |
| parentSz = assembledSize(parentSz, r); |
| } |
| assert(parentPos < values.size() && "Value position is out of bounds"); |
| values[parentPos] = val; |
| }); |
| // No longer need the enumerator, so we'll delete it ASAP. |
| delete enumerator; |
| // The finalizeYieldPos loop |
| for (uint64_t parentSz = 1, rank = getRank(), r = 0; r < rank; r++) { |
| if (isCompressedDim(r)) { |
| assert(parentSz == pointers[r].size() - 1 && |
| "Actual pointers size doesn't match the expected size"); |
| // Can't check all of them, but at least we can check the last one. |
| assert(pointers[r][parentSz - 1] == pointers[r][parentSz] && |
| "Pointers got corrupted"); |
| // TODO: optimize this by using `memmove` or similar. |
| for (uint64_t n = 0; n < parentSz; n++) { |
| const uint64_t parentPos = parentSz - n; |
| pointers[r][parentPos] = pointers[r][parentPos - 1]; |
| } |
| pointers[r][0] = 0; |
| } |
| parentSz = assembledSize(parentSz, r); |
| } |
| } |
| |
| /// Helper to convert string to lower case. |
| static char *toLower(char *token) { |
| for (char *c = token; *c; c++) |
|     *c = static_cast<char>(tolower(static_cast<unsigned char>(*c))); |
| return token; |
| } |
| |
| /// This class abstracts over the information stored in file headers, |
| /// as well as providing the buffers and methods for parsing those headers. |
| class SparseTensorFile final { |
| public: |
| enum class ValueKind { |
| kInvalid = 0, |
| kPattern = 1, |
| kReal = 2, |
| kInteger = 3, |
| kComplex = 4, |
| kUndefined = 5 |
| }; |
| |
| explicit SparseTensorFile(char *filename) : filename(filename) { |
| assert(filename && "Received nullptr for filename"); |
| } |
| |
| // Disallows copying, to avoid duplicating the `file` pointer. |
| SparseTensorFile(const SparseTensorFile &) = delete; |
| SparseTensorFile &operator=(const SparseTensorFile &) = delete; |
| |
| // This dtor tries to avoid leaking the `file`. (Though it's better |
| // to call `closeFile` explicitly when possible, since there are |
| // circumstances where dtors are not called reliably.) |
| ~SparseTensorFile() { closeFile(); } |
| |
| /// Opens the file for reading. |
| void openFile() { |
| if (file) |
| FATAL("Already opened file %s\n", filename); |
| file = fopen(filename, "r"); |
| if (!file) |
| FATAL("Cannot find file %s\n", filename); |
| } |
| |
| /// Closes the file. |
| void closeFile() { |
| if (file) { |
| fclose(file); |
| file = nullptr; |
| } |
| } |
| |
| // TODO(wrengr/bixia): figure out how to reorganize the element-parsing |
| // loop of `openSparseTensorCOO` into methods of this class, so we can |
| // avoid leaking access to the `line` pointer (both for general hygiene |
| // and because we can't mark it const due to the second argument of |
|   // `strtoul`/`strtod` being `char * *restrict` rather than |
| // `char const* *restrict`). |
| // |
| /// Attempts to read a line from the file. |
| char *readLine() { |
| if (fgets(line, kColWidth, file)) |
| return line; |
| FATAL("Cannot read next line of %s\n", filename); |
| } |
| |
| /// Reads and parses the file's header. |
| void readHeader() { |
| assert(file && "Attempt to readHeader() before openFile()"); |
| if (strstr(filename, ".mtx")) |
| readMMEHeader(); |
| else if (strstr(filename, ".tns")) |
| readExtFROSTTHeader(); |
| else |
| FATAL("Unknown format %s\n", filename); |
| assert(isValid() && "Failed to read the header"); |
| } |
| |
| ValueKind getValueKind() const { return valueKind_; } |
| |
| bool isValid() const { return valueKind_ != ValueKind::kInvalid; } |
| |
| /// Gets the MME "pattern" property setting. Is only valid after |
| /// parsing the header. |
| bool isPattern() const { |
| assert(isValid() && "Attempt to isPattern() before readHeader()"); |
| return valueKind_ == ValueKind::kPattern; |
| } |
| |
| /// Gets the MME "symmetric" property setting. Is only valid after |
| /// parsing the header. |
| bool isSymmetric() const { |
| assert(isValid() && "Attempt to isSymmetric() before readHeader()"); |
| return isSymmetric_; |
| } |
| |
| /// Gets the rank of the tensor. Is only valid after parsing the header. |
| uint64_t getRank() const { |
| assert(isValid() && "Attempt to getRank() before readHeader()"); |
| return idata[0]; |
| } |
| |
| /// Gets the number of non-zeros. Is only valid after parsing the header. |
| uint64_t getNNZ() const { |
| assert(isValid() && "Attempt to getNNZ() before readHeader()"); |
| return idata[1]; |
| } |
| |
| /// Gets the dimension-sizes array. The pointer itself is always |
| /// valid; however, the values stored therein are only valid after |
| /// parsing the header. |
| const uint64_t *getDimSizes() const { return idata + 2; } |
| |
| /// Safely gets the size of the given dimension. Is only valid |
| /// after parsing the header. |
| uint64_t getDimSize(uint64_t d) const { |
| assert(d < getRank()); |
| return idata[2 + d]; |
| } |
| |
| /// Asserts the shape subsumes the actual dimension sizes. Is only |
| /// valid after parsing the header. |
| void assertMatchesShape(uint64_t rank, const uint64_t *shape) const { |
| assert(rank == getRank() && "Rank mismatch"); |
| for (uint64_t r = 0; r < rank; r++) |
| assert((shape[r] == 0 || shape[r] == idata[2 + r]) && |
| "Dimension size mismatch"); |
| } |
| |
| private: |
| void readMMEHeader(); |
| void readExtFROSTTHeader(); |
| |
| const char *filename; |
| FILE *file = nullptr; |
| ValueKind valueKind_ = ValueKind::kInvalid; |
| bool isSymmetric_ = false; |
| uint64_t idata[512]; |
| char line[kColWidth]; |
| }; |
| |
| /// Read the MME header of a general sparse matrix. |
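| /// |
| /// For example (an illustrative 3x4 real matrix with 2 nonzeros), the |
| /// header portion of the file looks like |
| /// |
| ///   %%MatrixMarket matrix coordinate real general |
| ///   % optional comment lines |
| ///   3 4 2 |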
| void SparseTensorFile::readMMEHeader() { |
| char header[64]; |
| char object[64]; |
| char format[64]; |
| char field[64]; |
| char symmetry[64]; |
| // Read header line. |
| if (fscanf(file, "%63s %63s %63s %63s %63s\n", header, object, format, field, |
| symmetry) != 5) |
| FATAL("Corrupt header in %s\n", filename); |
|   // Process `field`, which specifies either "pattern" or the data type |
|   // of the values. |
| if (strcmp(toLower(field), "pattern") == 0) |
| valueKind_ = ValueKind::kPattern; |
| else if (strcmp(toLower(field), "real") == 0) |
| valueKind_ = ValueKind::kReal; |
| else if (strcmp(toLower(field), "integer") == 0) |
| valueKind_ = ValueKind::kInteger; |
| else if (strcmp(toLower(field), "complex") == 0) |
| valueKind_ = ValueKind::kComplex; |
| else |
| FATAL("Unexpected header field value in %s\n", filename); |
| |
| // Set properties. |
| isSymmetric_ = (strcmp(toLower(symmetry), "symmetric") == 0); |
| // Make sure this is a general sparse matrix. |
| if (strcmp(toLower(header), "%%matrixmarket") || |
| strcmp(toLower(object), "matrix") || |
| strcmp(toLower(format), "coordinate") || |
| (strcmp(toLower(symmetry), "general") && !isSymmetric_)) |
| FATAL("Cannot find a general sparse matrix in %s\n", filename); |
| // Skip comments. |
| while (true) { |
| readLine(); |
| if (line[0] != '%') |
| break; |
| } |
| // Next line contains M N NNZ. |
| idata[0] = 2; // rank |
| if (sscanf(line, "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n", idata + 2, idata + 3, |
| idata + 1) != 3) |
| FATAL("Cannot find size in %s\n", filename); |
| } |
| |
| /// Read the "extended" FROSTT header. Although not part of the documented |
| /// format, we assume that the file starts with optional comments followed |
| /// by two lines that define the rank, the number of nonzeros, and the |
| /// dimension sizes (one per rank) of the sparse tensor. |
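| /// |
| /// For example (an illustrative 2x3 matrix with 2 nonzeros): |
| /// |
| ///   # optional comment lines |
| ///   2 2 |
| ///   2 3 |
| ///   1 1 1.0 |
| ///   2 3 4.1 |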
| void SparseTensorFile::readExtFROSTTHeader() { |
| // Skip comments. |
| while (true) { |
| readLine(); |
| if (line[0] != '#') |
| break; |
| } |
| // Next line contains RANK and NNZ. |
| if (sscanf(line, "%" PRIu64 "%" PRIu64 "\n", idata, idata + 1) != 2) |
| FATAL("Cannot find metadata in %s\n", filename); |
| // Followed by a line with the dimension sizes (one per rank). |
| for (uint64_t r = 0; r < idata[0]; r++) |
| if (fscanf(file, "%" PRIu64, idata + 2 + r) != 1) |
|       FATAL("Cannot find dimension size in %s\n", filename); |
| readLine(); // end of line |
| // The FROSTT format does not define the data type of the nonzero elements. |
| valueKind_ = ValueKind::kUndefined; |
| } |
| |
| // Adds a value to a tensor in coordinate scheme. If is_symmetric_value is true, |
| // also adds the value to its symmetric location. |
| template <typename T, typename V> |
| static inline void addValue(T *coo, V value, |
| const std::vector<uint64_t> indices, |
| bool is_symmetric_value) { |
| // TODO: <https://github.com/llvm/llvm-project/issues/54179> |
| coo->add(indices, value); |
|   // We currently choose to deal with symmetric matrices by fully constructing |
| // them. In the future, we may want to make symmetry implicit for storage |
| // reasons. |
| if (is_symmetric_value) |
| coo->add({indices[1], indices[0]}, value); |
| } |
| |
| // Reads an element of a complex type for the current indices in coordinate |
| // scheme. |
| template <typename V> |
| static inline void readCOOValue(SparseTensorCOO<std::complex<V>> *coo, |
| const std::vector<uint64_t> indices, |
| char **linePtr, bool is_pattern, |
| bool add_symmetric_value) { |
| // Read two values to make a complex. The external formats always store |
| // numerical values with the type double, but we cast these values to the |
| // sparse tensor object type. For a pattern tensor, we arbitrarily pick the |
| // value 1 for all entries. |
| V re = is_pattern ? 1.0 : strtod(*linePtr, linePtr); |
| V im = is_pattern ? 1.0 : strtod(*linePtr, linePtr); |
| std::complex<V> value = {re, im}; |
| addValue(coo, value, indices, add_symmetric_value); |
| } |
| |
| // Reads an element of a non-complex type for the current indices in coordinate |
| // scheme. |
| template <typename V, |
| typename std::enable_if< |
| !std::is_same<std::complex<float>, V>::value && |
| !std::is_same<std::complex<double>, V>::value>::type * = nullptr> |
| static void inline readCOOValue(SparseTensorCOO<V> *coo, |
| const std::vector<uint64_t> indices, |
| char **linePtr, bool is_pattern, |
| bool is_symmetric_value) { |
| // The external formats always store these numerical values with the type |
| // double, but we cast these values to the sparse tensor object type. |
| // For a pattern tensor, we arbitrarily pick the value 1 for all entries. |
| double value = is_pattern ? 1.0 : strtod(*linePtr, linePtr); |
| addValue(coo, value, indices, is_symmetric_value); |
| } |
| |
| /// Reads a sparse tensor with the given filename into a memory-resident |
| /// sparse tensor in coordinate scheme. |
| template <typename V> |
| static SparseTensorCOO<V> * |
| openSparseTensorCOO(char *filename, uint64_t rank, const uint64_t *shape, |
| const uint64_t *perm, PrimaryType valTp) { |
| SparseTensorFile stfile(filename); |
| stfile.openFile(); |
| stfile.readHeader(); |
| // Check tensor element type against the value type in the input file. |
| SparseTensorFile::ValueKind valueKind = stfile.getValueKind(); |
| bool tensorIsInteger = |
| (valTp >= PrimaryType::kI64 && valTp <= PrimaryType::kI8); |
| bool tensorIsReal = (valTp >= PrimaryType::kF64 && valTp <= PrimaryType::kI8); |
| if ((valueKind == SparseTensorFile::ValueKind::kReal && tensorIsInteger) || |
| (valueKind == SparseTensorFile::ValueKind::kComplex && tensorIsReal)) { |
| FATAL("Tensor element type %d not compatible with values in file %s\n", |
| static_cast<int>(valTp), filename); |
| } |
| stfile.assertMatchesShape(rank, shape); |
| // Prepare sparse tensor object with per-dimension sizes |
| // and the number of nonzeros as initial capacity. |
| uint64_t nnz = stfile.getNNZ(); |
| auto *coo = SparseTensorCOO<V>::newSparseTensorCOO(rank, stfile.getDimSizes(), |
| perm, nnz); |
| // Read all nonzero elements. |
| std::vector<uint64_t> indices(rank); |
| for (uint64_t k = 0; k < nnz; k++) { |
| char *linePtr = stfile.readLine(); |
| for (uint64_t r = 0; r < rank; r++) { |
| uint64_t idx = strtoul(linePtr, &linePtr, 10); |
| // Add 0-based index. |
| indices[perm[r]] = idx - 1; |
| } |
| readCOOValue(coo, indices, &linePtr, stfile.isPattern(), |
| stfile.isSymmetric() && indices[0] != indices[1]); |
| } |
| // Close the file and return tensor. |
| stfile.closeFile(); |
| return coo; |
| } |
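| 
| // A sketch of direct use (illustrative only; the file name, shape, and
| // ordering below are assumptions). The shape passed in is checked against
| // the file header by assertMatchesShape, and the caller owns the returned
| // COO object:
| //
| //   char filename[] = "wave.mtx"; // hypothetical input file
| //   uint64_t shape[] = {1024, 1024};
| //   uint64_t perm[] = {0, 1}; // identity dimension ordering
| //   SparseTensorCOO<double> *coo = openSparseTensorCOO<double>(
| //       filename, 2, shape, perm, PrimaryType::kF64);
| //   ... iterate over coo->getElements() ...
| //   delete coo;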
| |
| /// Writes the sparse tensor to the file named by `dest` in extended FROSTT
| /// format.
| template <typename V> |
| static void outSparseTensor(void *tensor, void *dest, bool sort) { |
| assert(tensor && dest); |
| auto coo = static_cast<SparseTensorCOO<V> *>(tensor); |
| if (sort) |
| coo->sort(); |
| char *filename = static_cast<char *>(dest); |
| auto &dimSizes = coo->getDimSizes(); |
| auto &elements = coo->getElements(); |
| uint64_t rank = coo->getRank(); |
| uint64_t nnz = elements.size(); |
| std::fstream file; |
| file.open(filename, std::ios_base::out | std::ios_base::trunc); |
| assert(file.is_open()); |
| file << "; extended FROSTT format\n" << rank << " " << nnz << std::endl; |
| for (uint64_t r = 0; r < rank - 1; r++) |
| file << dimSizes[r] << " "; |
| file << dimSizes[rank - 1] << std::endl; |
| for (uint64_t i = 0; i < nnz; i++) { |
| auto &idx = elements[i].indices; |
| for (uint64_t r = 0; r < rank; r++) |
| file << (idx[r] + 1) << " "; |
| file << elements[i].value << std::endl; |
| } |
| file.flush(); |
| file.close(); |
| assert(file.good()); |
| } |
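| 
| // As a worked example of the format emitted above (a sketch; values are
| // printed with default std::fstream formatting): a 2x3 matrix with nonzeros
| // A[0][0] = 1.5 and A[1][2] = 2.5 is written as
| //
| //   ; extended FROSTT format
| //   2 2
| //   2 3
| //   1 1 1.5
| //   2 3 2.5
| //
| // i.e. a comment line, then "rank nnz", then the dimension sizes, then one
| // line per nonzero with 1-based indices followed by the value.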
| |
| /// Initializes a sparse tensor from an external COO-flavored format.
| template <typename V> |
| static SparseTensorStorage<uint64_t, uint64_t, V> * |
| toMLIRSparseTensor(uint64_t rank, uint64_t nse, uint64_t *shape, V *values, |
| uint64_t *indices, uint64_t *perm, uint8_t *sparse) { |
| const auto *sparsity = reinterpret_cast<const DimLevelType *>(sparse);
| #ifndef NDEBUG |
| // Verify that perm is a permutation of 0..(rank-1). |
| std::vector<uint64_t> order(perm, perm + rank); |
| std::sort(order.begin(), order.end()); |
| for (uint64_t i = 0; i < rank; ++i) |
| if (i != order[i]) |
| FATAL("Not a permutation of 0..%" PRIu64 "\n", rank); |
| |
| // Verify that the sparsity values are supported. |
| for (uint64_t i = 0; i < rank; ++i) |
| if (sparsity[i] != DimLevelType::kDense && |
| sparsity[i] != DimLevelType::kCompressed) |
| FATAL("Unsupported sparsity value %d\n", static_cast<int>(sparsity[i])); |
| #endif |
| |
| // Convert external format to internal COO. |
| auto *coo = SparseTensorCOO<V>::newSparseTensorCOO(rank, shape, perm, nse); |
| std::vector<uint64_t> idx(rank); |
| for (uint64_t i = 0, base = 0; i < nse; i++) { |
| for (uint64_t r = 0; r < rank; r++) |
| idx[perm[r]] = indices[base + r]; |
| coo->add(idx, values[i]); |
| base += rank; |
| } |
| // Return sparse tensor storage format as opaque pointer. |
| auto *tensor = SparseTensorStorage<uint64_t, uint64_t, V>::newSparseTensor( |
| rank, shape, perm, sparsity, coo); |
| delete coo; |
| return tensor; |
| } |
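| 
| // A sketch of the expected input layout (illustrative values; not part of
| // the library): the indices array holds nse groups of rank coordinates each,
| // in the same order as the values array. For a 3x3 double matrix with
| // nonzeros at (0,0) and (2,2), both dimensions compressed:
| //
| //   double values[] = {1.0, 5.0};
| //   uint64_t indices[] = {0, 0, 2, 2};
| //   uint64_t shape[] = {3, 3};
| //   uint64_t perm[] = {0, 1};
| //   uint8_t sparse[] = {static_cast<uint8_t>(DimLevelType::kCompressed),
| //                       static_cast<uint8_t>(DimLevelType::kCompressed)};
| //   auto *tensor =
| //       toMLIRSparseTensor<double>(2, 2, shape, values, indices, perm, sparse);
| //   ...
| //   delSparseTensor(tensor); // releases the storage (defined below)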
| |
| /// Converts a sparse tensor to an external COO-flavored format. |
| template <typename V> |
| static void fromMLIRSparseTensor(void *tensor, uint64_t *pRank, uint64_t *pNse, |
| uint64_t **pShape, V **pValues, |
| uint64_t **pIndices) { |
| assert(tensor); |
| auto sparseTensor = |
| static_cast<SparseTensorStorage<uint64_t, uint64_t, V> *>(tensor); |
| uint64_t rank = sparseTensor->getRank(); |
| std::vector<uint64_t> perm(rank); |
| std::iota(perm.begin(), perm.end(), 0); |
| SparseTensorCOO<V> *coo = sparseTensor->toCOO(perm.data()); |
| |
| const std::vector<Element<V>> &elements = coo->getElements(); |
| uint64_t nse = elements.size(); |
| |
| uint64_t *shape = new uint64_t[rank]; |
| for (uint64_t i = 0; i < rank; i++) |
| shape[i] = coo->getDimSizes()[i]; |
| |
| V *values = new V[nse]; |
| uint64_t *indices = new uint64_t[rank * nse]; |
| |
| for (uint64_t i = 0, base = 0; i < nse; i++) { |
| values[i] = elements[i].value; |
| for (uint64_t j = 0; j < rank; j++) |
| indices[base + j] = elements[i].indices[j]; |
| base += rank; |
| } |
| |
| delete coo; |
| *pRank = rank; |
| *pNse = nse; |
| *pShape = shape; |
| *pValues = values; |
| *pIndices = indices; |
| } |
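| 
| // The shape, values, and indices buffers returned above are allocated with
| // new[] and ownership passes to the caller. A caller-side sketch of the
| // matching cleanup (variable names are illustrative):
| //
| //   uint64_t rank, nse, *shape, *indices;
| //   double *values;
| //   fromMLIRSparseTensor<double>(tensor, &rank, &nse, &shape, &values,
| //                                &indices);
| //   ... consume shape/values/indices ...
| //   delete[] shape;
| //   delete[] values;
| //   delete[] indices;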
| |
| } // anonymous namespace |
| |
| extern "C" { |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // Public functions which operate on MLIR buffers (memrefs) to interact |
| // with sparse tensors (which are only visible as opaque pointers externally). |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #define CASE(p, i, v, P, I, V) \ |
| if (ptrTp == (p) && indTp == (i) && valTp == (v)) { \ |
| SparseTensorCOO<V> *coo = nullptr; \ |
| if (action <= Action::kFromCOO) { \ |
| if (action == Action::kFromFile) { \ |
| char *filename = static_cast<char *>(ptr); \ |
| coo = openSparseTensorCOO<V>(filename, rank, shape, perm, v); \ |
| } else if (action == Action::kFromCOO) { \ |
| coo = static_cast<SparseTensorCOO<V> *>(ptr); \ |
| } else { \ |
| assert(action == Action::kEmpty); \ |
| } \ |
| auto *tensor = SparseTensorStorage<P, I, V>::newSparseTensor( \ |
| rank, shape, perm, sparsity, coo); \ |
| if (action == Action::kFromFile) \ |
| delete coo; \ |
| return tensor; \ |
| } \ |
| if (action == Action::kSparseToSparse) { \ |
| auto *tensor = static_cast<SparseTensorStorageBase *>(ptr); \ |
| return SparseTensorStorage<P, I, V>::newSparseTensor(rank, shape, perm, \ |
| sparsity, tensor); \ |
| } \ |
| if (action == Action::kEmptyCOO) \ |
| return SparseTensorCOO<V>::newSparseTensorCOO(rank, shape, perm); \ |
| coo = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm); \ |
| if (action == Action::kToIterator) { \ |
| coo->startIterator(); \ |
| } else { \ |
| assert(action == Action::kToCOO); \ |
| } \ |
| return coo; \ |
| } |
| |
| #define CASE_SECSAME(p, v, P, V) CASE(p, p, v, P, P, V) |
| |
| // Assume index_type is in fact uint64_t, so that _mlir_ciface_newSparseTensor |
| // can safely rewrite kIndex to kU64. We make this assertion to guarantee |
| // that this file cannot get out of sync with its header. |
| static_assert(std::is_same<index_type, uint64_t>::value, |
| "Expected index_type == uint64_t"); |
| |
| void * |
| _mlir_ciface_newSparseTensor(StridedMemRefType<DimLevelType, 1> *aref, // NOLINT |
| StridedMemRefType<index_type, 1> *sref, |
| StridedMemRefType<index_type, 1> *pref, |
| OverheadType ptrTp, OverheadType indTp, |
| PrimaryType valTp, Action action, void *ptr) { |
| assert(aref && sref && pref); |
| assert(aref->strides[0] == 1 && sref->strides[0] == 1 && |
| pref->strides[0] == 1); |
| assert(aref->sizes[0] == sref->sizes[0] && sref->sizes[0] == pref->sizes[0]); |
| const DimLevelType *sparsity = aref->data + aref->offset; |
| const index_type *shape = sref->data + sref->offset; |
| const index_type *perm = pref->data + pref->offset; |
| uint64_t rank = aref->sizes[0]; |
| |
| // Rewrite kIndex to kU64, to avoid introducing a bunch of new cases. |
| // This is safe because of the static_assert above. |
| if (ptrTp == OverheadType::kIndex) |
| ptrTp = OverheadType::kU64; |
| if (indTp == OverheadType::kIndex) |
| indTp = OverheadType::kU64; |
| |
| // Double matrices with all combinations of overhead storage. |
| CASE(OverheadType::kU64, OverheadType::kU64, PrimaryType::kF64, uint64_t, |
| uint64_t, double); |
| CASE(OverheadType::kU64, OverheadType::kU32, PrimaryType::kF64, uint64_t, |
| uint32_t, double); |
| CASE(OverheadType::kU64, OverheadType::kU16, PrimaryType::kF64, uint64_t, |
| uint16_t, double); |
| CASE(OverheadType::kU64, OverheadType::kU8, PrimaryType::kF64, uint64_t, |
| uint8_t, double); |
| CASE(OverheadType::kU32, OverheadType::kU64, PrimaryType::kF64, uint32_t, |
| uint64_t, double); |
| CASE(OverheadType::kU32, OverheadType::kU32, PrimaryType::kF64, uint32_t, |
| uint32_t, double); |
| CASE(OverheadType::kU32, OverheadType::kU16, PrimaryType::kF64, uint32_t, |
| uint16_t, double); |
| CASE(OverheadType::kU32, OverheadType::kU8, PrimaryType::kF64, uint32_t, |
| uint8_t, double); |
| CASE(OverheadType::kU16, OverheadType::kU64, PrimaryType::kF64, uint16_t, |
| uint64_t, double); |
| CASE(OverheadType::kU16, OverheadType::kU32, PrimaryType::kF64, uint16_t, |
| uint32_t, double); |
| CASE(OverheadType::kU16, OverheadType::kU16, PrimaryType::kF64, uint16_t, |
| uint16_t, double); |
| CASE(OverheadType::kU16, OverheadType::kU8, PrimaryType::kF64, uint16_t, |
| uint8_t, double); |
| CASE(OverheadType::kU8, OverheadType::kU64, PrimaryType::kF64, uint8_t, |
| uint64_t, double); |
| CASE(OverheadType::kU8, OverheadType::kU32, PrimaryType::kF64, uint8_t, |
| uint32_t, double); |
| CASE(OverheadType::kU8, OverheadType::kU16, PrimaryType::kF64, uint8_t, |
| uint16_t, double); |
| CASE(OverheadType::kU8, OverheadType::kU8, PrimaryType::kF64, uint8_t, |
| uint8_t, double); |
| |
| // Float matrices with all combinations of overhead storage. |
| CASE(OverheadType::kU64, OverheadType::kU64, PrimaryType::kF32, uint64_t, |
| uint64_t, float); |
| CASE(OverheadType::kU64, OverheadType::kU32, PrimaryType::kF32, uint64_t, |
| uint32_t, float); |
| CASE(OverheadType::kU64, OverheadType::kU16, PrimaryType::kF32, uint64_t, |
| uint16_t, float); |
| CASE(OverheadType::kU64, OverheadType::kU8, PrimaryType::kF32, uint64_t, |
| uint8_t, float); |
| CASE(OverheadType::kU32, OverheadType::kU64, PrimaryType::kF32, uint32_t, |
| uint64_t, float); |
| CASE(OverheadType::kU32, OverheadType::kU32, PrimaryType::kF32, uint32_t, |
| uint32_t, float); |
| CASE(OverheadType::kU32, OverheadType::kU16, PrimaryType::kF32, uint32_t, |
| uint16_t, float); |
| CASE(OverheadType::kU32, OverheadType::kU8, PrimaryType::kF32, uint32_t, |
| uint8_t, float); |
| CASE(OverheadType::kU16, OverheadType::kU64, PrimaryType::kF32, uint16_t, |
| uint64_t, float); |
| CASE(OverheadType::kU16, OverheadType::kU32, PrimaryType::kF32, uint16_t, |
| uint32_t, float); |
| CASE(OverheadType::kU16, OverheadType::kU16, PrimaryType::kF32, uint16_t, |
| uint16_t, float); |
| CASE(OverheadType::kU16, OverheadType::kU8, PrimaryType::kF32, uint16_t, |
| uint8_t, float); |
| CASE(OverheadType::kU8, OverheadType::kU64, PrimaryType::kF32, uint8_t, |
| uint64_t, float); |
| CASE(OverheadType::kU8, OverheadType::kU32, PrimaryType::kF32, uint8_t, |
| uint32_t, float); |
| CASE(OverheadType::kU8, OverheadType::kU16, PrimaryType::kF32, uint8_t, |
| uint16_t, float); |
| CASE(OverheadType::kU8, OverheadType::kU8, PrimaryType::kF32, uint8_t, |
| uint8_t, float); |
| |
| // Two-byte floats with both overheads of the same type. |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kF16, uint64_t, f16); |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kBF16, uint64_t, bf16); |
| CASE_SECSAME(OverheadType::kU32, PrimaryType::kF16, uint32_t, f16); |
| CASE_SECSAME(OverheadType::kU32, PrimaryType::kBF16, uint32_t, bf16); |
| CASE_SECSAME(OverheadType::kU16, PrimaryType::kF16, uint16_t, f16); |
| CASE_SECSAME(OverheadType::kU16, PrimaryType::kBF16, uint16_t, bf16); |
| CASE_SECSAME(OverheadType::kU8, PrimaryType::kF16, uint8_t, f16); |
| CASE_SECSAME(OverheadType::kU8, PrimaryType::kBF16, uint8_t, bf16); |
| |
| // Integral matrices with both overheads of the same type. |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kI64, uint64_t, int64_t); |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kI32, uint64_t, int32_t); |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kI16, uint64_t, int16_t); |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kI8, uint64_t, int8_t); |
| CASE_SECSAME(OverheadType::kU32, PrimaryType::kI64, uint32_t, int64_t); |
| CASE_SECSAME(OverheadType::kU32, PrimaryType::kI32, uint32_t, int32_t); |
| CASE_SECSAME(OverheadType::kU32, PrimaryType::kI16, uint32_t, int16_t); |
| CASE_SECSAME(OverheadType::kU32, PrimaryType::kI8, uint32_t, int8_t); |
| CASE_SECSAME(OverheadType::kU16, PrimaryType::kI64, uint16_t, int64_t); |
| CASE_SECSAME(OverheadType::kU16, PrimaryType::kI32, uint16_t, int32_t); |
| CASE_SECSAME(OverheadType::kU16, PrimaryType::kI16, uint16_t, int16_t); |
| CASE_SECSAME(OverheadType::kU16, PrimaryType::kI8, uint16_t, int8_t); |
| CASE_SECSAME(OverheadType::kU8, PrimaryType::kI64, uint8_t, int64_t); |
| CASE_SECSAME(OverheadType::kU8, PrimaryType::kI32, uint8_t, int32_t); |
| CASE_SECSAME(OverheadType::kU8, PrimaryType::kI16, uint8_t, int16_t); |
| CASE_SECSAME(OverheadType::kU8, PrimaryType::kI8, uint8_t, int8_t); |
| |
| // Complex matrices with wide overhead. |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kC64, uint64_t, complex64); |
| CASE_SECSAME(OverheadType::kU64, PrimaryType::kC32, uint64_t, complex32); |
| |
| // Unsupported case (add above if needed). |
| // TODO: better pretty-printing of enum values! |
| FATAL("unsupported combination of types: <P=%d, I=%d, V=%d>\n", |
| static_cast<int>(ptrTp), static_cast<int>(indTp), |
| static_cast<int>(valTp)); |
| } |
| #undef CASE |
| #undef CASE_SECSAME |
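| 
| // A sketch (not normative; names, sizes, and the input file are assumptions)
| // of the memref packing performed when calling the entry point above, here
| // materializing a 2-D double tensor from the file named by the TENSOR0
| // environment variable, whose header must declare a matching 1024x1024
| // shape:
| //
| //   DimLevelType lvl[2] = {DimLevelType::kDense, DimLevelType::kCompressed};
| //   index_type shp[2] = {1024, 1024};
| //   index_type prm[2] = {0, 1};
| //   StridedMemRefType<DimLevelType, 1> aref = {lvl, lvl, 0, {2}, {1}};
| //   StridedMemRefType<index_type, 1> sref = {shp, shp, 0, {2}, {1}};
| //   StridedMemRefType<index_type, 1> pref = {prm, prm, 0, {2}, {1}};
| //   void *tensor = _mlir_ciface_newSparseTensor(
| //       &aref, &sref, &pref, OverheadType::kIndex, OverheadType::kIndex,
| //       PrimaryType::kF64, Action::kFromFile, getTensorFilename(0));
| //   ...
| //   delSparseTensor(tensor);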
| |
| #define IMPL_SPARSEVALUES(VNAME, V) \ |
| void _mlir_ciface_sparseValues##VNAME(StridedMemRefType<V, 1> *ref, \ |
| void *tensor) { \ |
| assert(ref && tensor); \
| std::vector<V> *v; \ |
| static_cast<SparseTensorStorageBase *>(tensor)->getValues(&v); \ |
| ref->basePtr = ref->data = v->data(); \ |
| ref->offset = 0; \ |
| ref->sizes[0] = v->size(); \ |
| ref->strides[0] = 1; \ |
| } |
| FOREVERY_V(IMPL_SPARSEVALUES) |
| #undef IMPL_SPARSEVALUES |
| |
| #define IMPL_GETOVERHEAD(NAME, TYPE, LIB) \ |
| void _mlir_ciface_##NAME(StridedMemRefType<TYPE, 1> *ref, void *tensor, \ |
| index_type d) { \ |
| assert(ref && tensor); \
| std::vector<TYPE> *v; \ |
| static_cast<SparseTensorStorageBase *>(tensor)->LIB(&v, d); \ |
| ref->basePtr = ref->data = v->data(); \ |
| ref->offset = 0; \ |
| ref->sizes[0] = v->size(); \ |
| ref->strides[0] = 1; \ |
| } |
| #define IMPL_SPARSEPOINTERS(PNAME, P) \ |
| IMPL_GETOVERHEAD(sparsePointers##PNAME, P, getPointers) |
| FOREVERY_O(IMPL_SPARSEPOINTERS) |
| #undef IMPL_SPARSEPOINTERS |
| |
| #define IMPL_SPARSEINDICES(INAME, I) \ |
| IMPL_GETOVERHEAD(sparseIndices##INAME, I, getIndices) |
| FOREVERY_O(IMPL_SPARSEINDICES) |
| #undef IMPL_SPARSEINDICES |
| #undef IMPL_GETOVERHEAD |
| |
| #define IMPL_ADDELT(VNAME, V) \ |
| void *_mlir_ciface_addElt##VNAME(void *coo, StridedMemRefType<V, 0> *vref, \ |
| StridedMemRefType<index_type, 1> *iref, \ |
| StridedMemRefType<index_type, 1> *pref) { \ |
| assert(coo && vref && iref && pref); \
| assert(iref->strides[0] == 1 && pref->strides[0] == 1); \ |
| assert(iref->sizes[0] == pref->sizes[0]); \ |
| const index_type *indx = iref->data + iref->offset; \ |
| const index_type *perm = pref->data + pref->offset; \ |
| uint64_t isize = iref->sizes[0]; \ |
| std::vector<index_type> indices(isize); \ |
| for (uint64_t r = 0; r < isize; r++) \ |
| indices[perm[r]] = indx[r]; \ |
| V *value = vref->data + vref->offset; \ |
| static_cast<SparseTensorCOO<V> *>(coo)->add(indices, *value); \ |
| return coo; \ |
| } |
| FOREVERY_V(IMPL_ADDELT) |
| #undef IMPL_ADDELT |
| |
| #define IMPL_GETNEXT(VNAME, V) \ |
| bool _mlir_ciface_getNext##VNAME(void *coo, \ |
| StridedMemRefType<index_type, 1> *iref, \ |
| StridedMemRefType<V, 0> *vref) { \ |
| assert(coo && iref && vref); \
| assert(iref->strides[0] == 1); \ |
| index_type *indx = iref->data + iref->offset; \ |
| V *value = vref->data + vref->offset; \ |
| const uint64_t isize = iref->sizes[0]; \ |
| const Element<V> *elem = \ |
| static_cast<SparseTensorCOO<V> *>(coo)->getNext(); \ |
| if (elem == nullptr) \ |
| return false; \ |
| for (uint64_t r = 0; r < isize; r++) \ |
| indx[r] = elem->indices[r]; \ |
| *value = elem->value; \ |
| return true; \ |
| } |
| FOREVERY_V(IMPL_GETNEXT) |
| #undef IMPL_GETNEXT |
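| 
| // The getNext wrappers above implement the iteration protocol set up by
| // Action::kToIterator (which already calls startIterator on the COO). A
| // C++-level sketch of the same loop, bypassing the memref packing, where
| // "coo" stands for an opaque pointer obtained via kToIterator:
| //
| //   auto *it = static_cast<SparseTensorCOO<double> *>(coo);
| //   while (const Element<double> *elem = it->getNext()) {
| //     // use elem->indices[0 .. rank-1] and elem->value
| //   }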
| |
| #define IMPL_LEXINSERT(VNAME, V) \ |
| void _mlir_ciface_lexInsert##VNAME(void *tensor, \ |
| StridedMemRefType<index_type, 1> *cref, \ |
| StridedMemRefType<V, 0> *vref) { \ |
| assert(tensor && cref && vref); \
| assert(cref->strides[0] == 1); \ |
| index_type *cursor = cref->data + cref->offset; \ |
| assert(cursor); \ |
| V *value = vref->data + vref->offset; \ |
| static_cast<SparseTensorStorageBase *>(tensor)->lexInsert(cursor, *value); \ |
| } |
| FOREVERY_V(IMPL_LEXINSERT) |
| #undef IMPL_LEXINSERT |
| |
| #define IMPL_EXPINSERT(VNAME, V) \ |
| void _mlir_ciface_expInsert##VNAME( \ |
| void *tensor, StridedMemRefType<index_type, 1> *cref, \ |
| StridedMemRefType<V, 1> *vref, StridedMemRefType<bool, 1> *fref, \ |
| StridedMemRefType<index_type, 1> *aref, index_type count) { \ |
| assert(tensor && cref && vref && fref && aref); \
| assert(cref->strides[0] == 1); \ |
| assert(vref->strides[0] == 1); \ |
| assert(fref->strides[0] == 1); \ |
| assert(aref->strides[0] == 1); \ |
| assert(vref->sizes[0] == fref->sizes[0]); \ |
| index_type *cursor = cref->data + cref->offset; \ |
| V *values = vref->data + vref->offset; \ |
| bool *filled = fref->data + fref->offset; \ |
| index_type *added = aref->data + aref->offset; \ |
| static_cast<SparseTensorStorageBase *>(tensor)->expInsert( \ |
| cursor, values, filled, added, count); \ |
| } |
| FOREVERY_V(IMPL_EXPINSERT) |
| #undef IMPL_EXPINSERT |
| |
| //===----------------------------------------------------------------------===// |
| // |
| // Public functions which accept only C-style data structures to interact |
| // with sparse tensors (which are only visible as opaque pointers externally). |
| // |
| //===----------------------------------------------------------------------===// |
| |
| index_type sparseDimSize(void *tensor, index_type d) { |
| return static_cast<SparseTensorStorageBase *>(tensor)->getDimSize(d); |
| } |
| |
| void endInsert(void *tensor) { |
| return static_cast<SparseTensorStorageBase *>(tensor)->endInsert(); |
| } |
| |
| #define IMPL_OUTSPARSETENSOR(VNAME, V) \ |
| void outSparseTensor##VNAME(void *coo, void *dest, bool sort) { \ |
| return outSparseTensor<V>(coo, dest, sort); \ |
| } |
| FOREVERY_V(IMPL_OUTSPARSETENSOR) |
| #undef IMPL_OUTSPARSETENSOR |
| |
| void delSparseTensor(void *tensor) { |
| delete static_cast<SparseTensorStorageBase *>(tensor); |
| } |
| |
| #define IMPL_DELCOO(VNAME, V) \ |
| void delSparseTensorCOO##VNAME(void *coo) { \ |
| delete static_cast<SparseTensorCOO<V> *>(coo); \ |
| } |
| FOREVERY_V(IMPL_DELCOO) |
| #undef IMPL_DELCOO |
| |
| char *getTensorFilename(index_type id) { |
| char var[80]; |
| sprintf(var, "TENSOR%" PRIu64, id); |
| char *env = getenv(var); |
| if (!env) |
| FATAL("Environment variable %s is not set\n", var); |
| return env; |
| } |
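| 
| // For example (a hypothetical set-up), with TENSOR0=/data/wave.mtx in the
| // environment, getTensorFilename(0) returns "/data/wave.mtx"; an unset
| // variable is a fatal error rather than a silent fallback.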
| |
| void readSparseTensorShape(char *filename, std::vector<uint64_t> *out) { |
| assert(out && "Received nullptr for out-parameter"); |
| SparseTensorFile stfile(filename); |
| stfile.openFile(); |
| stfile.readHeader(); |
| stfile.closeFile(); |
| const uint64_t rank = stfile.getRank(); |
| const uint64_t *dimSizes = stfile.getDimSizes(); |
| out->reserve(rank); |
| out->assign(dimSizes, dimSizes + rank); |
| } |
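| 
| // A sketch of typical use (the output vector is overwritten, not appended
| // to):
| //
| //   std::vector<uint64_t> shape;
| //   readSparseTensorShape(getTensorFilename(0), &shape);
| //   // shape.size() now equals the rank declared in the file header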
| |
| // TODO: generalize beyond 64-bit indices. |
| #define IMPL_CONVERTTOMLIRSPARSETENSOR(VNAME, V) \ |
| void *convertToMLIRSparseTensor##VNAME( \ |
| uint64_t rank, uint64_t nse, uint64_t *shape, V *values, \ |
| uint64_t *indices, uint64_t *perm, uint8_t *sparse) { \ |
| return toMLIRSparseTensor<V>(rank, nse, shape, values, indices, perm, \ |
| sparse); \ |
| } |
| FOREVERY_V(IMPL_CONVERTTOMLIRSPARSETENSOR) |
| #undef IMPL_CONVERTTOMLIRSPARSETENSOR |
| |
| // TODO: Currently, values are copied from SparseTensorStorage to |
| // SparseTensorCOO, then to the output. We may want to reduce the number |
| // of copies. |
| // |
| // TODO: generalize beyond 64-bit indices, no dim ordering, all dimensions |
| // compressed |
| #define IMPL_CONVERTFROMMLIRSPARSETENSOR(VNAME, V) \ |
| void convertFromMLIRSparseTensor##VNAME(void *tensor, uint64_t *pRank, \ |
| uint64_t *pNse, uint64_t **pShape, \ |
| V **pValues, uint64_t **pIndices) { \ |
| fromMLIRSparseTensor<V>(tensor, pRank, pNse, pShape, pValues, pIndices); \ |
| } |
| FOREVERY_V(IMPL_CONVERTFROMMLIRSPARSETENSOR) |
| #undef IMPL_CONVERTFROMMLIRSPARSETENSOR |
| |
| } // extern "C" |
| |
| #endif // MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS |