update with the file we use for benchmarking
diff --git a/enzyme/test/Integration/Sparse/eigen_analysis.cpp b/enzyme/test/Integration/Sparse/eigen_analysis.cpp
index 434ebcf..366607d 100644
--- a/enzyme/test/Integration/Sparse/eigen_analysis.cpp
+++ b/enzyme/test/Integration/Sparse/eigen_analysis.cpp
@@ -101,7 +101,7 @@
 // Calculate total energy for all faces in 3D
 template<typename T>
 __attribute__((always_inline))
-static T eigenstuffL(const T *__restrict__ x, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ pos0) {
+static T eigenstuffL(const T *__restrict__ pos0, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ x) {
     T sum = 0;
     __builtin_assume(num_faces != 0);
     for (size_t idx=0; idx<num_faces; idx++) {
@@ -143,7 +143,7 @@
 __attribute__((always_inline))
 static void gradient_ip(const T *__restrict__ pos0, const size_t num_faces, const int* faces, const T *__restrict__ x, T *__restrict__ out)
 {
-    __enzyme_autodiff<void>((void *)eigenstuffM<T>,
+    __enzyme_autodiff<void>((void *)eigenstuffL<T>,
                             enzyme_const, pos0,
                             enzyme_const, num_faces,
                             enzyme_const, faces,
@@ -206,10 +206,10 @@
 
     // Call eigenstuffM_simple
     struct timeval start, end;
-    gettimeofday(&start, NULL);
-    const float resultM = eigenstuffM(pos0, num_faces, faces, x);
-    gettimeofday(&end, NULL);
-    printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end));
+    // gettimeofday(&start, NULL);
+    // const float resultM = eigenstuffM(pos0, num_faces, faces, x);
+    // gettimeofday(&end, NULL);
+    // printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end));
 
     // Call eigenstuffL_simple
     gettimeofday(&start, NULL);
@@ -222,10 +222,10 @@
         dx[i] = 0;
     gradient_ip(pos0, num_faces, faces, x, dx);
 
-    if (x_pts < 30) {
-    for (size_t i=0; i<sizeof(dx)/sizeof(dx[0]); i++)
-        printf("eigenstuffM grad_vert[%zu]=%f\n", i, dx[i]);
-    }
+    // if (x_pts < 30) {
+    // for (size_t i=0; i<sizeof(dx)/sizeof(dx[0]); i++)
+    //     printf("eigenstuffM grad_vert[%zu]=%f\n", i, dx[i]);
+    // }
 
     gettimeofday(&start, NULL);
     auto hess_x = hessian(pos0, num_faces, faces, x, x_pts);
@@ -235,10 +235,10 @@
   
     printf("Runtime %0.6f\n", tdiff(&start, &end));
 
-    if (x_pts <= 8)
-    for (auto &hess : hess_x) {
-        printf("i=%lu, j=%lu, val=%f\n", hess.row, hess.col, hess.val);
-    }
+    // if (x_pts <= 8)
+    // for (auto &hess : hess_x) {
+    //     printf("i=%lu, j=%lu, val=%f\n", hess.row, hess.col, hess.val);
+    // }
 
     return 0;
 }
diff --git a/enzyme/test/Integration/Sparse/matrix.h b/enzyme/test/Integration/Sparse/matrix.h
index c542c70..c9607c0 100644
--- a/enzyme/test/Integration/Sparse/matrix.h
+++ b/enzyme/test/Integration/Sparse/matrix.h
@@ -16,7 +16,7 @@
 };
 
 __attribute__((enzyme_sparse_accumulate))
-static void inner_storeflt(int64_t row, int64_t col, float val, std::vector<Triple<float>> &triplets) {
+static void inner_storeflt(size_t row, size_t col, float val, std::vector<Triple<float>> &triplets) {
 #ifdef BENCHMARK
     if (val == 0.0) return;
 #else
@@ -26,9 +26,9 @@
 }
 
 __attribute__((enzyme_sparse_accumulate))
-static void inner_storedbl(int64_t row, int64_t col, double val, std::vector<Triple<double>> &triplets) {
+static void inner_storedbl(size_t row, size_t col, double val, std::vector<Triple<double>> &triplets) {
 #ifdef BENCHMARK
-    if (val == 0.0) return;
+    if (val > -1e-10 && val < 1e-10) return;
 #else
 #warning "Compiling for debug/verfication, performance may be slowed"
 #endif
@@ -37,7 +37,7 @@
 
 template<typename T>
 __attribute__((always_inline))
-static void sparse_store(T val, int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
+static void sparse_store(T val, size_t idx, size_t i, std::vector<Triple<T>> &triplets) {
     if (val == 0.0) return;
     idx /= sizeof(T);
     if constexpr (sizeof(T) == 4)
@@ -48,19 +48,57 @@
 
 template<typename T>
 __attribute__((always_inline))
-static T sparse_load(int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
+static T sparse_load(size_t idx, size_t i, std::vector<Triple<T>> &triplets) {
+    return 0.0;
+}
+
+
+__attribute__((enzyme_sparse_accumulate))
+static void inner_storeflt_modn(size_t row, size_t col, float val, size_t N, std::vector<Triple<float>> &triplets) {
+#ifdef BENCHMARK
+    if (val == 0.0) return;
+#else
+#warning "Compiling for debug/verfication, performance may be slowed"
+#endif
+    triplets.emplace_back(row, col % N, val);
+}
+
+__attribute__((enzyme_sparse_accumulate))
+static void inner_storedbl_modn(size_t row, size_t col, double val, size_t N, std::vector<Triple<double>> &triplets) {
+#ifdef BENCHMARK
+    if (val > -1e-10 && val < 1e-10) return;
+#else
+#warning "Compiling for debug/verfication, performance may be slowed"
+#endif
+    triplets.emplace_back(row, col % N, val);
+}
+
+template<typename T>
+__attribute__((always_inline))
+static void sparse_store_modn(T val, size_t idx, size_t i, size_t N, std::vector<Triple<T>> &triplets) {
+    if (val == 0.0) return;
+    idx /= sizeof(T);
+    if constexpr (sizeof(T) == 4)
+      inner_storeflt_modn(i, idx, val, N, triplets);
+    else
+      inner_storedbl_modn(i, idx, val, N, triplets);
+}
+
+template<typename T>
+__attribute__((always_inline))
+static T sparse_load_modn(int64_t idx, size_t i, size_t N, std::vector<Triple<T>> &triplets) {
     return 0.0;
 }
 
 template<typename T>
 __attribute__((always_inline))
-static void ident_store(T, int64_t idx, size_t i) {
+static void ident_store(T, size_t idx, size_t i) {
     assert(0 && "should never load");
 }
 
 template<typename T>
 __attribute__((always_inline))
-static T ident_load(int64_t idx, size_t i) {
+static T ident_load(size_t idx, size_t i) {
     idx /= sizeof(T);
     return (T)(idx == i);// ? 1.0 : 0.0;
 }
diff --git a/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp b/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp
index cae8bda..1c2cd10 100644
--- a/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp
+++ b/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp
@@ -7,6 +7,9 @@
 // TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0  -std=c++11 -O2 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi
 // TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0  -std=c++11 -O3 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi
 
+// /usr/local/Cellar/llvm@15/15.0.7/bin/clang++ -fno-exceptions  -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O3 /Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp -fpass-plugin=/Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/build15D/Enzyme/ClangEnzyme-15.dylib -Xclang -load -Xclang /Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/build15D/Enzyme/ClangEnzyme-15.dylib -mllvm -enzyme-auto-sparsity=1 -o ringspring3Drestlengthone.out
+
+
 #include <stdio.h>
 #include <assert.h>
 #include <vector>
@@ -21,7 +24,7 @@
 static double f(size_t N, T* __restrict__ pos) {
     double e = 0.;
     __builtin_assume(N != 0);
-    for (size_t j = 0; j < N/3; j ++) {
+    for (size_t j = 0; j < N; j ++) {
         size_t i = 3 * j;
         T vx = pos[i];
         T vy = pos[i + 1];
@@ -49,9 +52,10 @@
     assert(0 && "this is a read only input, why are you storing here...");
 }
 
+template<typename T>
 __attribute__((always_inline))
-static double mod_load(int64_t idx, double* input, size_t N) {
-    idx /= sizeof(double);
+static T mod_load(int64_t idx, T* input, size_t N) {
+    idx /= sizeof(T);
     return input[idx % N];
 }
 
@@ -59,12 +63,12 @@
 __attribute__((noinline))
 std::vector<Triple<T>> hess_f(size_t N, T* input) {
     std::vector<Triple<T>> triplets;
-    // input = __enzyme_todense((void*)mod_load, (void*)never_store, input, N);
+    input = __enzyme_post_sparse_todense<T*>((void*)mod_load<T>, (void*)never_store<T>, input, N);
     __builtin_assume(N > 0);
     for (size_t i=0; i<N; i++) {
         __builtin_assume(i < 100000000);
         T* d_input = __enzyme_todense<T*>((void*)ident_load<T>, (void*)ident_store<T>, i);
-        T* d_dinput = __enzyme_todense<T*>((void*)sparse_load<T>, (void*)sparse_store<T>, i, &triplets);
+        T* d_dinput = __enzyme_todense<T*>((void*)sparse_load_modn<T>, (void*)sparse_store_modn<T>, i, N, &triplets);
 
        __enzyme_fwddiff<void>((void*)grad_f<T>, 
                             enzyme_const, N,
@@ -85,6 +89,62 @@
 }
 */
 
+template<typename T>
+__attribute__((noinline))
+std::vector<Triple<T>> handrolled_hess_f(size_t n, T* data) {
+    std::vector<Triple<T>> triplets;
+    int k = 3 * n;
+    for (int i = 0; i < n; ++i) {
+        int row = 3 * i;
+        double x = data[row % k];
+        double y = data[(row + 1) % k];
+        double z = data[(row + 2) % k];
+        double a = data[(row + 3) % k];
+        double b = data[(row + 4) % k];
+        double c = data[(row + 5) % k];
+
+        double f = pow(x - a, 2) + pow(y - b, 2) + pow(z - c, 2);
+        // double f = (x - a) * (x - a) + (y - b) * (y - b) + (z - c) * (z - c);
+        double g = 2 * (1 - 1 / sqrt(f));
+
+        // Diagonal terms
+        #pragma clang loop unroll(full)
+        for (int j = 0; j < 6; ++j) {
+            T val = g;
+#ifdef BENCHMARK
+            if (val > -1e-10 && val < 1e-10) continue;
+#endif
+            triplets.emplace_back((row + j) % k, (row + j) % k, val);
+        }
+
+        #pragma clang loop unroll(full)
+        for (int j = 0; j < 6; ++j) {
+            T val = -g;
+#ifdef BENCHMARK
+            if (val > -1e-10 && val < 1e-10) continue;
+#endif
+            triplets.emplace_back((row + ((j + 3) % 6)) % k, (row + j) % k, val);
+        }
+
+        // Cross terms
+        std::vector<double> half_grad_f = {x - a, y - b, z - c, a - x, b - y, c - z};
+        // double half_grad_f[] = {x - a, y - b, z - c, a - x, b - y, c - z};
+        #pragma clang loop unroll(full)
+        for (int j = 0; j < 6; ++j) {
+            #pragma clang loop unroll(full)
+            for (int l = 0; l < 6; ++l) {
+                T val = 2 * half_grad_f[j] * half_grad_f[l] * pow(f, -1.5);
+                // T val = 2 * half_grad_f[j] * half_grad_f[l] / ( f * sqrt(f) );
+#ifdef BENCHMARK
+                if (val > -1e-10 && val < 1e-10) continue;
+#endif
+                triplets.emplace_back((row + j) % k, (row + l) % k, val);
+            }
+        }
+    }
+    return triplets;
+}
+
 // int argc, char** argv
 int main(int argc, char** argv) {
     size_t N = 30;
@@ -93,7 +153,7 @@
          N = atoi(argv[1]);
     }
 
-    double *x = (double*)malloc(sizeof(double) * 3 * N);
+    double *x = (double*)malloc(sizeof(double) * (3 * N + 3));
     for (int i = 0; i < N; ++i) {
         double angle = 2 * M_PI * i / N;
         x[3 * i] = cos(angle) ;//+ normal(generator);
@@ -101,8 +161,10 @@
         x[3 * i + 2] = 0;//normal(generator);
     }
 
-
+    for (int i=0; i<10; i++) 
+    {
   struct timeval start, end;
+
   gettimeofday(&start, NULL);
   
   auto res = hess_f(N, x);
@@ -118,6 +180,14 @@
       printf("%ld, %ld = %f\n", tup.row, tup.col, tup.val);
   }
 
+  gettimeofday(&start, NULL);
+  auto hand_res = handrolled_hess_f(N, x);
+  gettimeofday(&end, NULL);
+  
+  printf("Handrolled Number of elements %ld\n", hand_res.size());
+  printf(" Runtime %0.6f\n", tdiff(&start, &end));
+    }
+
   return 0;
 }