update with the file we use for benchmarking
diff --git a/enzyme/test/Integration/Sparse/eigen_analysis.cpp b/enzyme/test/Integration/Sparse/eigen_analysis.cpp
index 434ebcf..366607d 100644
--- a/enzyme/test/Integration/Sparse/eigen_analysis.cpp
+++ b/enzyme/test/Integration/Sparse/eigen_analysis.cpp
@@ -101,7 +101,7 @@
// Calculate total energy for all faces in 3D
template<typename T>
__attribute__((always_inline))
-static T eigenstuffL(const T *__restrict__ x, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ pos0) {
+static T eigenstuffL(const T *__restrict__ pos0, size_t num_faces, const int *__restrict__ faces, const T *__restrict__ x) {
T sum = 0;
__builtin_assume(num_faces != 0);
for (size_t idx=0; idx<num_faces; idx++) {
@@ -143,7 +143,7 @@
__attribute__((always_inline))
static void gradient_ip(const T *__restrict__ pos0, const size_t num_faces, const int* faces, const T *__restrict__ x, T *__restrict__ out)
{
- __enzyme_autodiff<void>((void *)eigenstuffM<T>,
+ __enzyme_autodiff<void>((void *)eigenstuffL<T>,
enzyme_const, pos0,
enzyme_const, num_faces,
enzyme_const, faces,
@@ -206,10 +206,10 @@
// Call eigenstuffM_simple
struct timeval start, end;
- gettimeofday(&start, NULL);
- const float resultM = eigenstuffM(pos0, num_faces, faces, x);
- gettimeofday(&end, NULL);
- printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end));
+ // gettimeofday(&start, NULL);
+ // const float resultM = eigenstuffM(pos0, num_faces, faces, x);
+ // gettimeofday(&end, NULL);
+ // printf("Result for eigenstuffM_simple: %f, runtime:%f\n", resultM, tdiff(&start, &end));
// Call eigenstuffL_simple
gettimeofday(&start, NULL);
@@ -222,10 +222,10 @@
dx[i] = 0;
gradient_ip(pos0, num_faces, faces, x, dx);
- if (x_pts < 30) {
- for (size_t i=0; i<sizeof(dx)/sizeof(dx[0]); i++)
- printf("eigenstuffM grad_vert[%zu]=%f\n", i, dx[i]);
- }
+ // if (x_pts < 30) {
+ // for (size_t i=0; i<sizeof(dx)/sizeof(dx[0]); i++)
+ // printf("eigenstuffM grad_vert[%zu]=%f\n", i, dx[i]);
+ // }
gettimeofday(&start, NULL);
auto hess_x = hessian(pos0, num_faces, faces, x, x_pts);
@@ -235,10 +235,10 @@
printf("Runtime %0.6f\n", tdiff(&start, &end));
- if (x_pts <= 8)
- for (auto &hess : hess_x) {
- printf("i=%lu, j=%lu, val=%f\n", hess.row, hess.col, hess.val);
- }
+ // if (x_pts <= 8)
+ // for (auto &hess : hess_x) {
+ // printf("i=%lu, j=%lu, val=%f\n", hess.row, hess.col, hess.val);
+ // }
return 0;
}
diff --git a/enzyme/test/Integration/Sparse/matrix.h b/enzyme/test/Integration/Sparse/matrix.h
index c542c70..c9607c0 100644
--- a/enzyme/test/Integration/Sparse/matrix.h
+++ b/enzyme/test/Integration/Sparse/matrix.h
@@ -16,7 +16,7 @@
};
__attribute__((enzyme_sparse_accumulate))
-static void inner_storeflt(int64_t row, int64_t col, float val, std::vector<Triple<float>> &triplets) {
+static void inner_storeflt(size_t row, size_t col, float val, std::vector<Triple<float>> &triplets) {
#ifdef BENCHMARK
if (val == 0.0) return;
#else
@@ -26,9 +26,9 @@
}
__attribute__((enzyme_sparse_accumulate))
-static void inner_storedbl(int64_t row, int64_t col, double val, std::vector<Triple<double>> &triplets) {
+static void inner_storedbl(size_t row, size_t col, double val, std::vector<Triple<double>> &triplets) {
#ifdef BENCHMARK
- if (val == 0.0) return;
+ if (val > -1e-10 && val < 1e-10) return;
#else
#warning "Compiling for debug/verfication, performance may be slowed"
#endif
@@ -37,7 +37,7 @@
template<typename T>
__attribute__((always_inline))
-static void sparse_store(T val, int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
+static void sparse_store(T val, size_t idx, size_t i, std::vector<Triple<T>> &triplets) {
if (val == 0.0) return;
idx /= sizeof(T);
if constexpr (sizeof(T) == 4)
@@ -48,19 +48,57 @@
template<typename T>
__attribute__((always_inline))
-static T sparse_load(int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
+static T sparse_load(size_t idx, size_t i, std::vector<Triple<T>> &triplets) {
+ return 0.0;
+}
+
+
+__attribute__((enzyme_sparse_accumulate))
+static void inner_storeflt_modn(size_t row, size_t col, float val, size_t N, std::vector<Triple<float>> &triplets) {
+#ifdef BENCHMARK
+ if (val == 0.0) return;
+#else
+#warning "Compiling for debug/verfication, performance may be slowed"
+#endif
+ triplets.emplace_back(row, col % N, val);
+}
+
+__attribute__((enzyme_sparse_accumulate))
+static void inner_storedbl_modn(size_t row, size_t col, double val, size_t N, std::vector<Triple<double>> &triplets) {
+#ifdef BENCHMARK
+ if (val > -1e-10 && val < 1e-10) return;
+#else
+#warning "Compiling for debug/verfication, performance may be slowed"
+#endif
+ triplets.emplace_back(row, col % N, val);
+}
+
+template<typename T>
+__attribute__((always_inline))
+static void sparse_store_modn(T val, size_t idx, size_t i, size_t N, std::vector<Triple<T>> &triplets) {
+ if (val == 0.0) return;
+ idx /= sizeof(T);
+ if constexpr (sizeof(T) == 4)
+ inner_storeflt_modn(i, idx, val, N, triplets);
+ else
+ inner_storedbl_modn(i, idx, val, N, triplets);
+}
+
+template<typename T>
+__attribute__((always_inline))
+static T sparse_load_modn(int64_t idx, size_t i, size_t N, std::vector<Triple<T>> &triplets) {
return 0.0;
}
template<typename T>
__attribute__((always_inline))
-static void ident_store(T, int64_t idx, size_t i) {
+static void ident_store(T, size_t idx, size_t i) {
assert(0 && "should never load");
}
template<typename T>
__attribute__((always_inline))
-static T ident_load(int64_t idx, size_t i) {
+static T ident_load(size_t idx, size_t i) {
idx /= sizeof(T);
return (T)(idx == i);// ? 1.0 : 0.0;
}
diff --git a/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp b/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp
index cae8bda..1c2cd10 100644
--- a/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp
+++ b/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp
@@ -7,6 +7,9 @@
// TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O2 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi
// TODO: if [ %llvmver -ge 12 ]; then %clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O3 %s -S -emit-llvm -o - %newLoadClangEnzyme -mllvm -enzyme-auto-sparsity=1 -S | %lli - ; fi
+// /usr/local/Cellar/llvm@15/15.0.7/bin/clang++ -fno-exceptions -ffast-math -mllvm -enable-load-pre=0 -std=c++11 -O3 /Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/test/Integration/Sparse/ringspring3Drestlengthone.cpp -fpass-plugin=/Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/build15D/Enzyme/ClangEnzyme-15.dylib -Xclang -load -Xclang /Users/jessemichel/research/sparse_project/benchmark_enzyme/Enzyme/enzyme/build15D/Enzyme/ClangEnzyme-15.dylib -mllvm -enzyme-auto-sparsity=1 -o ringspring3Drestlengthone.out
+
+
#include <stdio.h>
#include <assert.h>
#include <vector>
@@ -21,7 +24,7 @@
static double f(size_t N, T* __restrict__ pos) {
double e = 0.;
__builtin_assume(N != 0);
- for (size_t j = 0; j < N/3; j ++) {
+ for (size_t j = 0; j < N; j ++) {
size_t i = 3 * j;
T vx = pos[i];
T vy = pos[i + 1];
@@ -49,9 +52,10 @@
assert(0 && "this is a read only input, why are you storing here...");
}
+template<typename T>
__attribute__((always_inline))
-static double mod_load(int64_t idx, double* input, size_t N) {
- idx /= sizeof(double);
+static T mod_load(int64_t idx, T* input, size_t N) {
+ idx /= sizeof(T);
return input[idx % N];
}
@@ -59,12 +63,12 @@
__attribute__((noinline))
std::vector<Triple<T>> hess_f(size_t N, T* input) {
std::vector<Triple<T>> triplets;
- // input = __enzyme_todense((void*)mod_load, (void*)never_store, input, N);
+ input = __enzyme_post_sparse_todense<T*>((void*)mod_load<T>, (void*)never_store<T>, input, N);
__builtin_assume(N > 0);
for (size_t i=0; i<N; i++) {
__builtin_assume(i < 100000000);
T* d_input = __enzyme_todense<T*>((void*)ident_load<T>, (void*)ident_store<T>, i);
- T* d_dinput = __enzyme_todense<T*>((void*)sparse_load<T>, (void*)sparse_store<T>, i, &triplets);
+ T* d_dinput = __enzyme_todense<T*>((void*)sparse_load_modn<T>, (void*)sparse_store_modn<T>, i, N, &triplets);
__enzyme_fwddiff<void>((void*)grad_f<T>,
enzyme_const, N,
@@ -85,6 +89,62 @@
}
*/
+template<typename T>
+__attribute__((noinline))
+std::vector<Triple<T>> handrolled_hess_f(size_t n, T* data) {
+ std::vector<Triple<T>> triplets;
+ int k = 3 * n;
+ for (int i = 0; i < n; ++i) {
+ int row = 3 * i;
+ double x = data[row % k];
+ double y = data[(row + 1) % k];
+ double z = data[(row + 2) % k];
+ double a = data[(row + 3) % k];
+ double b = data[(row + 4) % k];
+ double c = data[(row + 5) % k];
+
+ double f = pow(x - a, 2) + pow(y - b, 2) + pow(z - c, 2);
+ // double f = (x - a) * (x - a) + (y - b) * (y - b) + (z - c) * (z - c);
+ double g = 2 * (1 - 1 / sqrt(f));
+
+ // Diagonal terms
+ #pragma clang loop unroll(full)
+ for (int j = 0; j < 6; ++j) {
+ T val = g;
+#ifdef BENCHMARK
+ if (val > -1e-10 && val < 1e-10) continue;
+#endif
+ triplets.emplace_back((row + j) % k, (row + j) % k, val);
+ }
+
+ #pragma clang loop unroll(full)
+ for (int j = 0; j < 6; ++j) {
+ T val = -g;
+#ifdef BENCHMARK
+ if (val > -1e-10 && val < 1e-10) continue;
+#endif
+ triplets.emplace_back((row + ((j + 3) % 6)) % k, (row + j) % k, val);
+ }
+
+ // Cross terms
+ std::vector<double> half_grad_f = {x - a, y - b, z - c, a - x, b - y, c - z};
+ // double half_grad_f[] = {x - a, y - b, z - c, a - x, b - y, c - z};
+ #pragma clang loop unroll(full)
+ for (int j = 0; j < 6; ++j) {
+ #pragma clang loop unroll(full)
+ for (int l = 0; l < 6; ++l) {
+ T val = 2 * half_grad_f[j] * half_grad_f[l] * pow(f, -1.5);
+ // T val = 2 * half_grad_f[j] * half_grad_f[l] / ( f * sqrt(f) );
+#ifdef BENCHMARK
+ if (val > -1e-10 && val < 1e-10) continue;
+#endif
+ triplets.emplace_back((row + j) % k, (row + l) % k, val);
+ }
+ }
+ }
+ return triplets;
+}
+
// int argc, char** argv
int main(int argc, char** argv) {
size_t N = 30;
@@ -93,7 +153,7 @@
N = atoi(argv[1]);
}
- double *x = (double*)malloc(sizeof(double) * 3 * N);
+ double *x = (double*)malloc(sizeof(double) * (3 * N + 3));
for (int i = 0; i < N; ++i) {
double angle = 2 * M_PI * i / N;
x[3 * i] = cos(angle) ;//+ normal(generator);
@@ -101,8 +161,10 @@
x[3 * i + 2] = 0;//normal(generator);
}
-
+ for (int i=0; i<10; i++)
+ {
struct timeval start, end;
+
gettimeofday(&start, NULL);
auto res = hess_f(N, x);
@@ -118,6 +180,14 @@
printf("%ld, %ld = %f\n", tup.row, tup.col, tup.val);
}
+ gettimeofday(&start, NULL);
+ auto hand_res = handrolled_hess_f(N, x);
+ gettimeofday(&end, NULL);
+
+ printf("Handrolled Number of elements %ld\n", hand_res.size());
+ printf(" Runtime %0.6f\n", tdiff(&start, &end));
+ }
+
return 0;
}