various cleanups
diff --git a/enzyme/benchmarks/ReverseMode/adbench/Makefile.config b/enzyme/benchmarks/ReverseMode/adbench/Makefile.config
new file mode 100644
index 0000000..c620d4a
--- /dev/null
+++ b/enzyme/benchmarks/ReverseMode/adbench/Makefile.config
@@ -0,0 +1,9 @@
+CLANG := /home/manuel/prog/rust-middle/build/x86_64-unknown-linux-gnu/llvm/build/bin/clang++
+OPT := /home/manuel/prog/rust-middle/build/x86_64-unknown-linux-gnu/llvm/build/bin/opt
+
+PASSES1 := verify,annotation2metadata,forceattrs,inferattrs,coro-early,function<eager-inv>(ee-instrument<>,lower-expect,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sroa<modify-cfg>,early-cse<>,callsite-splitting),openmp-opt,ipsccp,called-value-propagation,globalopt,function<eager-inv>(mem2reg,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>),always-inline,require<globals-aa>,function(invalidate<aa>),require<profile-summary>,cgscc(devirt<4>(inline,function-attrs<skip-non-recursive-function-attrs>,argpromotion,openmp-opt-cgscc,function<eager-inv;no-rerun>(sroa<modify-cfg>,early-cse<memssa>,speculative-execution<only-if-divergent-target>,jump-threading,correlated-propagation,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine,libcalls-shrinkwrap,tailcallelim,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,reassociate,constraint-elimination,loop-mssa(loop-instsimplify,loop-simplifycfg,licm<no-allowspeculation>,loop-rotate<header-duplication;prepare-for-lto>,licm<allowspeculation>,simple-loop-unswitch<nontrivial;t
rivial>),simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>,loop(loop-idiom,indvars,extra-simple-loop-unswitch-passes,loop-deletion,loop-unroll-full),sroa<modify-cfg>,vector-combine,mldst-motion<no-split-footer-bb>,gvn<>,sccp,bdce,instcombine<max-iterations=1;no-verify-fixpoint>,jump-threading,correlated-propagation,adce,memcpyopt,dse,move-auto-init,loop-mssa(licm<allowspeculation>),coro-elide,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,instcombine<max-iterations=1;no-verify-fixpoint>),function-attrs,function(require<should-not-run-function-passes>),coro-split,coro-annotation-elide)),deadargelim,coro-cleanup,globalopt,globaldce,rpo-function-attrs,recompute-globalsaa,function<eager-inv>(float2int,lower-constant-intrinsics,chr,loop(loop-rotate<header-duplication;prepare-for-lto>,loop-deletion),loop-distribute,inject-tli-mappings,loop-vectorize<interleave-forced-only;vectorize-forced-only;>,infer-alignment,loop-load-elim,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,vector-combine,instcombine<max-iterations=1;no-verify-fixpoint>,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,loop-sink,instsimplify,div-rem-pairs,tailcallelim,simp
lifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),globaldce,constmerge,function(annotation-remarks),canonicalize-aliases,name-anon-globals,verify
+
+PASSES2 := cross-dso-cfi,openmp-opt,globaldce<vfe-linkage-unit-visibility>,inferattrs,function<eager-inv>(callsite-splitting),pgo-icall-prom,cgscc(function-attrs,argpromotion,function(sroa<modify-cfg>)),ipsccp,called-value-propagation,rpo-function-attrs,globalsplit,wholeprogramdevirt,globalopt,function(mem2reg),constmerge,deadargelim,function<eager-inv>(instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine),expand-variadics,cgscc(inline<only-mandatory>,inline),globalopt,openmp-opt,globaldce<vfe-linkage-unit-visibility>,cgscc(argpromotion),function<eager-inv>(instcombine<max-iterations=1;no-verify-fixpoint>,constraint-elimination,jump-threading,sroa<modify-cfg>,tailcallelim),cgscc(function-attrs),require<globals-aa>,function(invalidate<aa>),cgscc(openmp-opt-cgscc),function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<interleave-forced-only;vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading),lowertypetests,lowertypetests,function(loop-sink,div-rem-pairs,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),elim-avail-extern,globaldce<vfe-linkage-unit-visibility>,
rel-lookup-table-converter,cg-profile,function(annotation-remarks),canonicalize-aliases,name-anon-globals
+#PASSES2 := cross-dso-cfi,openmp-opt,globaldce<vfe-linkage-unit-visibility>,inferattrs,function<eager-inv>(callsite-splitting),pgo-icall-prom,cgscc(function-attrs,argpromotion,function(sroa<modify-cfg>)),ipsccp,called-value-propagation,rpo-function-attrs,globalsplit,wholeprogramdevirt,globalopt,function(mem2reg),constmerge,deadargelim,function<eager-inv>(instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine),expand-variadics,cgscc(inline<only-mandatory>,inline),globalopt,openmp-opt,globaldce<vfe-linkage-unit-visibility>,cgscc(argpromotion),function<eager-inv>(instcombine<max-iterations=1;no-verify-fixpoint>,constraint-elimination,jump-threading,sroa<modify-cfg>,tailcallelim),cgscc(function-attrs),require<globals-aa>,function(invalidate<aa>),cgscc(openmp-opt-cgscc),function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<interleave-forced-only;vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading),lowertypetests,lowertypetests,function(loop-sink,div-rem-pairs,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),elim-avail-extern,globaldce<vfe-linkage-unit-visibility>
,rel-lookup-table-converter,cg-profile,function(annotation-remarks),canonicalize-aliases,name-anon-globals,EnzymeNewPM
+
+PASSES3 := cross-dso-cfi,openmp-opt,globaldce<vfe-linkage-unit-visibility>,inferattrs,function<eager-inv>(callsite-splitting),pgo-icall-prom,cgscc(function-attrs,argpromotion,function(sroa<modify-cfg>)),ipsccp,called-value-propagation,rpo-function-attrs,globalsplit,wholeprogramdevirt,globalopt,function(mem2reg),constmerge,deadargelim,function<eager-inv>(instcombine<max-iterations=1;no-verify-fixpoint>,aggressive-instcombine),expand-variadics,cgscc(inline<only-mandatory>,inline),globalopt,openmp-opt,globaldce<vfe-linkage-unit-visibility>,cgscc(argpromotion),function<eager-inv>(instcombine<max-iterations=1;no-verify-fixpoint>,constraint-elimination,jump-threading,sroa<modify-cfg>,tailcallelim),cgscc(function-attrs),require<globals-aa>,function(invalidate<aa>),cgscc(openmp-opt-cgscc),function<eager-inv>(loop-mssa(licm<allowspeculation>),gvn<>,memcpyopt,dse,move-auto-init,mldst-motion<no-split-footer-bb>,loop(indvars,loop-deletion,loop-unroll-full),loop-distribute,loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,infer-alignment,loop-unroll<O3>,transform-warning,sroa<preserve-cfg>,instcombine<max-iterations=1;no-verify-fixpoint>,simplifycfg<bonus-inst-threshold=1;forward-switch-cond;switch-range-to-icmp;switch-to-lookup;no-keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;no-speculate-unpredictables>,sccp,instcombine<max-iterations=1;no-verify-fixpoint>,bdce,slp-vectorizer,vector-combine,infer-alignment,instcombine<max-iterations=1;no-verify-fixpoint>,loop-mssa(licm<allowspeculation>),alignment-from-assumptions,jump-threading),lowertypetests,lowertypetests,function(loop-sink,div-rem-pairs,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>),elim-avail-extern,globaldce<vfe-link
age-unit-visibility>,mergefunc,rel-lookup-table-converter,cg-profile,function(annotation-remarks),canonicalize-aliases,name-anon-globals
diff --git a/enzyme/benchmarks/ReverseMode/adbench/ba.h b/enzyme/benchmarks/ReverseMode/adbench/ba.h
index 6a3f977..131a5f8 100644
--- a/enzyme/benchmarks/ReverseMode/adbench/ba.h
+++ b/enzyme/benchmarks/ReverseMode/adbench/ba.h
@@ -427,7 +427,7 @@
}
}
- {
+ for (int j=0;j<5;j++) {
struct BAInput input;
read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams,
@@ -659,7 +659,7 @@
}
}
- {
+ for(int j=0;j<5;j++){
struct BAInput input;
read_ba_instance("data/" + path, input.n, input.m, input.p, input.cams,
diff --git a/enzyme/benchmarks/ReverseMode/adbench/gmm.h b/enzyme/benchmarks/ReverseMode/adbench/gmm.h
index c5ec727..35f4423 100644
--- a/enzyme/benchmarks/ReverseMode/adbench/gmm.h
+++ b/enzyme/benchmarks/ReverseMode/adbench/gmm.h
@@ -213,17 +213,11 @@
std::vector<std::string> paths = { "10k/gmm_d10_K200.txt" };
- //getTests(paths, "data/1k", "1k/");
- if (std::getenv("BENCH_LARGE")) {
- getTests(paths, "data/2.5k", "2.5k/");
- getTests(paths, "data/10k", "10k/");
- }
-
getTests(paths, "data/1k", "1k/");
- if (std::getenv("BENCH_LARGE")) {
+ //if (std::getenv("BENCH_LARGE")) {
getTests(paths, "data/2.5k", "2.5k/");
getTests(paths, "data/10k", "10k/");
- }
+ //}
std::ofstream jsonfile("results.json", std::ofstream::trunc);
json test_results;
@@ -274,7 +268,7 @@
struct GMMOutput result = { 0, std::vector<double>(Jcols) };
- //if (0) {
+ if (0) {
try {
struct timeval start, end;
gettimeofday(&start, NULL);
@@ -294,7 +288,7 @@
} catch (std::bad_alloc) {
printf("Adept combined 88888888 ooms\n");
}
- //}
+ }
}
for (size_t i = 0; i < 5; i++)
diff --git a/enzyme/benchmarks/ReverseMode/adbench/lstm.h b/enzyme/benchmarks/ReverseMode/adbench/lstm.h
index 4f99841..80452b4 100644
--- a/enzyme/benchmarks/ReverseMode/adbench/lstm.h
+++ b/enzyme/benchmarks/ReverseMode/adbench/lstm.h
@@ -243,8 +243,8 @@
int main(const int argc, const char* argv[]) {
printf("starting main\n");
- //std::vector<std::string> paths = { "lstm_l2_c1024.txt", "lstm_l4_c1024.txt", "lstm_l2_c4096.txt", "lstm_l4_c4096.txt" };
- std::vector<std::string> paths = { "lstm_l4_c4096.txt" };
+ std::vector<std::string> paths = { "lstm_l2_c1024.txt", "lstm_l4_c1024.txt", "lstm_l2_c4096.txt", "lstm_l4_c4096.txt" };
+ //std::vector<std::string> paths = { "lstm_l4_c4096.txt" };
std::ofstream jsonfile("results.json", std::ofstream::trunc);
json test_results;
@@ -289,7 +289,7 @@
}
- {
+ if (0){
struct LSTMInput input = {};
@@ -323,7 +323,7 @@
}
- {
+ for (int j=0; j<5; j++){
struct LSTMInput input = {};
@@ -390,7 +390,7 @@
}
}
- {
+ for (int j=0; j<5; j++){
struct LSTMInput input = {};
diff --git a/enzyme/benchmarks/ReverseMode/ba/Makefile.make b/enzyme/benchmarks/ReverseMode/ba/Makefile.make
index 50ab0cf..cec8d4b 100644
--- a/enzyme/benchmarks/ReverseMode/ba/Makefile.make
+++ b/enzyme/benchmarks/ReverseMode/ba/Makefile.make
@@ -4,6 +4,28 @@
dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..)
+include $(dir)/benchmarks/ReverseMode/adbench/Makefile.config
+
+ifeq ($(strip $(CLANG)),)
+$(error CLANG is not set; define it in benchmarks/ReverseMode/adbench/Makefile.config)
+endif
+
+ifeq ($(strip $(PASSES1)),)
+$(error PASSES1 is not set)
+endif
+
+ifeq ($(strip $(PASSES2)),)
+$(error PASSES2 is not set)
+endif
+
+ifeq ($(strip $(PASSES3)),)
+$(error PASSES3 is not set)
+endif
+
+ifneq ($(strip $(PASSES4)),)
+$(error PASSES4 is set)
+endif
+
clean:
rm -f *.ll *.o results.txt results.json
cargo +enzyme clean
@@ -12,16 +34,13 @@
RUSTFLAGS="-Z autodiff=Enable" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm
%-unopt.ll: %.cpp
- clang++ $(BENCH) $(PTR) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm
+ $(CLANG) $(BENCH) $^ -pthread -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o $@ -S -emit-llvm
-%-raw.ll: %-unopt.ll
- opt $^ $(LOAD) $(ENZYME) -o $@ -S
-
-%-opt.ll: %-raw.ll
- opt $^ -o $@ -S
+%-opt.ll: %-unopt.ll
+ $(OPT) $^ $(LOAD) -passes="$(PASSES2),enzyme" -o $@ -S
ba.o: ba-opt.ll $(dir)/benchmarks/ReverseMode/ba/target/release/libbars.a
- clang++ $(BENCH) -pthread -O2 $^ -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o $@ $(BENCHLINK) -lpthread -lm -L /usr/lib/gcc/x86_64-linux-gnu/11
+ $(CLANG) -pthread -O3 -fno-math-errno $^ -o $@ $(BENCHLINK) -lm
results.json: ba.o
numactl -C 1 ./$^
diff --git a/enzyme/benchmarks/ReverseMode/ba/ba.cpp b/enzyme/benchmarks/ReverseMode/ba/ba.cpp
index 602af73..c9b29ec 100644
--- a/enzyme/benchmarks/ReverseMode/ba/ba.cpp
+++ b/enzyme/benchmarks/ReverseMode/ba/ba.cpp
@@ -115,6 +115,15 @@
proj[1] = proj[1] * L;
}
+void radial_distort_restrict(double const *__restrict rad_params, double *__restrict proj)
+{
+    // Scale both entries of proj by 1 + rad_params[0]*r2 + rad_params[1]*r2*r2, r2 = sqsum(2, proj).
+    double const r2 = sqsum(2, proj);
+    double const scale = 1. + rad_params[0] * r2 + rad_params[1] * r2 * r2;
+    proj[0] = proj[0] * scale;
+    proj[1] = proj[1] * scale;
+}
+
void project_restrict(double const *__restrict cam, double const *__restrict X,
double *__restrict proj) {
double const* C = &cam[3];
@@ -129,7 +138,7 @@
proj[0] = Xcam[0] / Xcam[2];
proj[1] = Xcam[1] / Xcam[2];
- radial_distort(&cam[9], proj);
+ radial_distort_restrict(&cam[9], proj);
proj[0] = proj[0] * cam[6] + cam[7];
proj[1] = proj[1] * cam[6] + cam[8];
diff --git a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs
index 3530c79..dd8bf88 100644
--- a/enzyme/benchmarks/ReverseMode/ba/src/safe.rs
+++ b/enzyme/benchmarks/ReverseMode/ba/src/safe.rs
@@ -182,9 +182,9 @@
#[no_mangle]
extern "C" fn rust2_ba_objective(
- n: usize,
- m: usize,
- p: usize,
+ n: i32,
+ m: i32,
+ p: i32,
cams: *const f64,
x: *const f64,
w: *const f64,
@@ -193,6 +193,9 @@
reproj_err: *mut f64,
w_err: *mut f64,
) {
+ let n = n as usize;
+ let m = m as usize;
+ let p = p as usize;
let cams = unsafe { std::slice::from_raw_parts(cams, n * 11) };
let x = unsafe { std::slice::from_raw_parts(x, m * 3) };
let w = unsafe { std::slice::from_raw_parts(w, p) };
diff --git a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs
index 09f74be..467a7cb 100644
--- a/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs
+++ b/enzyme/benchmarks/ReverseMode/ba/src/unsafe.rs
@@ -110,9 +110,9 @@
#[no_mangle]
unsafe extern "C" fn rust2_unsafe_ba_objective(
- n: usize,
- m: usize,
- p: usize,
+ n: i32,
+ m: i32,
+ p: i32,
cams: *const f64,
x: *const f64,
w: *const f64,
@@ -121,6 +121,9 @@
reproj_err: *mut f64,
w_err: *mut f64,
) {
+ let n = n as usize;
+ let m = m as usize;
+ let p = p as usize;
for i in 0..p {
let cam_idx = *obs.add(i * 2 + 0) as usize;
let pt_idx = *obs.add(i * 2 + 1) as usize;
diff --git a/enzyme/benchmarks/ReverseMode/fft/Makefile.make b/enzyme/benchmarks/ReverseMode/fft/Makefile.make
index b9385cd..9ed3daa 100644
--- a/enzyme/benchmarks/ReverseMode/fft/Makefile.make
+++ b/enzyme/benchmarks/ReverseMode/fft/Makefile.make
@@ -4,6 +4,28 @@
dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..)
+include $(dir)/benchmarks/ReverseMode/adbench/Makefile.config
+
+ifeq ($(strip $(CLANG)),)
+$(error CLANG is not set; define it in benchmarks/ReverseMode/adbench/Makefile.config)
+endif
+
+ifeq ($(strip $(PASSES1)),)
+$(error PASSES1 is not set)
+endif
+
+ifeq ($(strip $(PASSES2)),)
+$(error PASSES2 is not set)
+endif
+
+ifeq ($(strip $(PASSES3)),)
+$(error PASSES3 is not set)
+endif
+
+ifneq ($(strip $(PASSES4)),)
+$(error PASSES4 is set)
+endif
+
clean:
rm -f *.ll *.o results.txt results.json
@@ -11,17 +33,21 @@
RUSTFLAGS="-Z autodiff=Enable" cargo +enzyme rustc --release --lib --crate-type=staticlib
%-unopt.ll: %.cpp
- clang++ $(BENCH) $(PTR) $^ -pthread -O2 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm
+ $(CLANG) $(BENCH) $^ -DCPP=1 -fno-math-errno -fno-plt -pthread -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o $@ -S -emit-llvm #-fno-use-cxa-atexit
+%-unoptr.ll: %.cpp
+ $(CLANG) $(BENCH) $^ -fno-math-errno -fno-plt -pthread -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o $@ -S -emit-llvm #-fno-use-cxa-atexit
-%-raw.ll: %-unopt.ll
- opt $^ $(LOAD) $(ENZYME) -o $@ -S
-%-opt.ll: %-raw.ll
- opt $^ -o $@ -S
+%-opt.ll: %-unopt.ll
+ $(OPT) $^ $(LOAD) -passes="$(PASSES2),enzyme" -o $@ -S
+%-optr.ll: %-unoptr.ll
+ $(OPT) $^ $(LOAD) -passes="$(PASSES2),enzyme" -o $@ -S
fft.o: fft-opt.ll $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a
- clang++ $(BENCH) -pthread -O2 $^ -o $@ $(BENCHLINK) -lpthread -lm -L /usr/lib/gcc/x86_64-linux-gnu/11
- #clang++ $(LOAD) $(BENCH) fft.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o fft.o -lpthread $(BENCHLINK) -lm -L /usr/lib/gcc/x86_64-linux-gnu/11
+ $(CLANG) -DCPP=1 -pthread -O3 -fno-math-errno -fno-plt -lpthread -lm $^ -o $@ $(BENCHLINK) -lm
+fftr.o: fft-optr.ll $(dir)/benchmarks/ReverseMode/fft/target/release/libfft.a
+ $(CLANG) -pthread -O3 -fno-math-errno -fno-plt -lpthread -lm $^ -o $@ $(BENCHLINK) -lm
-results.json: fft.o
- ./$^ 1048576 | tee $@
+results.json: fftr.o fft.o
+ numactl -C 1 ./fft.o 1048576 | tee results.json
+ numactl -C 1 ./fftr.o 1048576 | tee resultsr.json
diff --git a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make
index 17e22dd..f5f6de4 100644
--- a/enzyme/benchmarks/ReverseMode/gmm/Makefile.make
+++ b/enzyme/benchmarks/ReverseMode/gmm/Makefile.make
@@ -1,28 +1,46 @@
-# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" PTR="%ptr" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" LOADCLANG="%loadClangEnzyme" ENZYME="%enzyme" make -B gmm-raw.ll results.json -f %s
+# RUN: cd %S && LD_LIBRARY_PATH="%bldpath:$LD_LIBRARY_PATH" BENCH="%bench" BENCHLINK="%blink" LOAD="%loadEnzyme" LOADCLANG="%loadClangEnzyme" ENZYME="%enzyme" make -B gmm-opt.ll results.json -f %s
.PHONY: clean
dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..)
+include $(dir)/benchmarks/ReverseMode/adbench/Makefile.config
+
+ifeq ($(strip $(CLANG)),)
+$(error CLANG is not set; define it in benchmarks/ReverseMode/adbench/Makefile.config)
+endif
+
+ifeq ($(strip $(PASSES1)),)
+$(error PASSES1 is not set)
+endif
+
+ifeq ($(strip $(PASSES2)),)
+$(error PASSES2 is not set)
+endif
+
+ifeq ($(strip $(PASSES3)),)
+$(error PASSES3 is not set)
+endif
+
+ifneq ($(strip $(PASSES4)),)
+$(error PASSES4 is set)
+endif
+
clean:
rm -f *.ll *.o results.txt results.json
cargo +enzyme clean
$(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a: src/lib.rs Cargo.toml
- RUSTFLAGS="-Z autodiff=Enable,LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib --features=libm
+ RUSTFLAGS="-Z autodiff=Enable,PrintPasses,LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib
%-unopt.ll: %.cpp
- clang++ $(BENCH) $(PTR) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm
+ $(CLANG) $(BENCH) $^ -pthread -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o $@ -S -emit-llvm
-%-raw.ll: %-unopt.ll
- opt $^ $(LOAD) $(ENZYME) -o $@ -S
-
-%-opt.ll: %-raw.ll
- opt $^ -o $@ -S
+%-opt.ll: %-unopt.ll
+ $(OPT) $^ $(LOAD) -passes="$(PASSES2),enzyme" -o $@ -S
gmm.o: gmm-opt.ll $(dir)/benchmarks/ReverseMode/gmm/target/release/libgmmrs.a
- clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm
- #clang++ $(LOADCLANG) $(BENCH) gmm.cpp -I /usr/include/c++/11 -I/usr/include/x86_64-linux-gnu/c++/11 -O2 -o gmm.o -lpthread $(BENCHLINK) -lm -L /usr/lib/gcc/x86_64-linux-gnu/11
+ $(CLANG) -pthread -O3 -fno-math-errno $^ -o $@ $(BENCHLINK) -lm
results.json: gmm.o
numactl -C 1 ./$^
diff --git a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make
index 1388a54..71c6f5b 100644
--- a/enzyme/benchmarks/ReverseMode/lstm/Makefile.make
+++ b/enzyme/benchmarks/ReverseMode/lstm/Makefile.make
@@ -4,24 +4,44 @@
dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..)
+include $(dir)/benchmarks/ReverseMode/adbench/Makefile.config
+
+ifeq ($(strip $(CLANG)),)
+$(error CLANG is not set; define it in benchmarks/ReverseMode/adbench/Makefile.config)
+endif
+
+ifeq ($(strip $(PASSES1)),)
+$(error PASSES1 is not set)
+endif
+
+ifeq ($(strip $(PASSES2)),)
+$(error PASSES2 is not set)
+endif
+
+ifeq ($(strip $(PASSES3)),)
+$(error PASSES3 is not set)
+endif
+
+ifneq ($(strip $(PASSES4)),)
+$(error PASSES4 is set)
+endif
+
clean:
rm -f *.ll *.o results.txt results.json
cargo +enzyme clean
$(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a: src/lib.rs Cargo.toml
- RUSTFLAGS="-Z autodiff=Enable,LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib
+ RUSTFLAGS="-Z autodiff=Enable,PrintPasses" cargo +enzyme rustc --release --lib --crate-type=staticlib
%-unopt.ll: %.cpp
- clang++ $(BENCH) $(PTR) $^ -pthread -O2 -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm
+ $(CLANG) $(BENCH) $^ -pthread -O3 -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o $@ -S -emit-llvm
-%-raw.ll: %-unopt.ll
- opt $^ $(LOAD) $(ENZYME) -o $@ -S
-
-%-opt.ll: %-raw.ll
- opt $^ -o $@ -S
+%-opt.ll: %-unopt.ll
+ $(OPT) $^ $(LOAD) -passes="$(PASSES2),enzyme" -o $@ -S
lstm.o: lstm-opt.ll $(dir)/benchmarks/ReverseMode/lstm/target/release/liblstm.a
- clang++ -pthread -O2 $^ -o $@ $(BENCHLINK) -lm
+ $(CLANG) -pthread -O3 $^ -o $@ $(BENCHLINK) -lm
+ #$(CLANG) -pthread -O3 -fno-math-errno $^ -o $@ $(BENCHLINK) -lm
results.json: lstm.o
numactl -C 1 ./$^
diff --git a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs
index d6847a4..3329ebb 100644
--- a/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs
+++ b/enzyme/benchmarks/ReverseMode/lstm/src/safe.rs
@@ -1,5 +1,6 @@
use std::slice;
use std::autodiff::autodiff;
+use std::hint::assert_unchecked;
// Sigmoid on scalar
fn sigmoid(x: f64) -> f64 {
@@ -32,11 +33,11 @@
let (a, b) = gates.split_at_mut(2 * hsize);
let ((forget, ingate), (outgate, change)) = (a.split_at_mut(hsize), b.split_at_mut(hsize));
- //debug_assert_eq!(weight.len(), 4 * hsize);
- //debug_assert_eq!(bias.len(), 4 * hsize);
- //debug_assert_eq!(hidden.len(), hsize);
- //debug_assert!(cell.len() >= hsize);
- //debug_assert!(input.len() >= hsize);
+ // unsafe {assert_unchecked(weight.len()== 4 * hsize)};
+ // unsafe {assert_unchecked(bias.len()== 4 * hsize)};
+ // unsafe {assert_unchecked(hidden.len()== hsize)};
+ // unsafe {assert_unchecked(cell.len() >= hsize)};
+ // unsafe {assert_unchecked(input.len() >= hsize)};
// caching input
for i in 0..hsize {
forget[i] = sigmoid(input[i] * weight[i] + bias[i]);
@@ -131,7 +132,7 @@
let mut ypred = vec![0.0; b];
let mut ynorm = vec![0.0; b];
- //debug_assert!(b > 0);
+ // unsafe{assert_unchecked(b > 0)};
let limit = (c - 1) * b;
for j in 0..(c - 1) {
@@ -156,15 +157,18 @@
#[no_mangle]
pub extern "C" fn rust_lstm_objective(
- l: usize,
- c: usize,
- b: usize,
+ l: i32,
+ c: i32,
+ b: i32,
main_params: *const f64,
extra_params: *const f64,
state: *mut f64,
sequence: *const f64,
loss: *mut f64,
) {
+ let l = l as usize;
+ let c = c as usize;
+ let b = b as usize;
let (main_params, extra_params, state, sequence) = unsafe {
(
slice::from_raw_parts(main_params, 2 * l * 4 * b),
@@ -190,9 +194,9 @@
#[no_mangle]
pub extern "C" fn rust_dlstm_objective(
- l: usize,
- c: usize,
- b: usize,
+ l: i32,
+ c: i32,
+ b: i32,
main_params: *const f64,
d_main_params: *mut f64,
extra_params: *const f64,
@@ -202,6 +206,9 @@
res: *mut f64,
d_res: *mut f64,
) {
+ let l = l as usize;
+ let c = c as usize;
+ let b = b as usize;
let (main_params, d_main_params, extra_params, d_extra_params, state, sequence) = unsafe {
(
slice::from_raw_parts(main_params, 2 * l * 4 * b),
diff --git a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make
index 87af95f..582ba79 100644
--- a/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make
+++ b/enzyme/benchmarks/ReverseMode/ode-real/Makefile.make
@@ -4,6 +4,28 @@
dir := $(abspath $(lastword $(MAKEFILE_LIST))/../../../..)
+include $(dir)/benchmarks/ReverseMode/adbench/Makefile.config
+
+ifeq ($(strip $(CLANG)),)
+$(error CLANG is not set; define it in benchmarks/ReverseMode/adbench/Makefile.config)
+endif
+
+ifeq ($(strip $(PASSES1)),)
+$(error PASSES1 is not set)
+endif
+
+ifeq ($(strip $(PASSES2)),)
+$(error PASSES2 is not set)
+endif
+
+ifeq ($(strip $(PASSES3)),)
+$(error PASSES3 is not set)
+endif
+
+ifneq ($(strip $(PASSES4)),)
+$(error PASSES4 is set)
+endif
+
clean:
rm -f *.ll *.o results.txt results.json
cargo +enzyme clean
@@ -12,16 +34,13 @@
RUSTFLAGS="-Z autodiff=Enable,LooseTypes" cargo +enzyme rustc --release --lib --crate-type=staticlib
%-unopt.ll: %.cpp
- clang++ $(BENCH) $(PTR) $^ -O2 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -ffast-math -fno-unroll-loops -o $@ -S -emit-llvm
+ $(CLANG) $(BENCH) $^ -pthread -O3 -fno-use-cxa-atexit -fno-vectorize -fno-slp-vectorize -fno-unroll-loops -o $@ -S -emit-llvm
-%-raw.ll: %-unopt.ll
- opt $^ $(LOAD) $(ENZYME) -o $@ -S
-
-%-opt.ll: %-raw.ll
- opt $^ -o $@ -S
+%-opt.ll: %-unopt.ll
+ $(OPT) $^ $(LOAD) -passes="$(PASSES2),enzyme" -o $@ -S
ode.o: ode-opt.ll $(dir)/benchmarks/ReverseMode/ode-real/target/release/libode.a
- clang++ $(BENCH) -O2 $^ -o $@ $(BENCHLINK)
+ $(CLANG) -pthread -O3 -fno-math-errno $^ -o $@ $(BENCHLINK)
results.json: ode.o
numactl -C 1 ./$^ 1000 | tee $@