// enzyme/test/Integration/ReverseMode/eigentensorfull.cpp - EnzymeAD/Enzyme - Git at Google

 // LLVM 14 itself has a bug compiling eigen, even the original code without AD
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -mllvm -force-vector-width=1 -ffast-math -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O3 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O2 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O1 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions %O0TBAA %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O3 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli -; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O2 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli -; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O1 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli - ; fi
 // RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions %O0TBAA %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli - ; fi

 // Eigen configuration macros. They are defined ahead of the Eigen #includes
 // further down so they are visible when those headers are processed.
 // The per-macro notes below follow Eigen's "Preprocessor directives"
 // documentation -- NOTE(review): confirm against the Eigen version in use.
 //   EIGEN_NO_AUTOMATIC_RESIZING: assignments do not resize the destination.
 //   EIGEN_DONT_ALIGN:            disables Eigen's over-alignment of data.
 //   EIGEN_NO_DEBUG:              disables Eigen's internal assertions.
 //   EIGEN_UNROLLING_LIMIT 0:     presumably turns off Eigen's meta-unrolling.
 //   EIGEN_DONT_VECTORIZE:        disables Eigen's explicit SIMD code paths.
 #define EIGEN_NO_AUTOMATIC_RESIZING 1
 #define EIGEN_DONT_ALIGN 1
 #define EIGEN_NO_DEBUG 1
 #define EIGEN_UNROLLING_LIMIT 0
 #define EIGEN_DONT_VECTORIZE 1

 #include "../test_utils.h"

 /*
 void memcpy(float* __restrict dst, float* __restrict src, size_t count) {
     for(size_t i=0; i<count/sizeof(float); i++) {
         dst[i] = src[i];
     }
 }

 void memcpy(double* __restrict dst, double* __restrict src, size_t count) {
     for(size_t i=0; i<count/sizeof(double); i++) {
         dst[i] = src[i];
     }
 }


 template<typename T>
 void memcpy(T* __restrict dst, T* __restrict src, size_t count) {
     for(size_t i=0; i<count/sizeof(T); i++) {
         dst[i] = src[i];
     }
 }*/


 #include <eigen3/Eigen/Dense>
 #include <eigen3/unsupported/Eigen/CXX11/Tensor>

 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/time.h>
 #include <stdlib.h>
 #include <math.h>
 #include <inttypes.h>
 #include <stdlib.h>
 #include <string.h>

 using Eigen::MatrixXd;
 using Eigen::Matrix;
 using Eigen::Tensor;

 // Size constants. None of them is referenced by the code visible in this
 // file; they are kept so the set of file-scope names stays unchanged.
 constexpr size_t IN = 4;
 constexpr size_t OUT = 4;
 constexpr size_t NUM = 5;


 namespace Eigen {
 namespace internal {

 // Explicit specialization of Eigen's internal smart_copy_helper for
 // <float, true>. It copies the half-open range [start, end) into `target`
 // one element at a time using an unsigned index.
 // NOTE(review): presumably this replaces the library's default bulk copy so
 // the copy appears as a plain loop; the primary template is not visible here.
 template<> struct smart_copy_helper<float, true> {
   EIGEN_DEVICE_FUNC static inline void run(const float* start, const float* end, float* target) {
     unsigned offset = 0;
     while (start + offset != end) {
       target[offset] = start[offset];
       ++offset;
     }
   }
 };

 }  // namespace internal
 }  // namespace Eigen


 // C-linkage declaration of __enzyme_autodiff as it is called from main():
 // the first argument is the function to differentiate (passed as void*),
 // followed by pointer pairs -- kernel/Kp, input/Ip, output/Op -- where the
 // second member of each pair is the matching derivative buffer.
 // NOTE(review): no definition exists in this file; it is expected to be
 // supplied by the Enzyme pass run in the RUN lines.
 extern "C" {
     extern double __enzyme_autodiff(
         void*,
         const Tensor<float, 2>* __restrict K,
         const Tensor<float, 2>* __restrict Kp,
         const Tensor<float, 4>* __restrict I,
         const Tensor<float, 4>* __restrict Ip,
         Tensor<float, 4>* __restrict O,
         Tensor<float, 4>* __restrict Op);
 }

 __attribute__((noinline))
 static void matvec(const Tensor<float, 2>* __restrict K, const Tensor<float, 4>* __restrict In, Tensor<float, 4>* Out) {
   Eigen::array<ptrdiff_t, 2> dims({1, 2});
   *Out = In->convolve(*K, dims);
 }

 // Test driver.
 //   1. Builds a random 3x3x7x11 input and a random 2x2 kernel.
 //   2. Builds derivative buffers: inputp/kernelp zeroed, outputp random
 //      (outputp acts as the weight applied to each output element).
 //   3. Computes the reference kernel gradient by hand.
 //   4. Runs matvec once normally, then through __enzyme_autodiff.
 //   5. Checks every entry of kernelp against the reference with APPROX_EQ
 //      (tolerance 1e-3).
 // argc/argv are unused.
 int main(int argc, char** argv) {

     Tensor<float, 4> input(3, 3, 7, 11);
     Tensor<float, 2> kernel(2, 2);
     Tensor<float, 4> output(3, 2, 6, 11);
     input.setRandom();
     kernel.setRandom();

     Tensor<float, 4> inputp(3, 3, 7, 11);
     Tensor<float, 2> kernelp(2, 2);
     Tensor<float, 4> outputp(3, 2, 6, 11);
     inputp.setZero();
     kernelp.setZero();
     outputp.setRandom(); //One();

     // Reference gradient. The forward computation is
     //   output(i,j,k,l) = sum over si,sj of input(i,j+si,k+sj,l) * kernel(si,sj)
     // so d/dkernel(si,sj) of sum(outputp * output) is
     //   sum over i,j,k,l of outputp(i,j,k,l) * input(i,j+si,k+sj,l).
     // Nothing is read from `output` here: it is only filled by the matvec
     // call below, so reading it at this point would touch unset storage.
     // This must run before __enzyme_autodiff, which receives &outputp as a
     // mutable pointer and may overwrite it.
     Tensor<float, 2> expected_kernel(2, 2);
     expected_kernel.setZero();
     for (int i = 0; i < 3; ++i) {
       for (int j = 0; j < 2; ++j) {
         for (int k = 0; k < 6; ++k) {
           for (int l = 0; l < 11; ++l) {
             for (int si = 0; si < 2; si++)
               for (int sj = 0; sj < 2; sj++)
                 expected_kernel(si,sj) += outputp(i, j, k, l) * input(i, j+si, k+sj, l);
           }
         }
       }
     }

     matvec(&kernel, &input, &output);
     printf("did original\n");
     __enzyme_autodiff((void*)matvec, &kernel, &kernelp, &input, &inputp, &output, &outputp);

     for (int si = 0; si < 2; si++)
       for (int sj = 0; sj < 2; sj++) {
         fprintf(stderr, "kernelp(si=%d, sj=%d)=%f, expected_kernel(si=%d, sj=%d)=%f\n", si, sj, kernelp(si, sj), si, sj, expected_kernel(si, sj) );
         APPROX_EQ( kernelp(si, sj), expected_kernel(si, sj), 1e-3);
       }

 }
	// LLVM 14 itself has a bug compiling eigen, even the original code without AD
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -mllvm -force-vector-width=1 -ffast-math -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O3 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O2 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O1 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions %O0TBAA %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -S | %lli - ; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O3 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli -; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O2 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli -; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions -O1 %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli - ; fi
	// RUN: if [ %llvmver -ne 14 ] && [ %llvmver -ne 15 ]; then %clang++ -I/usr/include/eigen3 -Xclang -new-struct-path-tbaa -fno-unroll-loops -fno-vectorize -fno-slp-vectorize -fno-exceptions %O0TBAA %s -S -emit-llvm -o - | %opt - %OPloadEnzyme %enzyme -enzyme-inline=1 -S | %lli - ; fi

	// Eigen configuration macros. They are defined ahead of the Eigen #includes
	// further down so they are visible when those headers are processed.
	// The per-macro notes below follow Eigen's "Preprocessor directives"
	// documentation -- NOTE(review): confirm against the Eigen version in use.
	//   EIGEN_NO_AUTOMATIC_RESIZING: assignments do not resize the destination.
	//   EIGEN_DONT_ALIGN:            disables Eigen's over-alignment of data.
	//   EIGEN_NO_DEBUG:              disables Eigen's internal assertions.
	//   EIGEN_UNROLLING_LIMIT 0:     presumably turns off Eigen's meta-unrolling.
	//   EIGEN_DONT_VECTORIZE:        disables Eigen's explicit SIMD code paths.
	#define EIGEN_NO_AUTOMATIC_RESIZING 1
	#define EIGEN_DONT_ALIGN 1
	#define EIGEN_NO_DEBUG 1
	#define EIGEN_UNROLLING_LIMIT 0
	#define EIGEN_DONT_VECTORIZE 1

	#include "../test_utils.h"

	/*
	void memcpy(float* __restrict dst, float* __restrict src, size_t count) {
	for(size_t i=0; i<count/sizeof(float); i++) {
	dst[i] = src[i];
	}
	}

	void memcpy(double* __restrict dst, double* __restrict src, size_t count) {
	for(size_t i=0; i<count/sizeof(double); i++) {
	dst[i] = src[i];
	}
	}


	template<typename T>
	void memcpy(T* __restrict dst, T* __restrict src, size_t count) {
	for(size_t i=0; i<count/sizeof(T); i++) {
	dst[i] = src[i];
	}
	}*/



	#include <eigen3/Eigen/Dense>
	#include <eigen3/unsupported/Eigen/CXX11/Tensor>

	#include <math.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/time.h>
	#include <stdlib.h>
	#include <math.h>
	#include <inttypes.h>
	#include <stdlib.h>
	#include <string.h>

	using Eigen::MatrixXd;
	using Eigen::Matrix;
	using Eigen::Tensor;

	// Size constants. None of them is referenced by the code visible in this
	// file; they are kept so the set of file-scope names stays unchanged.
	constexpr size_t IN = 4;
	constexpr size_t OUT = 4;
	constexpr size_t NUM = 5;


	namespace Eigen {
	namespace internal {

	// Explicit specialization of Eigen's internal smart_copy_helper for
	// <float, true>. It copies the half-open range [start, end) into `target`
	// one element at a time using an unsigned index.
	// NOTE(review): presumably this replaces the library's default bulk copy so
	// the copy appears as a plain loop; the primary template is not visible here.
	template<> struct smart_copy_helper<float, true> {
	  EIGEN_DEVICE_FUNC static inline void run(const float* start, const float* end, float* target) {
	    unsigned offset = 0;
	    while (start + offset != end) {
	      target[offset] = start[offset];
	      ++offset;
	    }
	  }
	};

	}  // namespace internal
	}  // namespace Eigen


	// C-linkage declaration of __enzyme_autodiff as it is called from main():
	// the first argument is the function to differentiate (passed as void*),
	// followed by pointer pairs -- kernel/Kp, input/Ip, output/Op -- where the
	// second member of each pair is the matching derivative buffer.
	// Every tensor parameter is a pointer: main() passes addresses (&kernel,
	// &kernelp, ...) and `__restrict` is only valid on pointer types.
	// NOTE(review): no definition exists in this file; it is expected to be
	// supplied by the Enzyme pass run in the RUN lines.
	extern "C" {
	extern double __enzyme_autodiff(void*, const Tensor<float, 2>* __restrict K, const Tensor<float, 2>* __restrict Kp, const Tensor<float, 4>* __restrict I, const Tensor<float, 4>* __restrict Ip, Tensor<float, 4>* __restrict O, Tensor<float, 4>* __restrict Op);
	}

	// Despite its name, this computes a tensor convolution: `*In` is convolved
	// with the 2-D kernel `*K` along dimensions 1 and 2 and the result is
	// assigned to `*Out`. Marked noinline so it stays a separate function.
	__attribute__((noinline))
	static void matvec(const Tensor<float, 2>* __restrict K, const Tensor<float, 4>* __restrict In, Tensor<float, 4>* Out) {
	  Eigen::array<ptrdiff_t, 2> dims({1, 2});
	  // Dereference both pointers: convolve() takes the kernel tensor itself,
	  // and the result must be stored into the caller's tensor, not into the
	  // local pointer variable.
	  *Out = In->convolve(*K, dims);
	}

	// Test driver.
	//   1. Builds a random 3x3x7x11 input and a random 2x2 kernel.
	//   2. Builds derivative buffers: inputp/kernelp zeroed, outputp random
	//      (outputp acts as the weight applied to each output element).
	//   3. Computes the reference kernel gradient by hand.
	//   4. Runs matvec once normally, then through __enzyme_autodiff.
	//   5. Checks every entry of kernelp against the reference with APPROX_EQ
	//      (tolerance 1e-3).
	// argc/argv are unused.
	int main(int argc, char** argv) {

	  Tensor<float, 4> input(3, 3, 7, 11);
	  Tensor<float, 2> kernel(2, 2);
	  Tensor<float, 4> output(3, 2, 6, 11);
	  input.setRandom();
	  kernel.setRandom();

	  Tensor<float, 4> inputp(3, 3, 7, 11);
	  Tensor<float, 2> kernelp(2, 2);
	  Tensor<float, 4> outputp(3, 2, 6, 11);
	  inputp.setZero();
	  kernelp.setZero();
	  outputp.setRandom(); //One();

	  // Reference gradient. The forward computation is
	  //   output(i,j,k,l) = sum over si,sj of input(i,j+si,k+sj,l) * kernel(si,sj)
	  // so d/dkernel(si,sj) of sum(outputp * output) is
	  //   sum over i,j,k,l of outputp(i,j,k,l) * input(i,j+si,k+sj,l).
	  // Nothing is read from `output` here: it is only filled by the matvec
	  // call below, so reading it at this point would touch unset storage.
	  // This must run before __enzyme_autodiff, which receives &outputp as a
	  // mutable pointer and may overwrite it.
	  Tensor<float, 2> expected_kernel(2, 2);
	  expected_kernel.setZero();
	  for (int i = 0; i < 3; ++i) {
	    for (int j = 0; j < 2; ++j) {
	      for (int k = 0; k < 6; ++k) {
	        for (int l = 0; l < 11; ++l) {
	          for (int si = 0; si < 2; si++)
	            for (int sj = 0; sj < 2; sj++)
	              expected_kernel(si,sj) += outputp(i, j, k, l) * input(i, j+si, k+sj, l);
	        }
	      }
	    }
	  }

	  matvec(&kernel, &input, &output);
	  printf("did original\n");
	  __enzyme_autodiff((void*)matvec, &kernel, &kernelp, &input, &inputp, &output, &outputp);

	  for (int si = 0; si < 2; si++)
	    for (int sj = 0; sj < 2; sj++) {
	      fprintf(stderr, "kernelp(si=%d, sj=%d)=%f, expected_kernel(si=%d, sj=%d)=%f\n", si, sj, kernelp(si, sj), si, sj, expected_kernel(si, sj) );
	      APPROX_EQ( kernelp(si, sj), expected_kernel(si, sj), 1e-3);
	    }

	}