enzyme/test/Integration/blas_inline.h - EnzymeAD/Enzyme - Git at Google

 #include <stdio.h>
 #include <assert.h>

 typedef int32_t integer;
 typedef double doublereal;
 typedef bool logical;

 #define abs(x) ((x) >= 0 ? (x) : -(x))
 #define min(a,b) ((a) <= (b) ? (a) : (b))
 #define max(a,b) ((a) >= (b) ? (a) : (b))
 #define TRUE_ true
 #define FALSE_ false

 #ifdef __cplusplus
 extern "C" {
 #endif

 __attribute__((noinline))
 int xerbla_(const char *srname, integer *info, int len)
 {
     static char fmt_9999[] = "(\002 ** On entry to \002,a,\002 parameter num"
 	    "ber \002,i2,\002 had \002,\002an illegal value\002)";

 	printf("** On entry to %6s, parameter number %2i had an illegal value\n",
 		srname, *info);
 	assert(0 &&" error");
 	exit(1);
     return 0;
 }
 __attribute__((noinline))
 logical lsame_(char *ca, char *cb, int ca_size, int cb_size)
 {
     /* System generated locals */
     logical ret_val;

     /* Local variables */
     integer inta, intb, zcode;


 /*  -- LAPACK auxiliary routine (version 3.1) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
 /*     November 2006 */

 /*     .. Scalar Arguments .. */
 /*     .. */

 /*  Purpose */
 /*  ======= */

 /*  LSAME returns .TRUE. if CA is the same letter as CB regardless of */
 /*  case. */

 /*  Arguments */
 /*  ========= */

 /*  CA      (input) CHARACTER*1 */

 /*  CB      (input) CHARACTER*1 */
 /*          CA and CB specify the single characters to be compared. */

 /* ===================================================================== */

 /*     .. Intrinsic Functions .. */
 /*     .. */
 /*     .. Local Scalars .. */
 /*     .. */

 /*     Test if the characters are equal */

     ret_val = *(unsigned char *)ca == *(unsigned char *)cb;
     if (ret_val) {
 	return ret_val;
     }

 /*     Now test for equivalence if both characters are alphabetic. */

     zcode = 'Z';

 /*     Use 'Z' rather than 'A' so that ASCII can be detected on Prime */
 /*     machines, on which ICHAR returns a value with bit 8 set. */
 /*     ICHAR('A') on Prime machines returns 193 which is the same as */
 /*     ICHAR('A') on an EBCDIC machine. */

     inta = *(unsigned char *)ca;
     intb = *(unsigned char *)cb;

     if (zcode == 90 || zcode == 122) {

 /*        ASCII is assumed - ZCODE is the ASCII code of either lower or */
 /*        upper case 'Z'. */

 	if (inta >= 97 && inta <= 122) {
 	    inta += -32;
 	}
 	if (intb >= 97 && intb <= 122) {
 	    intb += -32;
 	}

     } else if (zcode == 233 || zcode == 169) {

 /*        EBCDIC is assumed - ZCODE is the EBCDIC code of either lower or */
 /*        upper case 'Z'. */

 	if (inta >= 129 && inta <= 137 || inta >= 145 && inta <= 153 || inta
 		>= 162 && inta <= 169) {
 	    inta += 64;
 	}
 	if (intb >= 129 && intb <= 137 || intb >= 145 && intb <= 153 || intb
 		>= 162 && intb <= 169) {
 	    intb += 64;
 	}

     } else if (zcode == 218 || zcode == 250) {

 /*        ASCII is assumed, on Prime machines - ZCODE is the ASCII code */
 /*        plus 128 of either lower or upper case 'Z'. */

 	if (inta >= 225 && inta <= 250) {
 	    inta += -32;
 	}
 	if (intb >= 225 && intb <= 250) {
 	    intb += -32;
 	}
     }
     ret_val = inta == intb;

 /*     RETURN */

 /*     End of LSAME */

     return ret_val;
 } /* lsame_ */

 __attribute__((noinline))

 logical dlaisnan_(doublereal *din1, doublereal *din2)
 {
     /* System generated locals */
     logical ret_val;


 /*  -- LAPACK auxiliary routine (version 3.2) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
 /*     November 2006 */

 /*     .. Scalar Arguments .. */
 /*     .. */

 /*  Purpose */
 /*  ======= */

 /*  This routine is not for general use.  It exists solely to avoid */
 /*  over-optimization in DISNAN. */

 /*  DLAISNAN checks for NaNs by comparing its two arguments for */
 /*  inequality.  NaN is the only floating-point value where NaN != NaN */
 /*  returns .TRUE.  To check for NaNs, pass the same variable as both */
 /*  arguments. */

 /*  A compiler must assume that the two arguments are */
 /*  not the same variable, and the test will not be optimized away. */
 /*  Interprocedural or whole-program optimization may delete this */
 /*  test.  The ISNAN functions will be replaced by the correct */
 /*  Fortran 03 intrinsic once the intrinsic is widely available. */

 /*  Arguments */
 /*  ========= */

 /*  DIN1     (input) DOUBLE PRECISION */
 /*  DIN2     (input) DOUBLE PRECISION */
 /*          Two numbers to compare for inequality. */

 /*  ===================================================================== */

 /*  .. Executable Statements .. */
     ret_val = *din1 != *din2;
     return ret_val;
 } /* dlaisnan_ */

 __attribute__((noinline))
 logical disnan_(doublereal *din)
 {
     /* System generated locals */
     logical ret_val;

     /* Local variables */

 /*  -- LAPACK auxiliary routine (version 3.2) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
 /*     November 2006 */

 /*     .. Scalar Arguments .. */
 /*     .. */

 /*  Purpose */
 /*  ======= */

 /*  DISNAN returns .TRUE. if its argument is NaN, and .FALSE. */
 /*  otherwise.  To be replaced by the Fortran 2003 intrinsic in the */
 /*  future. */

 /*  Arguments */
 /*  ========= */

 /*  DIN      (input) DOUBLE PRECISION */
 /*          Input to test for NaN. */

 /*  ===================================================================== */

 /*  .. External Functions .. */
 /*  .. */
 /*  .. Executable Statements .. */
     ret_val = dlaisnan_(din, din);
     return ret_val;
 } /* disnan_ */

 __attribute__((noinline))

 /* Subroutine */ void dlacpy_(char *uplo, integer *m, integer *n, const doublereal *
 	a, integer *lda, doublereal *b, integer *ldb)
 {
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, i__1, i__2;

     /* Local variables */
     integer i__, j;


 /*  -- LAPACK auxiliary routine (version 3.2) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
 /*     November 2006 */

 /*     .. Scalar Arguments .. */
 /*     .. */
 /*     .. Array Arguments .. */
 /*     .. */

 /*  Purpose */
 /*  ======= */

 /*  DLACPY copies all or part of a two-dimensional matrix A to another */
 /*  matrix B. */

 /*  Arguments */
 /*  ========= */

 /*  UPLO    (input) CHARACTER*1 */
 /*          Specifies the part of the matrix A to be copied to B. */
 /*          = 'U':      Upper triangular part */
 /*          = 'L':      Lower triangular part */
 /*          Otherwise:  All of the matrix A */

 /*  M       (input) INTEGER */
 /*          The number of rows of the matrix A.  M >= 0. */

 /*  N       (input) INTEGER */
 /*          The number of columns of the matrix A.  N >= 0. */

 /*  A       (input) DOUBLE PRECISION array, dimension (LDA,N) */
 /*          The m by n matrix A.  If UPLO = 'U', only the upper triangle */
 /*          or trapezoid is accessed; if UPLO = 'L', only the lower */
 /*          triangle or trapezoid is accessed. */

 /*  LDA     (input) INTEGER */
 /*          The leading dimension of the array A.  LDA >= max(1,M). */

 /*  B       (output) DOUBLE PRECISION array, dimension (LDB,N) */
 /*          On exit, B = A in the locations specified by UPLO. */

 /*  LDB     (input) INTEGER */
 /*          The leading dimension of the array B.  LDB >= max(1,M). */

 /*  ===================================================================== */

 /*     .. Local Scalars .. */
 /*     .. */
 /*     .. External Functions .. */
 /*     .. */
 /*     .. Intrinsic Functions .. */
 /*     .. */
 /*     .. Executable Statements .. */

     /* Parameter adjustments */
     a_dim1 = *lda;
     a_offset = 1 + a_dim1;
     a -= a_offset;
     b_dim1 = *ldb;
     b_offset = 1 + b_dim1;
     b -= b_offset;

     /* Function Body */
     if (lsame_(uplo, (char*)"U", 1, 1)) {
 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 	    i__2 = min(j,*m);
 	    for (i__ = 1; i__ <= i__2; ++i__) {
 		b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
 /* L10: */
 	    }
 /* L20: */
 	}
     } else if (lsame_(uplo, (char*)"L", 1, 1)) {
 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 	    i__2 = *m;
 	    for (i__ = j; i__ <= i__2; ++i__) {
 		b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
 /* L30: */
 	    }
 /* L40: */
 	}
     } else {
 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 	    i__2 = *m;
 	    for (i__ = 1; i__ <= i__2; ++i__) {
 		b[i__ + j * b_dim1] = a[i__ + j * a_dim1];
 /* L50: */
 	    }
 /* L60: */
 	}
     }
     return;

 /*     End of DLACPY */

 } /* dlacpy_ */

 __attribute__((noinline))

 /* Subroutine */ int dlascl_(char *type__, integer *kl, integer *ku,
 	doublereal *cfrom, doublereal *cto, integer *m, integer *n,
 	doublereal *a, integer *lda, integer *info)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5;

     /* Local variables */
     integer i__, j, k1, k2, k3, k4;
     doublereal mul, cto1;
     logical done;
     doublereal ctoc;
     integer itype;
     doublereal cfrom1;
     // extern doublereal dlamch_(char *);
     doublereal cfromc;
     doublereal bignum, smlnum;


 /*  -- LAPACK auxiliary routine (version 3.2) -- */
 /*     Univ. of Tennessee, Univ. of California Berkeley and NAG Ltd.. */
 /*     November 2006 */

 /*     .. Scalar Arguments .. */
 /*     .. */
 /*     .. Array Arguments .. */
 /*     .. */

 /*  Purpose */
 /*  ======= */

 /*  DLASCL multiplies the M by N real matrix A by the real scalar */
 /*  CTO/CFROM.  This is done without over/underflow as long as the final */
 /*  result CTO*A(I,J)/CFROM does not over/underflow. TYPE specifies that */
 /*  A may be full, upper triangular, lower triangular, upper Hessenberg, */
 /*  or banded. */

 /*  Arguments */
 /*  ========= */

 /*  TYPE    (input) CHARACTER*1 */
 /*          TYPE indices the storage type of the input matrix. */
 /*          = 'G':  A is a full matrix. */
 /*          = 'L':  A is a lower triangular matrix. */
 /*          = 'U':  A is an upper triangular matrix. */
 /*          = 'H':  A is an upper Hessenberg matrix. */
 /*          = 'B':  A is a symmetric band matrix with lower bandwidth KL */
 /*                  and upper bandwidth KU and with the only the lower */
 /*                  half stored. */
 /*          = 'Q':  A is a symmetric band matrix with lower bandwidth KL */
 /*                  and upper bandwidth KU and with the only the upper */
 /*                  half stored. */
 /*          = 'Z':  A is a band matrix with lower bandwidth KL and upper */
 /*                  bandwidth KU. */

 /*  KL      (input) INTEGER */
 /*          The lower bandwidth of A.  Referenced only if TYPE = 'B', */
 /*          'Q' or 'Z'. */

 /*  KU      (input) INTEGER */
 /*          The upper bandwidth of A.  Referenced only if TYPE = 'B', */
 /*          'Q' or 'Z'. */

 /*  CFROM   (input) DOUBLE PRECISION */
 /*  CTO     (input) DOUBLE PRECISION */
 /*          The matrix A is multiplied by CTO/CFROM. A(I,J) is computed */
 /*          without over/underflow if the final result CTO*A(I,J)/CFROM */
 /*          can be represented without over/underflow.  CFROM must be */
 /*          nonzero. */

 /*  M       (input) INTEGER */
 /*          The number of rows of the matrix A.  M >= 0. */

 /*  N       (input) INTEGER */
 /*          The number of columns of the matrix A.  N >= 0. */

 /*  A       (input/output) DOUBLE PRECISION array, dimension (LDA,N) */
 /*          The matrix to be multiplied by CTO/CFROM.  See TYPE for the */
 /*          storage type. */

 /*  LDA     (input) INTEGER */
 /*          The leading dimension of the array A.  LDA >= max(1,M). */

 /*  INFO    (output) INTEGER */
 /*          0  - successful exit */
 /*          <0 - if INFO = -i, the i-th argument had an illegal value. */

 /*  ===================================================================== */

 /*     .. Parameters .. */
 /*     .. */
 /*     .. Local Scalars .. */
 /*     .. */
 /*     .. External Functions .. */
 /*     .. */
 /*     .. Intrinsic Functions .. */
 /*     .. */
 /*     .. External Subroutines .. */
 /*     .. */
 /*     .. Executable Statements .. */

 /*     Test the input arguments */

     /* Parameter adjustments */
     a_dim1 = *lda;
     a_offset = 1 + a_dim1;
     a -= a_offset;

     /* Function Body */
     *info = 0;

     if (lsame_(type__, (char*)"G", 1, 1)) {
 	itype = 0;
     } else if (lsame_(type__, (char*)"L", 1, 1)) {
 	itype = 1;
     } else if (lsame_(type__, (char*)"U", 1, 1)) {
 	itype = 2;
     } else if (lsame_(type__, (char*)"H", 1, 1)) {
 	itype = 3;
     } else if (lsame_(type__, (char*)"B", 1, 1)) {
 	itype = 4;
     } else if (lsame_(type__, (char*)"Q", 1, 1)) {
 	itype = 5;
     } else if (lsame_(type__, (char*)"Z", 1, 1)) {
 	itype = 6;
     } else {
 	itype = -1;
     }

     if (itype == -1) {
 	*info = -1;
     } else if (*cfrom == 0. || disnan_(cfrom)) {
 	*info = -4;
     } else if (disnan_(cto)) {
 	*info = -5;
     } else if (*m < 0) {
 	*info = -6;
     } else if (*n < 0 || itype == 4 && *n != *m || itype == 5 && *n != *m) {
 	*info = -7;
     } else if (itype <= 3 && *lda < max(1,*m)) {
 	*info = -9;
     } else if (itype >= 4) {
 /* Computing MAX */
 	i__1 = *m - 1;
 	if (*kl < 0 || *kl > max(i__1,0)) {
 	    *info = -2;
 	} else /* if(complicated condition) */ {
 /* Computing MAX */
 	    i__1 = *n - 1;
 	    if (*ku < 0 || *ku > max(i__1,0) || (itype == 4 || itype == 5) &&
 		    *kl != *ku) {
 		*info = -3;
 	    } else if (itype == 4 && *lda < *kl + 1 || itype == 5 && *lda < *
 		    ku + 1 || itype == 6 && *lda < (*kl << 1) + *ku + 1) {
 		*info = -9;
 	    }
 	}
     }

     if (*info != 0) {
 	i__1 = -(*info);
 	xerbla_("DLASCL", &i__1, 0);
 	return 0;
     }

 /*     Quick return if possible */

     if (*n == 0 || *m == 0) {
 	return 0;
     }

 /*     Get machine parameters */

     smlnum = 0.0001; //dlamch_("S");
     bignum = 1. / smlnum;

     cfromc = *cfrom;
     ctoc = *cto;

 L10:
     cfrom1 = cfromc * smlnum;
     if (cfrom1 == cfromc) {
 /*        CFROMC is an inf.  Multiply by a correctly signed zero for */
 /*        finite CTOC, or a NaN if CTOC is infinite. */
 	mul = ctoc / cfromc;
 	done = TRUE_;
 	cto1 = ctoc;
     } else {
 	cto1 = ctoc / bignum;
 	if (cto1 == ctoc) {
 /*           CTOC is either 0 or an inf.  In both cases, CTOC itself */
 /*           serves as the correct multiplication factor. */
 	    mul = ctoc;
 	    done = TRUE_;
 	    cfromc = 1.;
 	} else if (abs(cfrom1) > abs(ctoc) && ctoc != 0.) {
 	    mul = smlnum;
 	    done = FALSE_;
 	    cfromc = cfrom1;
 	} else if (abs(cto1) > abs(cfromc)) {
 	    mul = bignum;
 	    done = FALSE_;
 	    ctoc = cto1;
 	} else {
 	    mul = ctoc / cfromc;
 	    done = TRUE_;
 	}
     }

     if (itype == 0) {

 /*        Full matrix */

 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 	    i__2 = *m;
 	    for (i__ = 1; i__ <= i__2; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L20: */
 	    }
 /* L30: */
 	}

     } else if (itype == 1) {

 /*        Lower triangular matrix */

 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 	    i__2 = *m;
 	    for (i__ = j; i__ <= i__2; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L40: */
 	    }
 /* L50: */
 	}

     } else if (itype == 2) {

 /*        Upper triangular matrix */

 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 	    i__2 = min(j,*m);
 	    for (i__ = 1; i__ <= i__2; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L60: */
 	    }
 /* L70: */
 	}

     } else if (itype == 3) {

 /*        Upper Hessenberg matrix */

 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 /* Computing MIN */
 	    i__3 = j + 1;
 	    i__2 = min(i__3,*m);
 	    for (i__ = 1; i__ <= i__2; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L80: */
 	    }
 /* L90: */
 	}

     } else if (itype == 4) {

 /*        Lower half of a symmetric band matrix */

 	k3 = *kl + 1;
 	k4 = *n + 1;
 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 /* Computing MIN */
 	    i__3 = k3, i__4 = k4 - j;
 	    i__2 = min(i__3,i__4);
 	    for (i__ = 1; i__ <= i__2; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L100: */
 	    }
 /* L110: */
 	}

     } else if (itype == 5) {

 /*        Upper half of a symmetric band matrix */

 	k1 = *ku + 2;
 	k3 = *ku + 1;
 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 /* Computing MAX */
 	    i__2 = k1 - j;
 	    i__3 = k3;
 	    for (i__ = max(i__2,1); i__ <= i__3; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L120: */
 	    }
 /* L130: */
 	}

     } else if (itype == 6) {

 /*        Band matrix */

 	k1 = *kl + *ku + 2;
 	k2 = *kl + 1;
 	k3 = (*kl << 1) + *ku + 1;
 	k4 = *kl + *ku + 1 + *m;
 	i__1 = *n;
 	for (j = 1; j <= i__1; ++j) {
 /* Computing MAX */
 	    i__3 = k1 - j;
 /* Computing MIN */
 	    i__4 = k3, i__5 = k4 - j;
 	    i__2 = min(i__4,i__5);
 	    for (i__ = max(i__3,k2); i__ <= i__2; ++i__) {
 		a[i__ + j * a_dim1] *= mul;
 /* L140: */
 	    }
 /* L150: */
 	}

     }

     if (! done) {
 	goto L10;
     }

     return 0;

 /*     End of DLASCL */

 } /* dlascl_ */

 __attribute__((noinline))

 doublereal ddot_(integer *n, doublereal *dx, integer *incx, doublereal *dy,
 	integer *incy)
 {
     /* System generated locals */
     integer i__1;
     doublereal ret_val;

     /* Local variables */
     integer i__, m, ix, iy, mp1;
     doublereal dtemp;

 /*     .. Scalar Arguments .. */
 /*     .. */
 /*     .. Array Arguments .. */
 /*     .. */

 /*  Purpose */
 /*  ======= */

 /*     forms the dot product of two vectors. */
 /*     uses unrolled loops for increments equal to one. */
 /*     jack dongarra, linpack, 3/11/78. */
 /*     modified 12/3/93, array(1) declarations changed to array(*) */


 /*     .. Local Scalars .. */
 /*     .. */
 /*     .. Intrinsic Functions .. */
 /*     .. */
     /* Parameter adjustments */
     --dy;
     --dx;

     /* Function Body */
     ret_val = 0.;
     dtemp = 0.;
     if (*n <= 0) {
 	return ret_val;
     }
     if (*incx == 1 && *incy == 1) {
 	goto L20;
     }

 /*        code for unequal increments or equal increments */
 /*          not equal to 1 */

     ix = 1;
     iy = 1;
     if (*incx < 0) {
 	ix = (-(*n) + 1) * *incx + 1;
     }
     if (*incy < 0) {
 	iy = (-(*n) + 1) * *incy + 1;
     }
     i__1 = *n;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	dtemp += dx[ix] * dy[iy];
 	ix += *incx;
 	iy += *incy;
 /* L10: */
     }
     ret_val = dtemp;
     return ret_val;

 /*        code for both increments equal to 1 */


 /*        clean-up loop */

 L20:
     m = *n % 5;
     if (m == 0) {
 	goto L40;
     }
     i__1 = m;
     for (i__ = 1; i__ <= i__1; ++i__) {
 	dtemp += dx[i__] * dy[i__];
 /* L30: */
     }
     if (*n < 5) {
 	goto L60;
     }
 L40:
     mp1 = m + 1;
     i__1 = *n;
     for (i__ = mp1; i__ <= i__1; i__ += 5) {
 	dtemp = dtemp + dx[i__] * dy[i__] + dx[i__ + 1] * dy[i__ + 1] + dx[
 		i__ + 2] * dy[i__ + 2] + dx[i__ + 3] * dy[i__ + 3] + dx[i__ +
 		4] * dy[i__ + 4];
 /* L50: */
     }
 L60:
     ret_val = dtemp;
     return ret_val;
 } /* ddot_ */

 __attribute__((noinline))
 /* Subroutine */ int dgemm_(const char *transa_t, const char *transb_t, const integer *m, const integer *
 	n, const integer *k, const doublereal *alpha, const doublereal *a, const integer *lda,
 	const doublereal *b, const integer *ldb, const doublereal *beta, doublereal *c, const integer
 	*ldc)
 {

     char transa_v = *transa_t;
     char* transa = &transa_v;

     char transb_v = *transb_t;
     char* transb = &transb_v;

     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2,
 	    i__3;

     /* Local variables */
     integer info;
     logical nota, notb;
     doublereal temp;
     integer i, j, l, ncola;
     integer nrowa, nrowb;


 /*  Purpose
     =======

     DGEMM  performs one of the matrix-matrix operations

        C := alpha*op( A )*op( B ) + beta*C,

     where  op( X ) is one of

        op( X ) = X   or   op( X ) = X',

     alpha and beta are scalars, and A, B and C are matrices, with op( A )

     an m by k matrix,  op( B )  a  k by n matrix and  C an m by n matrix.


     Parameters
     ==========

     TRANSA - CHARACTER*1.
              On entry, TRANSA specifies the form of op( A ) to be used in

              the matrix multiplication as follows:

                 TRANSA = 'N' or 'n',  op( A ) = A.

                 TRANSA = 'T' or 't',  op( A ) = A'.

                 TRANSA = 'C' or 'c',  op( A ) = A'.

              Unchanged on exit.

     TRANSB - CHARACTER*1.
              On entry, TRANSB specifies the form of op( B ) to be used in

              the matrix multiplication as follows:

                 TRANSB = 'N' or 'n',  op( B ) = B.

                 TRANSB = 'T' or 't',  op( B ) = B'.

                 TRANSB = 'C' or 'c',  op( B ) = B'.

              Unchanged on exit.

     M      - INTEGER.
              On entry,  M  specifies  the number  of rows  of the  matrix

              op( A )  and of the  matrix  C.  M  must  be at least  zero.

              Unchanged on exit.

     N      - INTEGER.
              On entry,  N  specifies the number  of columns of the matrix

              op( B ) and the number of columns of the matrix C. N must be

              at least zero.
              Unchanged on exit.

     K      - INTEGER.
              On entry,  K  specifies  the number of columns of the matrix

              op( A ) and the number of rows of the matrix op( B ). K must

              be at least  zero.
              Unchanged on exit.

     ALPHA  - DOUBLE PRECISION.
              On entry, ALPHA specifies the scalar alpha.
              Unchanged on exit.

     A      - DOUBLE PRECISION array of DIMENSION ( LDA, ka ), where ka is

              k  when  TRANSA = 'N' or 'n',  and is  m  otherwise.
              Before entry with  TRANSA = 'N' or 'n',  the leading  m by k

              part of the array  A  must contain the matrix  A,  otherwise

              the leading  k by m  part of the array  A  must contain  the

              matrix A.
              Unchanged on exit.

     LDA    - INTEGER.
              On entry, LDA specifies the first dimension of A as declared

              in the calling (sub) program. When  TRANSA = 'N' or 'n' then

              LDA must be at least  max( 1, m ), otherwise  LDA must be at

              least  max( 1, k ).
              Unchanged on exit.

     B      - DOUBLE PRECISION array of DIMENSION ( LDB, kb ), where kb is

              n  when  TRANSB = 'N' or 'n',  and is  k  otherwise.
              Before entry with  TRANSB = 'N' or 'n',  the leading  k by n

              part of the array  B  must contain the matrix  B,  otherwise

              the leading  n by k  part of the array  B  must contain  the

              matrix B.
              Unchanged on exit.

     LDB    - INTEGER.
              On entry, LDB specifies the first dimension of B as declared

              in the calling (sub) program. When  TRANSB = 'N' or 'n' then

              LDB must be at least  max( 1, k ), otherwise  LDB must be at

              least  max( 1, n ).
              Unchanged on exit.

     BETA   - DOUBLE PRECISION.
              On entry,  BETA  specifies the scalar  beta.  When  BETA  is

              supplied as zero then C need not be set on input.
              Unchanged on exit.

     C      - DOUBLE PRECISION array of DIMENSION ( LDC, n ).
              Before entry, the leading  m by n  part of the array  C must

              contain the matrix  C,  except when  beta  is zero, in which

              case C need not be set on entry.
              On exit, the array  C  is overwritten by the  m by n  matrix

              ( alpha*op( A )*op( B ) + beta*C ).

     LDC    - INTEGER.
              On entry, LDC specifies the first dimension of C as declared

              in  the  calling  (sub)  program.   LDC  must  be  at  least

              max( 1, m ).
              Unchanged on exit.


     Level 3 Blas routine.

     -- Written on 8-February-1989.
        Jack Dongarra, Argonne National Laboratory.
        Iain Duff, AERE Harwell.
        Jeremy Du Croz, Numerical Algorithms Group Ltd.
        Sven Hammarling, Numerical Algorithms Group Ltd.


        Set  NOTA  and  NOTB  as  true if  A  and  B  respectively are not

        transposed and set  NROWA, NCOLA and  NROWB  as the number of rows

        and  columns of  A  and the  number of  rows  of  B  respectively.


    Parameter adjustments
        Function Body */

 #define A(I,J) a[(I)-1 + ((J)-1)* ( *lda)]
 #define B(I,J) b[(I)-1 + ((J)-1)* ( *ldb)]
 #define C(I,J) c[(I)-1 + ((J)-1)* ( *ldc)]

     nota = lsame_((char*)transa, (char*)"N", 1, 1);
     notb = lsame_((char*)transb, (char*)"N", 1, 1);
     if (nota) {
 	nrowa = *m;
 	ncola = *k;
     } else {
 	nrowa = *k;
 	ncola = *m;
     }
     if (notb) {
 	nrowb = *k;
     } else {
 	nrowb = *n;
     }

 /*     Test the input parameters. */

     info = 0;
     if (! nota && ! lsame_((char*)transa, (char*)"C", 1, 1) && ! lsame_((char*)transa, (char*)"T", 1, 1)) {
 	info = 1;
     } else if (! notb && ! lsame_((char*)transb, (char*)"C", 1, 1) && ! lsame_((char*)transb,
 	    (char*)"T", 1, 1)) {
 	info = 2;
     } else if (*m < 0) {
 	info = 3;
     } else if (*n < 0) {
 	info = 4;
     } else if (*k < 0) {
 	info = 5;
     } else if (*lda < max(1,nrowa)) {
 	info = 8;
     } else if (*ldb < max(1,nrowb)) {
 	info = 10;
     } else if (*ldc < max(1,*m)) {
 	info = 13;
     }
     if (info != 0) {
 	xerbla_("DGEMM ", &info, 0);
 	return 0;
     }

 /*     Quick return if possible. */

     if (*m == 0 || *n == 0 || (*alpha == 0. || *k == 0) && *beta == 1.) {
 	return 0;
     }

 /*     And if  alpha.eq.zero. */

     if (*alpha == 0.) {
 	if (*beta == 0.) {
 	    i__1 = *n;
 	    for (j = 1; j <= *n; ++j) {
 		i__2 = *m;
 		for (i = 1; i <= *m; ++i) {
 		    C(i,j) = 0.;
 /* L10: */
 		}
 /* L20: */
 	    }
 	} else {
 	    i__1 = *n;
 	    for (j = 1; j <= *n; ++j) {
 		i__2 = *m;
 		for (i = 1; i <= *m; ++i) {
 		    C(i,j) = *beta * C(i,j);
 /* L30: */
 		}
 /* L40: */
 	    }
 	}
 	return 0;
     }

 /*     Start the operations. */

     if (notb) {
 	if (nota) {

 /*           Form  C := alpha*A*B + beta*C. */

 	    i__1 = *n;
 	    for (j = 1; j <= *n; ++j) {
 		if (*beta == 0.) {
 		    i__2 = *m;
 		    for (i = 1; i <= *m; ++i) {
 			C(i,j) = 0.;
 /* L50: */
 		    }
 		} else if (*beta != 1.) {
 		    i__2 = *m;
 		    for (i = 1; i <= *m; ++i) {
 			C(i,j) = *beta * C(i,j);
 /* L60: */
 		    }
 		}
 		i__2 = *k;
 		for (l = 1; l <= *k; ++l) {
 		    if (B(l,j) != 0.) {
 			temp = *alpha * B(l,j);
 			i__3 = *m;
 			for (i = 1; i <= *m; ++i) {
 			    C(i,j) += temp * A(i,l);
 /* L70: */
 			}
 		    }
 /* L80: */
 		}
 /* L90: */
 	    }
 	} else {

 /*           Form  C := alpha*A'*B + beta*C */

 	    i__1 = *n;
 	    for (j = 1; j <= *n; ++j) {
 		i__2 = *m;
 		for (i = 1; i <= *m; ++i) {
 		    temp = 0.;
 		    i__3 = *k;
 		    for (l = 1; l <= *k; ++l) {
 			temp += A(l,i) * B(l,j);
 /* L100: */
 		    }
 		    if (*beta == 0.) {
 			C(i,j) = *alpha * temp;
 		    } else {
 			C(i,j) = *alpha * temp + *beta * C(i,j);
 		    }
 /* L110: */
 		}
 /* L120: */
 	    }
 	}
     } else {
 	if (nota) {

 /*           Form  C := alpha*A*B' + beta*C */

 	    i__1 = *n;
 	    for (j = 1; j <= *n; ++j) {
 		if (*beta == 0.) {
 		    i__2 = *m;
 		    for (i = 1; i <= *m; ++i) {
 			C(i,j) = 0.;
 /* L130: */
 		    }
 		} else if (*beta != 1.) {
 		    i__2 = *m;
 		    for (i = 1; i <= *m; ++i) {
 			C(i,j) = *beta * C(i,j);
 /* L140: */
 		    }
 		}
 		i__2 = *k;
 		for (l = 1; l <= *k; ++l) {
 		    if (B(j,l) != 0.) {
 			temp = *alpha * B(j,l);
 			i__3 = *m;
 			for (i = 1; i <= *m; ++i) {
 			    C(i,j) += temp * A(i,l);
 /* L150: */
 			}
 		    }
 /* L160: */
 		}
 /* L170: */
 	    }
 	} else {

 /*           Form  C := alpha*A'*B' + beta*C */

 	    i__1 = *n;
 	    for (j = 1; j <= *n; ++j) {
 		i__2 = *m;
 		for (i = 1; i <= *m; ++i) {
 		    temp = 0.;
 		    i__3 = *k;
 		    for (l = 1; l <= *k; ++l) {
 			temp += A(l,i) * B(j,l);
 /* L180: */
 		    }
 		    if (*beta == 0.) {
 			C(i,j) = *alpha * temp;
 		    } else {
 			C(i,j) = *alpha * temp + *beta * C(i,j);
 		    }
 /* L190: */
 		}
 /* L200: */
 	    }
 	}
     }

     return 0;
 #undef A
 #undef B
 #undef C
 /*     End of DGEMM . */

 } /* dgemm_ */

 #undef max
 #undef min
 #undef abs
 #ifdef __cplusplus
 }
 #endif