#include <stdio.h>
#include <stdlib.h>
#include <math.h>
/* Number of timed repetitions averaged for every (M, N, K) combination. */
#define NITER 10000

/*
 * One benchmark result: the problem dimensions, the GFlop/s figure derived
 * for each of the four timings reported by dgemm_smm_compare_, and the
 * SMM-to-BLAS GFlop/s ratio used as the sort key.
 */
typedef struct {
    int m;
    int n;
    int k;
    double gcode;
    double gmatmul;
    double gblas;
    double gsmm;
    double smm_vs_blas_ratio;
} caserec;

/* Three-way comparison of two ints, suitable as a qsort tie-breaker. */
static int compare_int(int lhs, int rhs)
{
    return (lhs > rhs) - (lhs < rhs);
}

/*
 * qsort comparator: orders caserec entries by smm_vs_blas_ratio, largest
 * first.
 *
 * The ratio is a quotient of measured quantities and can be NaN (for
 * instance inf/inf when both averaged timings are zero).  NaN compares
 * unordered with everything, which would break the total order qsort
 * requires, so NaN ratios are explicitly placed after all numeric ones.
 * Equal ratios are ordered by ascending (m, n, k) so the result is
 * deterministic.
 */
static int compare_ratio_desc(const void *lhs, const void *rhs)
{
    const caserec *a = lhs;
    const caserec *b = rhs;
    const double ra = a->smm_vs_blas_ratio;
    const double rb = b->smm_vs_blas_ratio;
    const int a_nan = (ra != ra);
    const int b_nan = (rb != rb);
    int order;

    if (a_nan != b_nan) {
        return a_nan - b_nan;
    }
    if (!a_nan && ra != rb) {
        return (ra < rb) ? 1 : -1;
    }

    order = compare_int(a->m, b->m);
    if (order == 0) {
        order = compare_int(a->n, b->n);
    }
    if (order == 0) {
        order = compare_int(a->k, b->k);
    }
    return order;
}

/*
 * Benchmark driver.
 *
 * For every (M, N, K) with each dimension in 1..NSIZES, calls the external
 * routine dgemm_smm_compare_ NITER times on zero-filled buffers, averages
 * the four timings it returns, converts them to GFlop/s (2*M*N*K flops per
 * call) and prints one table row.  Afterwards the results are sorted by the
 * SMM/BLAS ratio (largest first) and a summary is printed: how often each
 * variant was strictly the fastest, the cases where SMM beats all others by
 * more than 10 per cent, and the average GFlop/s of each variant.
 *
 * Command-line arguments are ignored.  Returns 0 on success and
 * EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char **argv)
{
    enum { NSIZES = 24 };
    const int ncases = NSIZES * NSIZES * NSIZES;
    int sizes[NSIZES];
    int ijk = 0;
    caserec *relperf;

    /* Defined outside this file; the trailing underscore suggests a Fortran
     * routine (assumption - confirm against the build).  The last four
     * arguments receive one timing each; they are treated as seconds by the
     * GFlop/s conversion below. */
    void dgemm_smm_compare_(int *m, int *n, int *k,
                            double *a, double *b, double *c,
                            double *t_code, double *t_matmul,
                            double *t_blas, double *t_smm);

    (void)argc;
    (void)argv;

    /* Heap-allocated: NSIZES^3 records is too much to place comfortably on
     * the stack. */
    relperf = calloc((size_t)ncases, sizeof *relperf);
    if (relperf == NULL) {
        fprintf(stderr, "error: cannot allocate %d result records\n", ncases);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < NSIZES; i++) {
        sizes[i] = i + 1;
    }

    printf("#Size          TimeCode TimeMatMUL   TimeBLAS    TimeSMM         relativeSpeed                          GFlops\n");
    printf("#  M  N  K                                                    SMM/BLAS    SMM     BLAS   MATMUL     Code      SMM     BLAS   MATMUL     Code \n");

    for (int i = 0; i < NSIZES; i++) {           /* Loop for M */
        for (int j = 0; j < NSIZES; j++) {       /* Loop for N */
            for (int k = 0; k < NSIZES; k++) {   /* Loop for K */
                double *a = calloc((size_t)sizes[i] * (size_t)sizes[k], sizeof *a);
                double *b = calloc((size_t)sizes[j] * (size_t)sizes[k], sizeof *b);
                double *c = calloc((size_t)sizes[i] * (size_t)sizes[j], sizeof *c);
                /* Accumulators are scoped to one case so that no residue of
                 * the previous case's average leaks into this one. */
                double tcode = 0.0, tmatmul = 0.0, tblas = 0.0, tsmm = 0.0;
                double stc, stm, stb, sts;
                double gflops[4];
                double work;

                if (a == NULL || b == NULL || c == NULL) {
                    fprintf(stderr, "error: cannot allocate matrices for M=%d N=%d K=%d\n",
                            sizes[i], sizes[j], sizes[k]);
                    free(a);
                    free(b);
                    free(c);
                    free(relperf);
                    return EXIT_FAILURE;
                }

                for (int iter = 0; iter < NITER; iter++) {
                    dgemm_smm_compare_(&sizes[i], &sizes[j], &sizes[k], a, b, c,
                                       &stc, &stm, &stb, &sts);
                    tcode   += stc;
                    tmatmul += stm;
                    tblas   += stb;
                    tsmm    += sts;
                }  /* End for over NITER */

                tcode   /= (double)NITER;
                tmatmul /= (double)NITER;
                tblas   /= (double)NITER;
                tsmm    /= (double)NITER;

                /* 2*M*N*K floating-point operations, scaled to Giga. */
                work = 2.0e-9 * ((double)sizes[i] * (double)sizes[j] * (double)sizes[k]);
                gflops[0] = work / tcode;
                gflops[1] = work / tmatmul;
                gflops[2] = work / tblas;
                gflops[3] = work / tsmm;

                relperf[ijk].smm_vs_blas_ratio = gflops[3] / gflops[2];
                relperf[ijk].m       = sizes[i];
                relperf[ijk].n       = sizes[j];
                relperf[ijk].k       = sizes[k];
                relperf[ijk].gcode   = gflops[0];
                relperf[ijk].gmatmul = gflops[1];
                relperf[ijk].gblas   = gflops[2];
                relperf[ijk].gsmm    = gflops[3];
                ijk++;

                printf(" %3d %3d %3d %10.6lf %10.6lf %10.6lf %10.6lf %8.4lf %8.3lf %8.3lf %8.3lf %8.3lf %8.3lf %8.3lf %8.3lf %8.3lf\n",
                       sizes[i], sizes[j], sizes[k], tcode, tmatmul, tblas, tsmm,
                       tblas / tsmm, tcode / tsmm, tcode / tblas, tcode / tmatmul,
                       tcode / tcode, gflops[3], gflops[2], gflops[1], gflops[0]);

                free(a);
                free(b);
                free(c);
            }  /* Loop over K */
            printf("\n");
        }      /* Loop over N */
        printf("\n");
    }          /* Loop over M */

    /* Largest SMM/BLAS ratio first; whole records move together. */
    qsort(relperf, (size_t)ncases, sizeof *relperf, compare_ratio_desc);

    printf ("#Summary\n");
    int nbettercode = 0;
    int nbettermatmul = 0;
    int nbetterblas = 0;
    int nbettersmm = 0;

    /* A variant only scores when it is strictly faster than all three
     * others, so exact ties count for nobody. */
    for (int i = 0; i < ncases; i++) {
        const caserec *r = &relperf[i];

        if (r->gcode > r->gmatmul && r->gcode > r->gblas && r->gcode > r->gsmm) {
            nbettercode++;
        }
        if (r->gmatmul > r->gcode && r->gmatmul > r->gblas && r->gmatmul > r->gsmm) {
            nbettermatmul++;
        }
        if (r->gblas > r->gcode && r->gblas > r->gmatmul && r->gblas > r->gsmm) {
            nbetterblas++;
        }
        if (r->gsmm > r->gcode && r->gsmm > r->gmatmul && r->gsmm > r->gblas) {
            nbettersmm++;
        }
    }

    printf("#In %5d attempted size combinations\n", ncases);
    printf("#Simple code   is faster in       %5d cases\n", nbettercode);
    printf("#MatMUL        is faster in       %5d cases\n", nbettermatmul);
    printf("#Selected BLAS is faster in       %5d cases\n", nbetterblas);
    printf("#SMM           is faster in       %5d cases\n", nbettersmm);

    printf("#Where SMM is at least 10 per cent faster, its performance ratio is :\n");
    for (int i = 0; i < ncases; i++) {
        const caserec *r = &relperf[i];

        if (r->gsmm > 1.10 * r->gcode &&
            r->gsmm > 1.10 * r->gmatmul &&
            r->gsmm > 1.10 * r->gblas) {
            printf ("#M=%3d N=%3d K=%3d GFlops(Code)=%6.3lf GFlops(matmul)=%6.3lf GFlops(BLAS)=%6.3lf GFlops(SMM)=%6.3lf Ratio(SMM/BLAS)=%6.3lf\n",
                    r->m, r->n, r->k, r->gcode, r->gmatmul, r->gblas, r->gsmm,
                    r->gsmm / r->gblas);
        }
    }

    double avgcode = 0.0;
    double avgmatmul = 0.0;
    double avgblas = 0.0;
    double avgsmm = 0.0;
    for (int i = 0; i < ncases; i++) {
        avgcode   += relperf[i].gcode;
        avgmatmul += relperf[i].gmatmul;
        avgblas   += relperf[i].gblas;
        avgsmm    += relperf[i].gsmm;
    }
    avgcode   /= (double)ncases;
    avgmatmul /= (double)ncases;
    avgblas   /= (double)ncases;
    avgsmm    /= (double)ncases;
    printf("#Average GFlops :\n");
    printf("#Code        : %6.2lf\n", avgcode);
    printf("#MatMUL      : %6.2lf\n", avgmatmul);
    printf("#BLAS        : %6.2lf\n", avgblas);
    printf("#SMM         : %6.2lf\n", avgsmm);

    free(relperf);
    return 0;
}

