Linpack port to ARM MCU from Arduino IDE and roylongbottom.org ARM based source

Dependencies:   mbed

Files at this revision

API Documentation at this revision

Comitter:
JovanEps
Date:
Wed Jan 18 23:35:49 2017 +0000
Parent:
8:66f6deeb2556
Child:
12:623ce727d4fa
Commit message:
RC Alpha

Changed in this revision

main.cpp Show annotated file Show diff for this revision Revisions of this file
--- a/main.cpp	Tue Jan 03 22:29:43 2017 +0000
+++ b/main.cpp	Wed Jan 18 23:35:49 2017 +0000
@@ -1,53 +1,1139 @@
 //********************************************************
-//** BETA---------------
-//**  Nucleo-144 Stm32F746 and Stm32F767 benchmark  ******
-//**  Limpack -port form Arduino IDE                 *****
-//**  Jovan Ivkovic - 2016                          ******
+//** RC Apha  --- based on roylongbottom.org ARM    ******
+//**  Nucleo-64-144 Stm32F103 to F767 benchmark     ******
+//**  Limpack -port form Arduino IDE                ******
+//**  Jovan Ivkovic - 2017                          ******
 //********************************************************
+#define GCCARMDP
+
+#define UNROLL
+#define DP
+
+#ifdef SP
+#define REAL float
+#define ZERO 0.0
+#define ONE 1.0
+#define PREC "Single"
+#endif
+
+#ifdef DP
+#define REAL double
+#define ZERO 0.0e0
+#define ONE 1.0e0
+#define PREC "Double"
+#endif
+
+#ifdef ROLL
+#define ROLLING "Rolled"
+#endif
+#ifdef UNROLL
+#define ROLLING "Unrolled"
+#endif
+
+// VERSION
+
+#ifdef CNNT
+#define options   "Non-optimised"
+#define opt "0"
+#else
+//    #define options   "Optimised"
+//    #define options   "Opt 3 32 Bit"
+#define options   "vfpv4 32 Bit"
+#define opt "1"
+#endif
+
+#define NTIMES 10
+
 #include "mbed.h"
 
-/* the following is optional depending on the timing function used */
-# include <stdlib.h>
-# include <stdio.h>
-# include <math.h>
+
+//---------------------------------
+DigitalOut myled(LED1);
+Serial pc(USBTX, USBRX); //USB is out of oreder on Embeded-Pi
+//Serial pc(PC_10,PC_11); //RX-TX D0,D1 Embeded-PI ports
+Timer timer;
+//--------------------------------
+
+void print_time (int row);
+void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma);
+void dgefa (REAL a[], int lda, int n, int ipvt[], int *info);
+void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job);
+void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]);
+void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy);
+REAL epslon (REAL x);
+int idamax (int n, REAL dx[], int incx);
+void dscal (int n, REAL da, REAL dx[], int incx);
+REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy);
+
+static REAL atime[9][15];
+double runSecs = 1;
+
+void do_benchmark(void){
+   
+    float t1,t2 = 0.0;
+    static REAL aa[20*20],a[20*21],b[20],x[20];
+    REAL cray,ops,total,norma,normx;
+    REAL resid,residn,eps,tm2,epsn,x1,x2;
+    REAL mflops;
+    static int ipvt[21],n,i,j,ntimes,info,lda,ldaa;
+    int endit, pass, loop;
+    REAL overhead1, overhead2, time2;
+    REAL max1, max2;
+    char was[5][20];
+    char expect[5][20];
+    char title[5][20];
+    int errors;
+    int         nopause = 1;
+
+   /*
+      if (argc > 1) {
+        switch (argv[1][0]) {
+            case 'N':
+                nopause = 0;
+                break;
+            case 'n':
+                nopause = 0;
+                break;
+        }
+     }
+    */
+    pc.printf("\n");
+
+
+    // outfile = fopen("Linpack.txt","a+");
+    // if (outfile == NULL)
+    // {
+    //    printf (" Cannot open results file \n\n");
+    //   printf(" Press Enter\n\n");
+    //    int g = getchar();
+    //    exit (0);
+    //}
+
+
+
+#ifdef GCCARMDP
+    pc.printf(expect[0], "             1.7");
+    pc.printf(expect[1], "  7.41628980e-14");
+    pc.printf(expect[2], "  2.22044605e-16");
+    pc.printf(expect[3], " -1.49880108e-14");
+    pc.printf(expect[4], " -1.89848137e-14");
+#endif
+
+#ifdef GCCARMSP
+    pc.printf(expect[0], "             1.6");
+    pc.printf(expect[1], "  3.80277634e-05");
+    pc.printf(expect[2], "  1.19209290e-07");
+    pc.printf(expect[3], " -1.38282776e-05");
+    pc.printf(expect[4], " -7.51018524e-06");
+#endif
+
+    lda = 21;
+    ldaa = 20;
+    cray = .056;
+    n = 20;
+    pc.printf("----------------------------------------------\n");
+    pc.printf("%s ", ROLLING);
+    pc.printf("%s ", PREC);
+    pc.printf("Precision Linpack Benchmark - Linux Version in 'C/C++'\n\n");
+
+    pc.printf("Optimisation %s\n\n",options);
+
+    ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n);
+
+    matgen(a,lda,n,b,&norma);
+    timer.start();
+    //start_time();
+    t1 = timer.read();
+    dgefa(a,lda,n,ipvt,&info);
+    //end_time();
+    t2 = timer.read();
+    timer.stop();
+    atime[0][0] = t2-t1; //secs
+
+    timer.reset();
+    timer.start();
+    t1 = timer.read();
+    dgesl(a,lda,n,ipvt,b,0);
+    t2 = timer.read();
+    timer.stop();
+    atime[1][0] = t2-t1; //secs
+    total = atime[0][0] + atime[1][0];
+
+    /*     compute a residual to verify results.  */
+
+    for (i = 0; i < n; i++) {
+        x[i] = b[i];
+    }
+    matgen(a,lda,n,b,&norma);
+    for (i = 0; i < n; i++) {
+        b[i] = -b[i];
+    }
+    dmxpy(n,b,n,lda,x,a);
+    resid = 0.0;
+    normx = 0.0;
+    for (i = 0; i < n; i++) {
+        resid = (resid > fabs((double)b[i]))
+                ? resid : fabs((double)b[i]);
+        normx = (normx > fabs((double)x[i]))
+                ? normx : fabs((double)x[i]);
+    }
+    eps = epslon(ONE);
+    residn = resid/( n*norma*normx*eps );
+    epsn = eps;
+    x1 = x[0] - 1;
+    x2 = x[n-1] - 1;
+
+    pc.printf("norm resid      resid           machep");
+    pc.printf("         x[0]-1          x[n-1]-1\n");
+    pc.printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n",
+              (double)residn, (double)resid, (double)epsn,
+              (double)x1, (double)x2);
+
+    pc.printf("Times are reported for matrices of order        %5d\n",n);
+    pc.printf("1 pass times for array with leading dimension of%5d\n\n",lda);
+    pc.printf("      dgefa      dgesl      total     Mflops       unit");
+    pc.printf("      ratio\n");
+
+    atime[2][0] = total;
+    if (total > 0.0) {
+        atime[3][0] = ops/(1.0e6*total);
+        atime[4][0] = 2.0/atime[3][0];
+    } else {
+        atime[3][0] = 0.0;
+        atime[4][0] = 0.0;
+    }
+    atime[5][0] = total/cray;
+
+    print_time(0);
+
+    /************************************************************************
+     *       Calculate overhead of executing matgen procedure              *
+     ************************************************************************/
+
+    pc.printf ("\nCalculating matgen overhead\n");
+    pass = -20;
+    loop = NTIMES;
+    do {
+        timer.start();
+        t1 = timer.read();
+        pass = pass + 1;
+        for ( i = 0 ; i < loop ; i++) {
+            matgen(a,lda,n,b,&norma);
+        }
+        t2 = timer.read();
+        timer.stop();
+        overhead1 = t2-t1; //sec
+        pc.printf ("%10d times %6.2f seconds\n", loop, overhead1);
+        if (overhead1 > runSecs) {
+            pass = 0;
+        }
+        if (pass < 0) {
+            if (overhead1 < 0.1) {
+                loop = loop * 10;
+            } else {
+                loop = loop * 2;
+            }
+        }
+    } while (pass < 0);
+
+    overhead1 = overhead1 / (double)loop;
+
+    pc.printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead1);
+
+    /************************************************************************
+     *           Calculate matgen/dgefa passes for runSecs seconds                *
+     ************************************************************************/
+
+    pc.printf ("Calculating matgen/dgefa passes for %d seconds\n", (int)runSecs);
+    pass = -20;
+    ntimes = NTIMES;
+    do {
+        timer.start();
+        t1 = timer.read();
+        pass = pass + 1;
+        for ( i = 0 ; i < ntimes ; i++) {
+            matgen(a,lda,n,b,&norma);
+            dgefa(a,lda,n,ipvt,&info );
+        }
+        t2 = timer.read();
+        timer.stop();
+        time2 = t2-t1; //sec
+
+        pc.printf ("%10d times %6.2f seconds\n", ntimes, time2);
+        if (time2 > runSecs) {
+            pass = 0;
+        }
+        if (pass < 0) {
+            if (time2 < 0.1) {
+                ntimes = ntimes * 10;
+            } else {
+                ntimes = ntimes * 2;
+            }
+        }
+    } while (pass < 0);
+
+    ntimes =  (int)(runSecs * (double)ntimes / time2);
+    if (ntimes == 0) ntimes = 1;
+
+    pc.printf ("Passes used %10d \n\n", ntimes);
+    pc.printf("Times for array with leading dimension of%4d\n\n",lda);
+    pc.printf("      dgefa      dgesl      total     Mflops       unit");
+    pc.printf("      ratio\n");
+
+    /************************************************************************
+     *                              Execute 5 passes                        *
+     ************************************************************************/
+
+    tm2 = ntimes * overhead1;
+    atime[3][6] = 0;
+
+    for (j=1 ; j<6 ; j++) {
+        timer.start();
+        t1 = timer.read();
+
+        for (i = 0; i < ntimes; i++) {
+            matgen(a,lda,n,b,&norma);
+            dgefa(a,lda,n,ipvt,&info );
+        }
+        t2 = timer.read();
+        timer.stop();
+        atime[0][j] = ((t2-t1) - tm2)/ntimes;
+
+        timer.start();
+        t1 = timer.read();
+        for (i = 0; i < ntimes; i++) {
+            dgesl(a,lda,n,ipvt,b,0);
+        }
+        t2 = timer.read();
+        timer.stop();
+
+        atime[1][j] = (t2-t1)/ntimes;
+        total       = atime[0][j] + atime[1][j];
+        atime[2][j] = total;
+        atime[3][j] = ops/(1.0e6*total);
+        atime[4][j] = 2.0/atime[3][j];
+        atime[5][j] = total/cray;
+        atime[3][6] = atime[3][6] + atime[3][j];
 
-DigitalOut myled(LED1);
-Serial pc(USBTX, USBRX);
-Timer timer;
+        print_time(j);
+    }
+    atime[3][6] = atime[3][6] / 5.0;
+    pc.printf("Average                          %11.2f\n",
+              (double)atime[3][6]);
+
+    pc.printf("\nCalculating matgen2 overhead\n");
+
+    /************************************************************************
+     *             Calculate overhead of executing matgen procedure         *
+     ************************************************************************/
+
+    timer.start();
+    t1 = timer.read();
+    for ( i = 0 ; i < loop ; i++) {
+        matgen(aa,ldaa,n,b,&norma);
+    }
+    t2 = timer.read();
+    timer.stop();
+    overhead2 = t2-t1;
+    overhead2 = overhead2 / (double)loop;
+
+    pc.printf("Overhead for 1 matgen %12.5f seconds\n\n", overhead2);
+    pc.printf("Times for array with leading dimension of%4d\n\n",ldaa);
+    pc.printf("      dgefa      dgesl      total     Mflops       unit");
+    pc.printf("      ratio\n");
+
+    /************************************************************************
+     *                              Execute 5 passes                        *
+     ************************************************************************/
+
+    tm2 = ntimes * overhead2;
+    atime[3][12] = 0;
+
+    for (j=7 ; j<12 ; j++) {
+        timer.start();
+        t1 = timer.read();
+        for (i = 0; i < ntimes; i++) {
+            matgen(aa,ldaa,n,b,&norma);
+            dgefa(aa,ldaa,n,ipvt,&info  );
+        }
+        t2 = timer.read();
+        timer.stop();
+        atime[0][j] = ((t2-t1) - tm2)/ntimes;
+
+        timer.start();
+        t1 = timer.read();
+        for (i = 0; i < ntimes; i++) {
+            dgesl(aa,ldaa,n,ipvt,b,0);
+        }
+        t2 = timer.read();
+        timer.stop();
+        atime[1][j] = (t2-t1)/ntimes;
+        total       = atime[0][j] + atime[1][j];
+        atime[2][j] = total;
+        atime[3][j] = ops/(1.0e6*total);
+        atime[4][j] = 2.0/atime[3][j];
+        atime[5][j] = total/cray;
+        atime[3][12] = atime[3][12] + atime[3][j];
+
+        print_time(j);
+    }
+    atime[3][12] = atime[3][12] / 5.0;
+    pc.printf("Average                          %11.2f\n\n",
+              (double)atime[3][12]);
+    pc.printf("##########################################\n");
+    pc.printf ("\nFrom File /proc/cpuinfo\n");
+    //  pc.printf("%s\n", configdata[0]);
+    pc.printf ("\nFrom File /proc/version\n");
+    //pc.printf("%s\n", configdata[1]);
+
+    /************************************************************************
+     *           Use minimum average as overall Mflops rating               *
+     ************************************************************************/
+
+    mflops = atime[3][6];
+    if (atime[3][12] < mflops) mflops = atime[3][12];
+
+    pc.printf("\n");
+    pc.printf("%s ", ROLLING);
+    pc.printf("%s ", PREC);
+    pc.printf(" Precision %11.4f Mflops \n\n",mflops);
+
+    // local_time();
+
+
+    /************************************************************************
+     *              Add results to output file Linpack.txt                  *
+     ************************************************************************/
+    pc.printf (" ########################################################\n\n");
+    pc.printf (" Linpack %s Precision %s Benchmark n @ 100\n", PREC, ROLLING);
+    //pc.printf (outfile, " Optimisation %s, %s\n", options, timeday);
+
+    max1 = 0;
+    for (i=1 ; i<6 ; i++) {
+        if (atime[3][i] > max1) max1 = atime[3][i];
+    }
+
+    max2 = 0;
+    for (i=7 ; i<12 ; i++) {
+        if (atime[3][i] > max2) max2 = atime[3][i];
+    }
+    if (max1 < max2) max2 = max1;
+
+    pc.printf(" Speed %10.4f MFLOPS\n\n", max2);
+    pc.printf(was[0], "%16.1f",(double)residn);
+    pc.printf(was[1], "%16.8e",(double)resid);
+    pc.printf(was[2], "%16.8e",(double)epsn);
+    pc.printf(was[3], "%16.8e",(double)x1);
+    pc.printf(was[4], "%16.8e",(double)x2);
+
+    pc.printf(title[0], "norm. resid");
+    pc.printf(title[1], "resid      ");
+    pc.printf(title[2], "machep     ");
+    pc.printf(title[3], "x[0]-1     ");
+    pc.printf(title[4], "x[n-1]-1   ");
+
+    if (strtol(opt, NULL, 10) == 0) {
+        pc.printf(expect[2], " 8.88178420e-016");
+    }
+    errors = 0;
+
+    for (i=0; i<5; i++) {
+        if (strcmp (expect[i], was[i]) != 0) {
+            pc.printf(" Variable %s Non-standard result was %s instead of %s\n",
+                      title[i], was[i], expect[i]);
+            errors = errors + 1;
+        }
+    }
+    if (errors == 0) {
+        pc.printf(" Numeric results were as expected\n\n");
+    } else {
+        pc.printf(" Different numeric results - see linpack.txt\n\n");
+        pc.printf("\n Compiler #define or values in linpack.c might need to be changed\n\n");
+
+    }
+
+
+    pc.printf (" ########################################################\n\n");
+    pc.printf("\n");
+    pc.printf ("SYSTEM INFORMATION\n\nFrom File /proc/cpuinfo\n");
+    // pc.printf (outfile, "%s \n", configdata[0]);
+    pc.printf ("\nFrom File /proc/version\n");
+    //  pc.printf (outfile, "%s \n", configdata[1]);
+    pc.printf ("\n");
+
+    pc.printf("\n");
+
+}
+
+/*----------------------*/
+void print_time (int row)
+
+{
+    pc.printf("%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n",   (double)atime[0][row],
+              (double)atime[1][row], (double)atime[2][row], (double)atime[3][row],
+              (double)atime[4][row], (double)atime[5][row]);
+    return;
+}
+
+/*----------------------*/
+
+void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma)
+
+
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+
+{
+    int init, i, j;
+
+    init = 1325;
+    *norma = 0.0;
+    for (j = 0; j < n; j++) {
+        for (i = 0; i < n; i++) {
+            init = 3125*init % 65536;
+            a[lda*j+i] = (init - 32768.0)/16384.0;
+            *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma;
+
+            /* alternative for some compilers
+            if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]);
+            */
+        }
+    }
+    for (i = 0; i < n; i++) {
+        b[i] = 0.0;
+    }
+    for (j = 0; j < n; j++) {
+        for (i = 0; i < n; i++) {
+            b[i] = b[i] + a[lda*j+i];
+        }
+    }
+    return;
+}
+
+/*----------------------*/
+void dgefa(REAL a[], int lda, int n, int ipvt[], int *info)
+
+
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+/*
+     dgefa factors a double precision matrix by gaussian elimination.
+
+     dgefa is usually called by dgeco, but it can be called
+     directly with a saving in time if  rcond  is not needed.
+     (time for dgeco) = (1 + 9/n)*(time for dgefa) .
+
+     on entry
+
+        a       REAL precision[n][lda]
+                the matrix to be factored.
+
+        lda     integer
+                the leading dimension of the array  a .
+
+        n       integer
+                the order of the matrix  a .
+
+     on return
+
+        a       an upper triangular matrix and the multipliers
+                which were used to obtain it.
+                the factorization can be written  a = l*u  where
+                l  is a product of permutation and unit lower
+                triangular matrices and  u  is upper triangular.
+
+        ipvt    integer[n]
+                an integer vector of pivot indices.
+
+        info    integer
+                = 0  normal value.
+                = k  if  u[k][k] .eq. 0.0 .  this is not an error
+                     condition for this subroutine, but it does
+                     indicate that dgesl or dgedi will divide by zero
+                     if called.  use  rcond  in dgeco for a reliable
+                     indication of singularity.
+
+     linpack. this version dated 08/14/78 .
+     cleve moler, university of new mexico, argonne national lab.
+
+     functions
+
+     blas daxpy,dscal,idamax
+*/
+
+{
+    /*     internal variables       */
+
+    REAL t;
+    int j,k,kp1,l,nm1;
+
+
+    /*     gaussian elimination with partial pivoting       */
+
+    *info = 0;
+    nm1 = n - 1;
+    if (nm1 >=  0) {
+        for (k = 0; k < nm1; k++) {
+            kp1 = k + 1;
+
+            /* find l = pivot index */
+
+            l = idamax(n-k,&a[lda*k+k],1) + k;
+            ipvt[k] = l;
+
+            /* zero pivot implies this column already
+               triangularized */
+
+            if (a[lda*k+l] != ZERO) {
+
+                /* interchange if necessary */
 
-int do_benchmark( void );
-//double cpu_time( void );
-void daxpy( int n, double da, double dx[], int incx, double dy[], int incy );
-double ddot( int n, double dx[], int incx, double dy[], int incy );
-int dgefa( double a[], int lda, int n, int ipvt[] );
-void dgesl( double a[], int lda, int n, int ipvt[], double b[], int job );
-void dscal( int n, double sa, double x[], int incx );
-int idamax( int n, double dx[], int incx );
-double r8_abs( double x );
-double r8_epsilon( void );
-double r8_max( double x, double y );
-double r8_random(int iseed[4] );
-double *r8mat_gen ( int lda, int n );
+                if (l != k) {
+                    t = a[lda*k+l];
+                    a[lda*k+l] = a[lda*k+k];
+                    a[lda*k+k] = t;
+                }
+
+                /* compute multipliers */
+
+                t = -ONE/a[lda*k+k];
+                dscal(n-(k+1),t,&a[lda*k+k+1],1);
+
+                /* row elimination with column indexing */
+
+                for (j = kp1; j < n; j++) {
+                    t = a[lda*j+l];
+                    if (l != k) {
+                        a[lda*j+l] = a[lda*j+k];
+                        a[lda*j+k] = t;
+                    }
+                    daxpy(n-(k+1),t,&a[lda*k+k+1],1,
+                          &a[lda*j+k+1],1);
+                }
+            } else {
+                *info = k;
+            }
+        }
+    }
+    ipvt[n-1] = n-1;
+    if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1;
+    return;
+}
+
+/*----------------------*/
+
+void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job )
+
+
+/* We would like to declare a[][lda], but c does not allow it.  In this
+function, references to a[i][j] are written a[lda*i+j].  */
+
+/*
+     dgesl solves the double precision system
+     a * x = b  or  trans(a) * x = b
+     using the factors computed by dgeco or dgefa.
+
+     on entry
+
+        a       double precision[n][lda]
+                the output from dgeco or dgefa.
+
+        lda     integer
+                the leading dimension of the array  a .
+
+        n       integer
+                the order of the matrix  a .
+
+        ipvt    integer[n]
+                the pivot vector from dgeco or dgefa.
+
+        b       double precision[n]
+                the right hand side vector.
+
+        job     integer
+                = 0         to solve  a*x = b ,
+                = nonzero   to solve  trans(a)*x = b  where
+                            trans(a)  is the transpose.
+
+    on return
+
+        b       the solution vector  x .
+
+     error condition
+
+        a division by zero will occur if the input factor contains a
+        zero on the diagonal.  technically this indicates singularity
+        but it is often caused by improper arguments or improper
+        setting of lda .  it will not occur if the subroutines are
+        called correctly and if dgeco has set rcond .gt. 0.0
+        or dgefa has set info .eq. 0 .
+
+     to compute  inverse(a) * c  where  c  is a matrix
+     with  p  columns
+           dgeco(a,lda,n,ipvt,rcond,z)
+           if (!rcond is too small){
+                for (j=0,j<p,j++)
+                        dgesl(a,lda,n,ipvt,c[j][0],0);
+           }
+
+     linpack. this version dated 08/14/78 .
+     cleve moler, university of new mexico, argonne national lab.
+
+     functions
+
+     blas daxpy,ddot
+*/
+{
+    /*     internal variables       */
+
+    REAL t;
+    int k,kb,l,nm1;
+
+    nm1 = n - 1;
+    if (job == 0) {
+
+        /* job = 0 , solve  a * x = b
+           first solve  l*y = b         */
+
+        if (nm1 >= 1) {
+            for (k = 0; k < nm1; k++) {
+                l = ipvt[k];
+                t = b[l];
+                if (l != k) {
+                    b[l] = b[k];
+                    b[k] = t;
+                }
+                daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 );
+            }
+        }
+
+        /* now solve  u*x = y */
+
+        for (kb = 0; kb < n; kb++) {
+            k = n - (kb + 1);
+            b[k] = b[k]/a[lda*k+k];
+            t = -b[k];
+            daxpy(k,t,&a[lda*k+0],1,&b[0],1 );
+        }
+    } else {
+
+        /* job = nonzero, solve  trans(a) * x = b
+           first solve  trans(u)*y = b                  */
+
+        for (k = 0; k < n; k++) {
+            t = ddot(k,&a[lda*k+0],1,&b[0],1);
+            b[k] = (b[k] - t)/a[lda*k+k];
+        }
+
+        /* now solve trans(l)*x = y     */
+
+        if (nm1 >= 1) {
+            for (kb = 1; kb < nm1; kb++) {
+                k = n - (kb+1);
+                b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1);
+                l = ipvt[k];
+                if (l != k) {
+                    t = b[l];
+                    b[l] = b[k];
+                    b[k] = t;
+                }
+            }
+        }
+    }
+    return;
+}
+
+/*----------------------*/
+
+void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy)
+/*
+     constant times a vector plus a vector.
+     jack dongarra, linpack, 3/11/78.
+*/
+
+{
+    int i,ix,iy,m,mp1;
+
+    mp1 = 0;
+    m = 0;
+
+    if(n <= 0) return;
+    if (da == ZERO) return;
+
+    if(incx != 1 || incy != 1) {
+
+        /* code for unequal increments or equal increments
+           not equal to 1                                       */
+
+        ix = 0;
+        iy = 0;
+        if(incx < 0) ix = (-n+1)*incx;
+        if(incy < 0)iy = (-n+1)*incy;
+        for (i = 0; i < n; i++) {
+            dy[iy] = dy[iy] + da*dx[ix];
+            ix = ix + incx;
+            iy = iy + incy;
+
+        }
+        return;
+    }
+
+    /* code for both increments equal to 1 */
+
+
+#ifdef ROLL
+
+    for (i = 0; i < n; i++) {
+        dy[i] = dy[i] + da*dx[i];
+    }
+
+
+#endif
+
+#ifdef UNROLL
+
+    m = n % 4;
+    if ( m != 0) {
+        for (i = 0; i < m; i++)
+            dy[i] = dy[i] + da*dx[i];
+
+        if (n < 4) return;
+    }
+    for (i = m; i < n; i = i + 4) {
+        dy[i] = dy[i] + da*dx[i];
+        dy[i+1] = dy[i+1] + da*dx[i+1];
+        dy[i+2] = dy[i+2] + da*dx[i+2];
+        dy[i+3] = dy[i+3] + da*dx[i+3];
+
+    }
+
+#endif
+    return;
+}
+
+/*----------------------*/
+
+REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy)
+/*
+     forms the dot product of two vectors.
+     jack dongarra, linpack, 3/11/78.
+*/
+
+{
+    REAL dtemp;
+    int i,ix,iy,m,mp1;
+
+    mp1 = 0;
+    m = 0;
+
+    dtemp = ZERO;
+
+    if(n <= 0) return(ZERO);
+
+    if(incx != 1 || incy != 1) {
+
+        /* code for unequal increments or equal increments
+           not equal to 1                                       */
+
+        ix = 0;
+        iy = 0;
+        if (incx < 0) ix = (-n+1)*incx;
+        if (incy < 0) iy = (-n+1)*incy;
+        for (i = 0; i < n; i++) {
+            dtemp = dtemp + dx[ix]*dy[iy];
+            ix = ix + incx;
+            iy = iy + incy;
+
+        }
+        return(dtemp);
+    }
+
+    /* code for both increments equal to 1 */
+
+
+#ifdef ROLL
+
+    for (i=0; i < n; i++)
+        dtemp = dtemp + dx[i]*dy[i];
+
+    return(dtemp);
+
+#endif
 
-//static FILE uartout = {0} ;
+#ifdef UNROLL
+
+
+    m = n % 5;
+    if (m != 0) {
+        for (i = 0; i < m; i++)
+            dtemp = dtemp + dx[i]*dy[i];
+        if (n < 5) return(dtemp);
+    }
+    for (i = m; i < n; i = i + 5) {
+        dtemp = dtemp + dx[i]*dy[i] +
+                dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] +
+                dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4];
+    }
+    return(dtemp);
+
+#endif
+
+}
+
+/*----------------------*/
+void dscal(int n, REAL da, REAL dx[], int incx)
+
+/*     scales a vector by a constant.
+      jack dongarra, linpack, 3/11/78.
+*/
+
+{
+    int i,m,mp1,nincx;
+
+    mp1 = 0;
+    m = 0;
+
+    if(n <= 0)return;
+    if(incx != 1) {
+
+        /* code for increment not equal to 1 */
+
+        nincx = n*incx;
+        for (i = 0; i < nincx; i = i + incx)
+            dx[i] = da*dx[i];
+
+        return;
+    }
+
+    /* code for increment equal to 1 */
+
+
+#ifdef ROLL
+
+    for (i = 0; i < n; i++)
+        dx[i] = da*dx[i];
+
+
+#endif
+
+#ifdef UNROLL
+
+
+    m = n % 5;
+    if (m != 0) {
+        for (i = 0; i < m; i++)
+            dx[i] = da*dx[i];
+        if (n < 5) return;
+    }
+    for (i = m; i < n; i = i + 5) {
+        dx[i] = da*dx[i];
+        dx[i+1] = da*dx[i+1];
+        dx[i+2] = da*dx[i+2];
+        dx[i+3] = da*dx[i+3];
+        dx[i+4] = da*dx[i+4];
+    }
+
+#endif
+
+}
+
+/*----------------------*/
+int idamax(int n, REAL dx[], int incx)
+
+/*
+     finds the index of element having max. absolute value.
+     jack dongarra, linpack, 3/11/78.
+*/
+
+
+{
+    REAL dmax;
+    int i, ix, itemp;
+
+    if( n < 1 ) return(-1);
+    if(n ==1 ) return(0);
+    if(incx != 1) {
+
+        /* code for increment not equal to 1 */
+
+        ix = 1;
+        dmax = fabs((double)dx[0]);
+        ix = ix + incx;
+        for (i = 1; i < n; i++) {
+            if(fabs((double)dx[ix]) > dmax)  {
+                itemp = i;
+                dmax = fabs((double)dx[ix]);
+            }
+            ix = ix + incx;
+        }
+    } else {
+
+        /* code for increment equal to 1 */
+
+        itemp = 0;
+        dmax = fabs((double)dx[0]);
+        for (i = 1; i < n; i++) {
+            if(fabs((double)dx[i]) > dmax) {
+                itemp = i;
+                dmax = fabs((double)dx[i]);
+            }
+        }
+    }
+    return (itemp);
+}
+
+/*----------------------*/
+REAL epslon (REAL x)
+
+/*
+     estimate unit roundoff in quantities of size x.
+*/
 
-//static int uart_putchar (char c, FILE *stream)
-//{
-//  Serial.write(c) ;
-//  return 0 ;
-//}
+{
+    REAL a,b,c,eps;
+    /*
+         this program should function properly on all systems
+         satisfying the following two assumptions,
+            1.  the base used in representing dfloating point
+                numbers is not a power of three.
+            2.  the quantity  a  in statement 10 is represented to
+                the accuracy used in dfloating point variables
+                that are stored in memory.
+         the statement number 10 and the go to 10 are intended to
+         force optimizing compilers to generate code satisfying
+         assumption 2.
+         under these assumptions, it should be true that,
+                a  is not exactly equal to four-thirds,
+                b  has a zero for its last bit or digit,
+                c  is not exactly equal to one,
+                eps  measures the separation of 1.0 from
+                     the next larger dfloating point number.
+         the developers of eispack would appreciate being informed
+         about any systems where these assumptions do not hold.
+
+         *****************************************************************
+         this routine is one of the auxiliary routines used by eispack iii
+         to avoid machine dependencies.
+         *****************************************************************
+
+         this version dated 4/6/83.
+    */
+
+    a = 4.0e0/3.0e0;
+    eps = ZERO;
+    while (eps == ZERO) {
+        b = a - ONE;
+        c = b + b + b;
+        eps = fabs((double)(c-ONE));
+    }
+    return(eps*fabs((double)x));
+}
+
+/*----------------------*/
+void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[])
+
+
+/* We would like to declare m[][ldm], but c does not allow it.  In this
+function, references to m[i][j] are written m[ldm*i+j].  */
+
+/*
+   purpose:
+     multiply matrix m times vector x and add the result to vector y.
+
+   parameters:
+
+     n1 integer, number of elements in vector y, and number of rows in
+         matrix m
+
+     y double [n1], vector of length n1 to which is added
+         the product m*x
+
+     n2 integer, number of elements in vector x, and number of columns
+         in matrix m
+
+     ldm integer, leading dimension of array m
+
+     x double [n2], vector of length n2
+
+     m double [ldm][n2], matrix of n1 rows and n2 columns
 
-//void setup() {
-//  Serial.begin(9600);
-//  fdev_setup_stream (&uartout, uart_putchar, NULL, _FDEV_SETUP_WRITE);
-//  stdout = &uartout ;
-//}
+ ----------------------------------------------------------------------
+*/
+{
+    int j,i,jmin;
+    /* cleanup odd vector */
+
+    j = n2 % 2;
+    if (j >= 1) {
+        j = j - 1;
+        for (i = 0; i < n1; i++)
+            y[i] = (y[i]) + x[j]*m[ldm*j+i];
+    }
+
+    /* cleanup odd group of two vectors */
+
+    j = n2 % 4;
+    if (j >= 2) {
+        j = j - 1;
+        for (i = 0; i < n1; i++)
+            y[i] = ( (y[i])
+                     + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
+    }
+
+    /* cleanup odd group of four vectors */
+
+    j = n2 % 8;
+    if (j >= 4) {
+        j = j - 1;
+        for (i = 0; i < n1; i++)
+            y[i] = ((( (y[i])
+                       + x[j-3]*m[ldm*(j-3)+i])
+                     + x[j-2]*m[ldm*(j-2)+i])
+                    + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i];
+    }
+
+    /* cleanup odd group of eight vectors */
 
+    j = n2 % 16;
+    if (j >= 8) {
+        j = j - 1;
+        for (i = 0; i < n1; i++)
+            y[i] = ((((((( (y[i])
+                           + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i])
+                        + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i])
+                      + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i])
+                    + x[j-1]*m[ldm*(j-1)+i]) + x[j]  *m[ldm*j+i];
+    }
+
+    /* main loop - groups of sixteen vectors */
+
+    jmin = (n2%16)+16;
+    for (j = jmin-1; j < n2; j = j + 16) {
+        for (i = 0; i < n1; i++)
+            y[i] = ((((((((((((((( (y[i])
+                                   + x[j-15]*m[ldm*(j-15)+i])
+                                 + x[j-14]*m[ldm*(j-14)+i])
+                                + x[j-13]*m[ldm*(j-13)+i])
+                               + x[j-12]*m[ldm*(j-12)+i])
+                              + x[j-11]*m[ldm*(j-11)+i])
+                             + x[j-10]*m[ldm*(j-10)+i])
+                            + x[j- 9]*m[ldm*(j- 9)+i])
+                           + x[j- 8]*m[ldm*(j- 8)+i])
+                          + x[j- 7]*m[ldm*(j- 7)+i])
+                         + x[j- 6]*m[ldm*(j- 6)+i])
+                        + x[j- 5]*m[ldm*(j- 5)+i])
+                       + x[j- 4]*m[ldm*(j- 4)+i])
+                      + x[j- 3]*m[ldm*(j- 3)+i])
+                     + x[j- 2]*m[ldm*(j- 2)+i])
+                    + x[j- 1]*m[ldm*(j- 1)+i])
+                   + x[j]   *m[ldm*j+i];
+    }
+    return;
+}
+//*----------------------------------------
 int main()
 {
-    pc.baud(115200);
-    //pc.baud(9600);
-
+    pc.baud(9600);
     while(1) {
 
         pc.printf("Starting benchmark...\n");
@@ -57,1079 +1143,3 @@
         pc.printf(" kraj \n\n");
     }
 }
-
-/******************************************************************************/
-
-int do_benchmark ( void )
-{
-
-/******************************************************************************/
-    /*
-      Purpose:
-
-        MAIN is the main program for LINPACK_BENCH.
-
-      Discussion:
-
-        LINPACK_BENCH drives the double precision LINPACK benchmark program.
-
-      Modified:
-
-        25 July 2008
-
-      Parameters:
-
-        N is the problem size.
-    */
-
-# define N 2
-# define LDA ( N + 1 )
-
-    //static double a[90];
-    static double *a;
-    static double a_max;
-    //static double b[9];
-    static double *b;
-    static double b_max;
-    const double cray = 0.056;
-    static double eps;
-    int i;
-    int info;
-    static int *ipvt;
-    int j;
-    int job;
-    double ops;
-    static double *resid;
-    double resid_max;
-    double residn;
-    static double *rhs;
-    double t1 = 0.0;
-    double t2 = 0.0;
-    static double time[6];
-    double total;
-    double *x;
-
-    pc.printf ( "\n" );
-    pc.printf ( "LINPACK_BENCH\n" );
-    pc.printf ( "  C version\n" );
-    pc.printf ( "\n" );
-    pc.printf ( "  The LINPACK benchmark.\n" );
-    pc.printf ( "  Language: C\n" );
-    pc.printf ( "  Datatype: Double precision real\n" );
-    pc.printf ( "  Matrix order N               = %d\n", N );
-    pc.printf ( "  Leading matrix dimension LDA = %d\n", LDA );
-          
-    // ops = ( double ) ( 2 * N * N * N ) / 3.0 + 2.0 * ( double ) ( N * N );
-    ops = ( double ) ( 2L * N * N * N ) / 3.0 + 2.0 * ( double ) ( (long)N * N ); // Arduino C
-
-    /*
-      Allocate space for arrays.
-    */
-    a = r8mat_gen ( LDA, N );
-    //r8mat_gen ( LDA, N, a);
-
-    a_max = 0.0;
-    for ( j = 0; j < N; j++ ) {
-        for ( i = 0; i < N; i++ ) {
-            a_max = r8_max ( a_max, a[i+j*LDA] );
-        }
-    }
-
-    for ( i = 0; i < N; i++ ) {
-        x[i] = 1.0;
-    }
-
-    for ( i = 0; i < N; i++ ) {
-        b[i] = 0.0;
-        for ( j = 0; j < N; j++ ) {
-            b[i] = b[i] + a[i+j*LDA] * x[j];
-        }
-    }
-
-    timer.start();
-
-    //*****************
-    t1 = ( double ) timer.read_us() / 1000000.0;
-
-    info = dgefa ( a, LDA, N, ipvt );
-
-    t2 = ( double ) timer.read_us() / 1000000.0;
-
-    if ( info != 0 ) {
-        pc.printf ( "\n" );
-        pc.printf ( "LINPACK_BENCH - Fatal error!\n" );
-        pc.printf ( "  The matrix A is apparently singular.\n" );
-        pc.printf ( "  Abnormal end of execution.\n" );
-        return 1;
-    }
-    time[0] = t2 - t1;
-
-
-    timer.reset();
-
-    //*********
-
-    t1 = ( double ) timer.read_us() / 1000000.0;
-
-    job = 0;
-    dgesl ( a, LDA, N, ipvt, b, job );
-
-    t2 = ( double ) timer.read_us() / 1000000.0;
-    time[1] = t2 - t1;
-
-    total = time[0] + time[1];
-
-    timer.stop();
-
-    //*********
-
-    /*
-      Compute a residual to verify results.
-    */
-    a = r8mat_gen ( LDA, N );
-    //r8mat_gen ( LDA, N, a);
-
-    for ( i = 0; i < N; i++ ) {
-        x[i] = 1.0;
-    }
-
-    for ( i = 0; i < N; i++ ) {
-        rhs[i] = 0.0;
-        for ( j = 0; j < N; j++ ) {
-            rhs[i] = rhs[i] + a[i+j*LDA] * x[j];
-        }
-    }
-
-    for ( i = 0; i < N; i++ ) {
-        resid[i] = -rhs[i];
-        for ( j = 0; j < N; j++ ) {
-            resid[i] = resid[i] + a[i+j*LDA] * b[j];
-        }
-    }
-
-    resid_max = 0.0;
-    for ( i = 0; i < N; i++ ) {
-        resid_max = r8_max ( resid_max, r8_abs ( resid[i] ) );
-    }
-
-    b_max = 0.0;
-    for ( i = 0; i < N; i++ ) {
-        b_max = r8_max ( b_max, r8_abs ( b[i] ) );
-    }
-
-    eps = r8_epsilon ( );
-
-    residn = resid_max / ( double ) N / a_max / b_max / eps;
-
-    time[2] = total;
-
-    time[3] = ( double ) ops / ( 1000000.0 * total );
-    /*
-    if ( total > 0.0 )
-     {
-           time[3] = ( double ) ops / ( 1000000.0 * total );
-     }
-    else
-     {
-         time[3] = -1.0;
-     }
-    */
-
-    time[4] = 2.0 / time[3];
-    time[5] = total / cray;
-
-    //pc.printf( " \n\n ");
-    pc.printf( "\n  Norm. Resid          Resid           MACHEP         X[1]          X[N]\n" );
-    //pc.printf( "\n    MACHEP         X[1]          X[N]\n" );
-    pc.printf(" %4.6f   ", residn);
-    pc.printf(" %4.6f    ", resid_max);
-    pc.printf(" %14e", eps);
-    pc.printf(" %14f", b[0]);
-    pc.printf(" %14f ",b[N-1]);
-    pc.printf("\n\n");
-    //pc.printf( "    %14f    %14f    %14e    %14f    %14f \n", residn, resid_max, eps, b[0], b[N-1] );
-
-    pc.printf( " \n\n ");
-    pc.printf( "      Factor         Solve         Total        MFLOPS         Unit         Cray-Ratio \n\n" );
-
-    for(int ii=0; ii<6; ii++) {
-        pc.printf("     %9f", time[ii]);
-    }
-      
-    //pc.printf( "  %9f  %9f  %9f  %9f  %9f  %9f\n", time[0], time[1], time[2], time[3], time[4], time[5] );
-    
-    /*
-      Terminate.
-      Free Mem
-    */
-
-    free ( a );
-    free ( b );
-    free ( ipvt );
-    free ( resid );
-    free ( rhs );
-    free ( x );
-
-    pc.printf( "\n" );
-    pc.printf( "LINPACK_BENCH\n" );
-    pc.printf( "  Normal end of execution.\n" );
-
-    pc.printf( "\n" );
-
-    return 0;
-# undef LDA
-# undef N
-}
-
-/******************************************************************************/
-
-//double cpu_time ( void )
-
-/******************************************************************************/
-/*
-  Purpose:
-
-    CPU_TIME returns the current reading on the CPU clock.
-
-  Discussion:
-
-    The CPU time measurements available through this routine are often
-    not very accurate.  In some cases, the accuracy is no better than
-    a hundredth of a second.
-
-  koristi mbed.Timer
-
-*/
-//{
-//    double vreme;
-
-//    vreme = timer.read_ms() / 1000;
-
-//    return vreme;
-//}
-/******************************************************************************/
-
-
-void daxpy ( int n, double da, double dx[], int incx, double dy[], int incy )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        DAXPY computes constant times a vector plus a vector.
-
-      Discussion:
-
-        This routine uses unrolled loops for increments equal to one.
-
-      Modified:
-
-        30 March 2007
-
-      Author:
-
-        FORTRAN77 original by Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart.
-        C version by John Burkardt
-
-      Reference:
-
-        Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-        LINPACK User's Guide,
-        SIAM, 1979.
-
-        Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-        Basic Linear Algebra Subprograms for Fortran Usage,
-        Algorithm 539,
-        ACM Transactions on Mathematical Software,
-        Volume 5, Number 3, September 1979, pages 308-323.
-
-      Parameters:
-
-        Input, int N, the number of elements in DX and DY.
-
-        Input, double DA, the multiplier of DX.
-
-        Input, double DX[*], the first vector.
-
-        Input, int INCX, the increment between successive entries of DX.
-
-        Input/output, double DY[*], the second vector.
-        On output, DY[*] has been replaced by DY[*] + DA * DX[*].
-
-        Input, int INCY, the increment between successive entries of DY.
-    */
-    int i;
-    int ix;
-    int iy;
-    int m;
-
-    if ( n <= 0 ) {
-        return;
-    }
-
-    if ( da == 0.0 ) {
-        return;
-    }
-    /*
-      Code for unequal increments or equal increments
-      not equal to 1.
-    */
-    if ( incx != 1 || incy != 1 ) {
-        if ( 0 <= incx ) {
-            ix = 0;
-        } else {
-            ix = ( - n + 1 ) * incx;
-        }
-
-        if ( 0 <= incy ) {
-            iy = 0;
-        } else {
-            iy = ( - n + 1 ) * incy;
-        }
-
-        for ( i = 0; i < n; i++ ) {
-            dy[iy] = dy[iy] + da * dx[ix];
-            ix = ix + incx;
-            iy = iy + incy;
-        }
-    }
-    /*
-      Code for both increments equal to 1.
-    */
-    else {
-        m = n % 4;
-
-        for ( i = 0; i < m; i++ ) {
-            dy[i] = dy[i] + da * dx[i];
-        }
-
-        for ( i = m; i < n; i = i + 4 ) {
-            dy[i  ] = dy[i  ] + da * dx[i  ];
-            dy[i+1] = dy[i+1] + da * dx[i+1];
-            dy[i+2] = dy[i+2] + da * dx[i+2];
-            dy[i+3] = dy[i+3] + da * dx[i+3];
-        }
-    }
-    return;
-}
-/******************************************************************************/
-
-double ddot ( int n, double dx[], int incx, double dy[], int incy )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        DDOT forms the dot product of two vectors.
-
-      Discussion:
-
-        This routine uses unrolled loops for increments equal to one.
-
-      Modified:
-
-        30 March 2007
-
-      Author:
-
-        FORTRAN77 original by Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart.
-        C version by John Burkardt
-
-      Reference:
-
-        Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-        LINPACK User's Guide,
-        SIAM, 1979.
-
-        Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-        Basic Linear Algebra Subprograms for Fortran Usage,
-        Algorithm 539,
-        ACM Transactions on Mathematical Software,
-        Volume 5, Number 3, September 1979, pages 308-323.
-
-      Parameters:
-
-        Input, int N, the number of entries in the vectors.
-
-        Input, double DX[*], the first vector.
-
-        Input, int INCX, the increment between successive entries in DX.
-
-        Input, double DY[*], the second vector.
-
-        Input, int INCY, the increment between successive entries in DY.
-
-        Output, double DDOT, the sum of the product of the corresponding
-        entries of DX and DY.
-    */
-
-    double dtemp;
-    int i;
-    int ix;
-    int iy;
-    int m;
-
-    dtemp = 0.0;
-
-    if ( n <= 0 ) {
-        return dtemp;
-    }
-    /*
-      Code for unequal increments or equal increments
-      not equal to 1.
-    */
-    if ( incx != 1 || incy != 1 ) {
-        if ( 0 <= incx ) {
-            ix = 0;
-        } else {
-            ix = ( - n + 1 ) * incx;
-        }
-
-        if ( 0 <= incy ) {
-            iy = 0;
-        } else {
-            iy = ( - n + 1 ) * incy;
-        }
-
-        for ( i = 0; i < n; i++ ) {
-            dtemp = dtemp + dx[ix] * dy[iy];
-            ix = ix + incx;
-            iy = iy + incy;
-        }
-    }
-    /*
-      Code for both increments equal to 1.
-    */
-    else {
-        m = n % 5;
-
-        for ( i = 0; i < m; i++ ) {
-            dtemp = dtemp + dx[i] * dy[i];
-        }
-
-        for ( i = m; i < n; i = i + 5 ) {
-            dtemp = dtemp + dx[i  ] * dy[i  ]
-                    + dx[i+1] * dy[i+1]
-                    + dx[i+2] * dy[i+2]
-                    + dx[i+3] * dy[i+3]
-                    + dx[i+4] * dy[i+4];
-        }
-    }
-    return dtemp;
-}
-/******************************************************************************/
-
-int dgefa ( double a[], int lda, int n, int ipvt[] )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        DGEFA factors a real general matrix.
-
-      Modified:
-
-        16 May 2005
-
-      Author:
-
-        C version by John Burkardt.
-
-      Reference:
-
-        Jack Dongarra, Cleve Moler, Jim Bunch and Pete Stewart,
-        LINPACK User's Guide,
-        SIAM, (Society for Industrial and Applied Mathematics),
-        3600 University City Science Center,
-        Philadelphia, PA, 19104-2688.
-        ISBN 0-89871-172-X
-
-      Parameters:
-
-        Input/output, double A[LDA*N].
-        On intput, the matrix to be factored.
-        On output, an upper triangular matrix and the multipliers used to obtain
-        it.  The factorization can be written A=L*U, where L is a product of
-        permutation and unit lower triangular matrices, and U is upper triangular.
-
-        Input, int LDA, the leading dimension of A.
-
-        Input, int N, the order of the matrix A.
-
-        Output, int IPVT[N], the pivot indices.
-
-        Output, int DGEFA, singularity indicator.
-        0, normal value.
-        K, if U(K,K) == 0.  This is not an error condition for this subroutine,
-        but it does indicate that DGESL or DGEDI will divide by zero if called.
-        Use RCOND in DGECO for a reliable indication of singularity.
-    */
-
-    int info;
-    int j;
-    int k;
-    int l;
-    double t;
-    /*
-      Gaussian elimination with partial pivoting.
-    */
-    info = 0;
-
-    for ( k = 1; k <= n-1; k++ ) {
-        /*
-          Find L = pivot index.
-        */
-        l = idamax ( n-k+1, a+(k-1)+(k-1)*lda, 1 ) + k - 1;
-        ipvt[k-1] = l;
-        /*
-          Zero pivot implies this column already triangularized.
-        */
-        if ( a[l-1+(k-1)*lda] == 0.0 ) {
-            info = k;
-            continue;
-        }
-        /*
-          Interchange if necessary.
-        */
-        if ( l != k ) {
-            t = a[l-1+(k-1)*lda];
-            a[l-1+(k-1)*lda] = a[k-1+(k-1)*lda];
-            a[k-1+(k-1)*lda] = t;
-        }
-        /*
-          Compute multipliers.
-        */
-        t = -1.0 / a[k-1+(k-1)*lda];
-
-        dscal ( n-k, t, a+k+(k-1)*lda, 1 );
-        /*
-          Row elimination with column indexing.
-        */
-        for ( j = k+1; j <= n; j++ ) {
-            t = a[l-1+(j-1)*lda];
-            if ( l != k ) {
-                a[l-1+(j-1)*lda] = a[k-1+(j-1)*lda];
-                a[k-1+(j-1)*lda] = t;
-            }
-            daxpy ( n-k, t, a+k+(k-1)*lda, 1, a+k+(j-1)*lda, 1 );
-        }
-
-    }
-
-    ipvt[n-1] = n;
-
-    if ( a[n-1+(n-1)*lda] == 0.0 ) {
-        info = n;
-    }
-
-    return info;
-}
-/******************************************************************************/
-
-void dgesl ( double a[], int lda, int n, int ipvt[], double b[], int job )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        DGESL solves a real general linear system A * X = B.
-
-      Discussion:
-
-        DGESL can solve either of the systems A * X = B or A' * X = B.
-
-        The system matrix must have been factored by DGECO or DGEFA.
-
-        A division by zero will occur if the input factor contains a
-        zero on the diagonal.  Technically this indicates singularity
-        but it is often caused by improper arguments or improper
-        setting of LDA.  It will not occur if the subroutines are
-        called correctly and if DGECO has set 0.0 < RCOND
-        or DGEFA has set INFO == 0.
-
-      Modified:
-
-        16 May 2005
-
-      Author:
-
-        C version by John Burkardt.
-
-      Reference:
-
-        Jack Dongarra, Cleve Moler, Jim Bunch and Pete Stewart,
-        LINPACK User's Guide,
-        SIAM, (Society for Industrial and Applied Mathematics),
-        3600 University City Science Center,
-        Philadelphia, PA, 19104-2688.
-        ISBN 0-89871-172-X
-
-      Parameters:
-
-        Input, double A[LDA*N], the output from DGECO or DGEFA.
-
-        Input, int LDA, the leading dimension of A.
-
-        Input, int N, the order of the matrix A.
-
-        Input, int IPVT[N], the pivot vector from DGECO or DGEFA.
-
-        Input/output, double B[N].
-        On input, the right hand side vector.
-        On output, the solution vector.
-
-        Input, int JOB.
-        0, solve A * X = B;
-        nonzero, solve A' * X = B.
-    */
-
-    int k;
-    int l;
-    double t;
-    /*
-      Solve A * X = B.
-    */
-    if ( job == 0 ) {
-        for ( k = 1; k <= n-1; k++ ) {
-            l = ipvt[k-1];
-            t = b[l-1];
-
-            if ( l != k ) {
-                b[l-1] = b[k-1];
-                b[k-1] = t;
-            }
-
-            daxpy ( n-k, t, a+k+(k-1)*lda, 1, b+k, 1 );
-
-        }
-
-        for ( k = n; 1 <= k; k-- ) {
-            b[k-1] = b[k-1] / a[k-1+(k-1)*lda];
-            t = -b[k-1];
-            daxpy ( k-1, t, a+0+(k-1)*lda, 1, b, 1 );
-        }
-    }
-    /*
-      Solve A' * X = B.
-    */
-    else {
-        for ( k = 1; k <= n; k++ ) {
-            t = ddot ( k-1, a+0+(k-1)*lda, 1, b, 1 );
-            b[k-1] = ( b[k-1] - t ) / a[k-1+(k-1)*lda];
-        }
-
-        for ( k = n-1; 1 <= k; k-- ) {
-            b[k-1] = b[k-1] + ddot ( n-k, a+k+(k-1)*lda, 1, b+k, 1 );
-            l = ipvt[k-1];
-
-            if ( l != k ) {
-                t = b[l-1];
-                b[l-1] = b[k-1];
-                b[k-1] = t;
-            }
-        }
-    }
-    return;
-}
-/******************************************************************************/
-
-void dscal ( int n, double sa, double x[], int incx )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        DSCAL scales a vector by a constant.
-
-      Modified:
-
-        30 March 2007
-
-      Author:
-
-        FORTRAN77 original by Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart.
-        C version by John Burkardt
-
-      Reference:
-
-        Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-        LINPACK User's Guide,
-        SIAM, 1979.
-
-        Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-        Basic Linear Algebra Subprograms for Fortran Usage,
-        Algorithm 539,
-        ACM Transactions on Mathematical Software,
-        Volume 5, Number 3, September 1979, pages 308-323.
-
-      Parameters:
-
-        Input, int N, the number of entries in the vector.
-
-        Input, double SA, the multiplier.
-
-        Input/output, double X[*], the vector to be scaled.
-
-        Input, int INCX, the increment between successive entries of X.
-    */
-
-    int i;
-    int ix;
-    int m;
-
-    if ( n <= 0 ) {
-    } else if ( incx == 1 ) {
-        m = n % 5;
-
-        for ( i = 0; i < m; i++ ) {
-            x[i] = sa * x[i];
-        }
-
-        for ( i = m; i < n; i = i + 5 ) {
-            x[i]   = sa * x[i];
-            x[i+1] = sa * x[i+1];
-            x[i+2] = sa * x[i+2];
-            x[i+3] = sa * x[i+3];
-            x[i+4] = sa * x[i+4];
-        }
-    } else {
-        if ( 0 <= incx ) {
-            ix = 0;
-        } else {
-            ix = ( - n + 1 ) * incx;
-        }
-
-        for ( i = 0; i < n; i++ ) {
-            x[ix] = sa * x[ix];
-            ix = ix + incx;
-        }
-    }
-    return;
-}
-/******************************************************************************/
-
-int idamax ( int n, double dx[], int incx )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        IDAMAX finds the index of the vector element of maximum absolute value.
-
-      Discussion:
-
-        WARNING: This index is a 1-based index, not a 0-based index!
-
-      Modified:
-
-        30 March 2007
-
-      Author:
-
-        FORTRAN77 original by Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart.
-        C version by John Burkardt
-
-      Reference:
-
-        Jack Dongarra, Cleve Moler, Jim Bunch, Pete Stewart,
-        LINPACK User's Guide,
-        SIAM, 1979.
-
-        Charles Lawson, Richard Hanson, David Kincaid, Fred Krogh,
-        Basic Linear Algebra Subprograms for Fortran Usage,
-        Algorithm 539,
-        ACM Transactions on Mathematical Software,
-        Volume 5, Number 3, September 1979, pages 308-323.
-
-      Parameters:
-
-        Input, int N, the number of entries in the vector.
-
-        Input, double X[*], the vector to be examined.
-
-        Input, int INCX, the increment between successive entries of SX.
-
-        Output, int IDAMAX, the index of the element of maximum
-        absolute value.
-    */
-
-    double dmax;
-    int i;
-    int ix;
-    int value;
-
-    value = 0;
-
-    if ( n < 1 || incx <= 0 ) {
-        return value;
-    }
-
-    value = 1;
-
-    if ( n == 1 ) {
-        return value;
-    }
-
-    if ( incx == 1 ) {
-        dmax = r8_abs ( dx[0] );
-
-        for ( i = 1; i < n; i++ ) {
-            if ( dmax < r8_abs ( dx[i] ) ) {
-                value = i + 1;
-                dmax = r8_abs ( dx[i] );
-            }
-        }
-    } else {
-        ix = 0;
-        dmax = r8_abs ( dx[0] );
-        ix = ix + incx;
-
-        for ( i = 1; i < n; i++ ) {
-            if ( dmax < r8_abs ( dx[ix] ) ) {
-                value = i + 1;
-                dmax = r8_abs ( dx[ix] );
-            }
-            ix = ix + incx;
-        }
-    }
-
-    return value;
-}
-/******************************************************************************/
-
-double r8_abs ( double x )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        R8_ABS returns the absolute value of a R8.
-
-      Modified:
-
-        02 April 2005
-
-      Author:
-
-        John Burkardt
-
-      Parameters:
-
-        Input, double X, the quantity whose absolute value is desired.
-
-        Output, double R8_ABS, the absolute value of X.
-    */
-
-    double value;
-
-    if ( 0.0 <= x ) {
-        value = x;
-    } else {
-        value = -x;
-    }
-    return value;
-}
-/******************************************************************************/
-
-double r8_epsilon ( void )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        R8_EPSILON returns the R8 round off unit.
-
-      Discussion:
-
-        R8_EPSILON is a number R which is a power of 2 with the property that,
-        to the precision of the computer's arithmetic,
-          1 < 1 + R
-        but
-          1 = ( 1 + R / 2 )
-
-      Licensing:
-
-        This code is distributed under the GNU LGPL license.
-
-      Modified:
-
-        08 May 2006
-
-      Author:
-
-        John Burkardt
-
-      Parameters:
-
-        Output, double R8_EPSILON, the double precision round-off unit.
-    */
-
-    double r;
-
-    r = 1.0;
-
-    while ( 1.0 < ( double ) ( 1.0 + r )  ) {
-        r = r / 2.0;
-    }
-    r = 2.0 * r;
-
-    return r;
-}
-/******************************************************************************/
-
-double r8_max ( double x, double y )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        R8_MAX returns the maximum of two R8's.
-
-      Modified:
-
-        18 August 2004
-
-      Author:
-
-        John Burkardt
-
-      Parameters:
-
-        Input, double X, Y, the quantities to compare.
-
-        Output, double R8_MAX, the maximum of X and Y.
-    */
-
-    double value;
-
-    if ( y < x ) {
-        value = x;
-    } else {
-        value = y;
-    }
-    return value;
-}
-/******************************************************************************/
-
-double r8_random ( int iseed[4] )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        R8_RANDOM returns a uniformly distributed random number between 0 and 1.
-
-      Discussion:
-
-        This routine uses a multiplicative congruential method with modulus
-        2**48 and multiplier 33952834046453 (see G.S.Fishman,
-        'Multiplicative congruential random number generators with modulus
-        2**b: an exhaustive analysis for b = 32 and a partial analysis for
-        b = 48', Math. Comp. 189, pp 331-344, 1990).
-
-        48-bit integers are stored in 4 integer array elements with 12 bits
-        per element. Hence the routine is portable across machines with
-        integers of 32 bits or more.
-
-      Parameters:
-
-        Input/output, integer ISEED(4).
-        On entry, the seed of the random number generator; the array
-        elements must be between 0 and 4095, and ISEED(4) must be odd.
-        On exit, the seed is updated.
-
-        Output, double R8_RANDOM, the next pseudorandom number.
-    */
-
-    int ipw2 = 4096;
-    int it1;
-    int it2;
-    int it3;
-    int it4;
-    int m1 = 494;
-    int m2 = 322;
-    int m3 = 2508;
-    int m4 = 2549;
-    double r = 1.0 / 4096.0;
-    double value;
-    /*
-      Multiply the seed by the multiplier modulo 2**48.
-    */
-    it4 = iseed[3] * m4;
-    it3 = it4 / ipw2;
-    it4 = it4 - ipw2 * it3;
-    it3 = it3 + iseed[2] * m4 + iseed[3] * m3;
-    it2 = it3 / ipw2;
-    it3 = it3 - ipw2 * it2;
-    it2 = it2 + iseed[1] * m4 + iseed[2] * m3 + iseed[3] * m2;
-    it1 = it2 / ipw2;
-    it2 = it2 - ipw2 * it1;
-    it1 = it1 + iseed[0] * m4 + iseed[1] * m3 + iseed[2] * m2 + iseed[3] * m1;
-    it1 = ( it1 % ipw2 );
-    /*
-      Return updated seed
-    */
-    iseed[0] = it1;
-    iseed[1] = it2;
-    iseed[2] = it3;
-    iseed[3] = it4;
-    /*
-      Convert 48-bit integer to a real number in the interval (0,1)
-    */
-    value =
-        r * ( ( double ) ( it1 )
-              + r * ( ( double ) ( it2 )
-                      + r * ( ( double ) ( it3 )
-                              + r * ( ( double ) ( it4 ) ) ) ) );
-
-    return value;
-}
-/******************************************************************************/
-
-double *r8mat_gen ( int lda, int n )
-{
-
-    /******************************************************************************/
-    /*
-      Purpose:
-
-        R8MAT_GEN generates a random R8MAT.
-
-      Modified:
-
-        06 June 2005
-
-      Parameters:
-
-        Input, integer LDA, the leading dimension of the matrix.
-
-        Input, integer N, the order of the matrix.
-
-        Output, double R8MAT_GEN[LDA*N], the N by N matrix.
-    */
-
-    double *ba;
-    int i;
-    int init[4] = { 1, 2, 3, 1325 };
-    int j;
-
-    ba = ( double * ) malloc ( lda * n * sizeof ( double ) );
-
-    for ( j = 1; j <= n; j++ ) {
-        for ( i = 1; i <= n; i++ ) {
-            ba[i-1+(j-1)*lda] = r8_random ( init ) - 0.5;
-        }
-    }
-
-    return ba;
-}
\ No newline at end of file