changes to prefetch...hope it makes a difference...
author: bdemsky <bdemsky>
Tue, 10 Mar 2009 19:48:56 +0000 (19:48 +0000)
committer: bdemsky <bdemsky>
Tue, 10 Mar 2009 19:48:56 +0000 (19:48 +0000)
Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyN.java

index ae2ae5d11f949b4e4fc7d46c5daa811a69ec41c6..5b2d22cbb33755da0cb2963d635b8e1cd1e3517b 100644 (file)
@@ -11,41 +11,51 @@ public class MatrixMultiply extends Thread{
     
     public void run() {
        atomic {
-        // Prefetch mmul.a[][] matrix
-        Object o = mmul;
         short[] offsets = new short[4];
+        // Prefetch mmul.btranspose[][] matrix
+       //Get all of B first...we need them first
+        offsets[0] = getoffset{MMul, btranspose};
+        offsets[1] = (short) 0;
+        offsets[2] = (short) y0;
+        offsets[3] = (short) (y1 - y0 -1);
+        System.rangePrefetch(mmul, offsets);
+
+       //Get first part of A
         offsets[0] = getoffset{MMul, a};
         offsets[1] = (short) 0;
         offsets[2] = (short) x0;
-        offsets[3] = (short) (x1 - x0 -1);
-        System.rangePrefetch(o, offsets);
-
-        // Prefetch mmul.btranspose[][] matrix
-        Object o1 = mmul;
-        short[] offsets1 = new short[4];
-        offsets1[0] = getoffset{MMul, btranspose};
-        offsets1[1] = (short) 0;
-        offsets1[2] = (short) x0;
-        offsets1[3] = (short) (x1 - x0 -1);
-        System.rangePrefetch(o1, offsets1);
+        offsets[3] = (short) 63;
+        System.rangePrefetch(mmul, offsets);
 
-        // Prefetch mmul.c[][] matrix
-        Object o2 = mmul;
-        short[] offsets2 = new short[4];
-        offsets2[0] = getoffset{MMul, c};
-        offsets2[1] = (short) 0;
-        offsets2[2] = (short) x0;
-        offsets2[3] = (short) (x1 - x0 -1);
-        System.rangePrefetch(o2, offsets2);
+        //Get first part of C
+        offsets[0] = getoffset{MMul, c};
+        offsets[1] = (short) 0;
+        System.rangePrefetch(mmul, offsets);
 
            double la[][]=mmul.a;
            double lc[][]=mmul.c;
            double lb[][]=mmul.btranspose;
            int M=mmul.M;
+           int l=0;
         //Use btranspose for cache performance
-           for(int i = x0; i< x1; i++){
+           for(int i = x0; i< x1; i++,l++){
                double a[]=la[i];
                double c[]=lc[i];
+               if (((l+32)&63)==0) {
+                   offsets[0] = getoffset{MMul, a};
+                   offsets[1] = (short) 0;
+                   offsets[2] = (short) x0+l;
+                   if ((x0+l+64)>x1)
+                       offsets[3]=x1-x0-l-1;
+                   else
+                       offsets[3] = (short) 63;
+                   System.rangePrefetch(mmul, offsets);
+
+                   //Get first part of C
+                   offsets[0] = getoffset{MMul, c};
+                   offsets[1] = (short) 0;
+                   System.rangePrefetch(mmul, offsets);
+               }
                for (int j = y0; j < y1; j++) {
                    double innerProduct=0;
                    double b[] = lb[j];