Fix a performance bug in the multicore gc version. In hvc files, should not reserve...
author jzhou <jzhou>
Sat, 21 Aug 2010 00:20:34 +0000 (00:20 +0000)
committer jzhou <jzhou>
Sat, 21 Aug 2010 00:20:34 +0000 (00:20 +0000)
Robust/src/Benchmarks/Scheduling/GC/RayTracer/RayTracerBench.java
Robust/src/Runtime/mem.c
Robust/src/Runtime/multicoregarbage.c
Robust/src/Runtime/multicoregarbage.h
Robust/src/Runtime/multicoreruntime.h
Robust/src/Runtime/multicoretask.c
Robust/src/buildscript

index 3e562ea31474ef38aebf835eb73fb8950e904953..5a288fda65213ac8280ed607c162ed5d52ce8052 100644 (file)
@@ -1,7 +1,7 @@
 task t1(StartupObject s{initialstate}) {
   //System.printString("task t1\n");
 
-  int threadnum = 56; // 62; // 56;
+  int threadnum = 62; // 56;
   int size = threadnum * 25;
   Composer comp = new Composer(threadnum, size){compose};
   RayTracer rt = new RayTracer();
index 227c4ee2e44e1dab3752cb203c4685ce5f15f050..016312ed26773a13a71591cc063725fcada76079 100644 (file)
@@ -9,8 +9,21 @@ void * mycalloc(int m,
   void * p = NULL;
   int isize = size; 
   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
+#ifdef MULTICORE_GC
+  extern bool gc_localheap_s;
+inermycalloc_i:
+  p = gc_localheap_s ? BAMBOO_LOCAL_MEM_CALLOC_S(m, isize) : 
+       BAMBOO_LOCAL_MEM_CALLOC(m, isize);
+#else
   p = BAMBOO_LOCAL_MEM_CALLOC(m, isize); // calloc(m, isize);
+#endif
   if(p == NULL) {
+#ifdef MULTICORE_GC
+       if(!gc_localheap_s) {
+         gc_localheap_s = true;
+         goto inermycalloc_i;
+       }
+#endif
          BAMBOO_EXIT(0xc001);
   }
   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
@@ -121,18 +134,39 @@ void * mycalloc_i(int m,
 #ifdef DEBUG
   tprintf("ask for local mem: %x \n", isize);
 #endif
+#ifdef MULTICORE_GC
+  extern bool gc_localheap_s;
+inermycalloc_i:
+  p = gc_localheap_s ? BAMBOO_LOCAL_MEM_CALLOC_S(m, isize) : 
+       BAMBOO_LOCAL_MEM_CALLOC(m, isize);
+#else
   p = BAMBOO_LOCAL_MEM_CALLOC(m, isize); // calloc(m, isize);
+#endif
 #ifdef DEBUG
   tprintf("new obj in local mem: %x, %x \n", p, isize);
 #endif
   if(p == NULL) {
+#ifdef MULTICORE_GC
+       if(!gc_localheap_s) {
+         gc_localheap_s = true;
+         goto inermycalloc_i;
+       }
+#endif
        BAMBOO_EXIT(0xc004);
   }
   return p;
 }
 
 void myfree(void * ptr) {
-  BAMBOO_LOCAL_MEM_FREE(ptr);
+#ifdef MULTICORE_GC
+  if(ptr >= BAMBOO_LOCAL_HEAP_START_VA ) {
+#endif
+       BAMBOO_LOCAL_MEM_FREE(ptr);
+#ifdef MULTICORE_GC
+  } else if(ptr >= BAMBOO_LOCAL_HEAP_START_VA_S) {
+       BAMBOO_LOCAL_MEM_FREE_S(ptr);
+  }
+#endif
   return;
 }
 
index ca476ad0f9f65efc8e143d36a97804b7090a3f87..8d669e4942737286cd79d56e708d143715f5aca4 100644 (file)
@@ -3040,6 +3040,12 @@ pregccheck:
 #ifdef RAWPATH // TODO GC_DEBUG
     printf("(%x,%x) start gc! \n", udn_tile_coord_x(), udn_tile_coord_y());
     //dumpSMem();
+#endif
+#ifdef GC_FLUSH_DTLB
+       if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
+         BAMBOO_CLEAN_DTLB();
+         gc_num_flush_dtlb++;
+       }
 #endif
     gcprocessing = true;
     gcphase = INITPHASE;
@@ -3405,6 +3411,12 @@ pregccheck:
   gc_num_forwardobj = 0;
 #endif // GC_PROFLIE_S*/
   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
+#ifdef GC_FLUSH_DTLB
+       if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
+         BAMBOO_CLEAN_DTLB();
+         gc_num_flush_dtlb++;
+       }
+#endif
     gcprocessing = true;
     gc_collect(stackptr);
 
@@ -3415,6 +3427,12 @@ pregccheck:
     gcflag = false;
     gcprocessing = false;
   } else {
+#ifdef GC_FLUSH_DTLB
+       if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
+         BAMBOO_CLEAN_DTLB();
+         gc_num_flush_dtlb++;
+       }
+#endif
     // not a gc core, should wait for gcfinish msg
     gcprocessing = true;
     gc_nocollect(stackptr);
index a6722e871cec08d3d645f07c92072db9302e0303..93a1b252deafaa3e834c2581a2e046be28442bac 100644 (file)
 // let each gc core to have one big block, this is very important
 // for the computation of NUMBLOCKS(s, n), DO NOT change this!
 
+#ifdef GC_FLUSH_DTLB
+#define GC_NUM_FLUSH_DTLB 1
+int gc_num_flush_dtlb;
+#endif
+
 #define NUMPTRS 100
 
 // for GC profile
index 675821d664ebcf9b5f7d6f7a5b8207c9b6a6963f..5147ed2d8f29f84231445e561564d979efd4addc 100644 (file)
@@ -318,6 +318,10 @@ struct Queue * totransobjqueue; // queue to hold objs to be transferred
 //((unsigned long long int)(3.0 * 1024 * 1024 * 1024)) // 3G 
 #endif // GC_DEBUG
 
+#ifdef MULTICORE_GC
+volatile bool gc_localheap_s;
+#endif
+
 #ifdef MULTICORE_GC
 #include "multicoregarbage.h"
 
@@ -562,11 +566,19 @@ void outputProfileData();
 //                            request response                             //
 // BAMBOO_LOCAL_MEM_CALLOC(x, y): allocate an array of x elements each of  //
 //                                whose size in bytes is y on local memory //
+//                                which is given by the hypervisor         //
 // BAMBOO_LOCAL_MEM_FREE(x): free space with ptr x on local memory         //
 // BAMBOO_LOCAL_MEM_CLOSE(): close the local heap                          //
+// BAMBOO_LOCAL_MEM_CALLOC_S(x, y): allocate an array of x elements each of//
+//                                  whose size in bytes is y on local      //
+//                                  memory which is not from the hypervisor//
+//                                  but is allocated from the free memory  //
+// BAMBOO_LOCAL_MEM_FREE_S(x): free space with ptr x on self-allocated     //
+//                             local memory                                //
+// BAMBOO_LOCAL_MEM_CLOSE_S(): close the self-allocated local heap        //
 // BAMBOO_SHARE_MEM_CALLOC_I(x, y): allocate an array of x elements each of//
 //                                whose size in bytes is y on shared memory//
-// BAMBOO_SHARE_MEM_CLOSE(): close the shared heap                        //
+// BAMBOO_SHARE_MEM_CLOSE(): close the shared heap                         //
 // BAMBOO_CACHE_LINE_SIZE: the cache line size                             //
 // BAMBOO_CACHE_LINE_MASK: mask for a cache line                           //
 // BAMBOO_CACHE_FLUSH_RANGE(x, y): flush cache lines started at x with     //
@@ -577,6 +589,7 @@ void outputProfileData();
 //                            hint, the processor will not fetch the       //
 //                            current content of the memory and directly   //
 //                            write                                        //
+// BAMBOO_CLEAN_DTLB(): zero-out all the dtlb entries                      //
 /////////////////////////////////////////////////////////////////////////////
 
 #endif  // #ifdef MULTICORE
index 9a6e6ae86b3b0bf028a9214140e4bc02f886b5cb..8bf89843194ed914b6150c722b296f484728ecdd 100644 (file)
@@ -311,6 +311,10 @@ void initruntimedata() {
   gc_num_forwardobj = 0;
   gc_num_profiles = NUMCORESACTIVE - 1;
 #endif
+#ifdef GC_FLUSH_DTLB
+  gc_num_flush_dtlb = 0;
+#endif
+  gc_localheap_s = false;
 #else
   // create the lock table, lockresult table and obj queue
   locktable.size = 20;
@@ -602,6 +606,9 @@ void checkCoreStatus() {
 
          BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME() - bamboo_start_time);
          //BAMBOO_DEBUGPRINT_REG(total_num_t6); // TODO for test
+#ifdef GC_FLUSH_DTLB
+         BAMBOO_DEBUGPRINT_REG(gc_num_flush_dtlb);
+#endif
 #ifndef BAMBOO_MEMPROF
          BAMBOO_DEBUGPRINT(0xbbbbbbbb);
 #endif
index 4b9ea31dce62492682ff273a6d943a8dabec7f48..b8171764429ae06adefb50d0d5b7d71e505e9d48 100755 (executable)
@@ -201,7 +201,6 @@ OPTIONALFLAG=false
 EXITAFTERANALYSIS=false
 ASSEMBLY=false
 GCCORES=''
-GC1COREFLAG=false
 TILERAN1COREFLAG=false
 TILERA56COREFLAG=false
 
@@ -390,10 +389,6 @@ elif [[ $1 = '-numcore4gc' ]]
 then
 JAVAOPTS="$JAVAOPTS -numcore4gc $2"
 GCCORES="GC_$2"
-if [[ "$2" -eq "1" ]]
-then
-GC1COREFLAG=true
-fi
 shift
 elif [[ $1 = '-raw' ]]
 then
@@ -812,13 +807,6 @@ TILERA_INDIR="BME"
 MAKEFILE="Makefile.tilera.$TILERACONFIG"
 SIMHVC="sim.hvc.$TILERACONFIG"
 PCIHVC="pci.hvc.$TILERACONFIG"
-if $GC1COREFLAG 
-then # 1-core gc
-  if $TILERAN1COREFLAG
-  then # not only with 1 core
-       PCIHVC="$PCIHVC.1gc"
-  fi
-fi
 if $TILERA56COREFLAG
 then
   PCIHVC="$PCIHVC.56"