Fix performance bug in GC cache-adapt sampling: replace per-page floating-point factor computation with integer arithmetic, walk the sampling tables with incremented pointers instead of recomputing indexed offsets per core, and switch the overload/crowd policies to a total-workload-derived threshold.
[IRC.git] / Robust / src / Runtime / bamboo / multicoregarbage.c
index b5897919e609a498f463158cf7b7ab8ac4bd2ff5..9f6f714fedba8c816445a4a035f36d0d3842ae97 100644 (file)
@@ -395,6 +395,13 @@ inline int hostcore(void * ptr) {
   return host;
 } // int hostcore(void * ptr)
 
+inline void cpu2coords(int coren,
+                          int * x,
+                                          int * y) {
+  *x = bamboo_cpu2coords[2*coren];
+  *y = bamboo_cpu2coords[2*coren+1];
+} // void cpu2coords(...)
+
 inline bool isLocal(void * ptr) {
   // check if a pointer is in shared heap on this core
   return hostcore(ptr) == BAMBOO_NUM_OF_CORE;
@@ -1871,7 +1878,7 @@ inline bool initOrig_Dst(struct moveHelper * orig,
        ((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
   gc_cache_revise_infomation.to_page_index = 
        (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
-  gc_cache_revise_infomation.orig_page_start_va = -1; 
+  gc_cache_revise_infomation.orig_page_start_va = -1;
 #endif // GC_CACHE_ADAPT
 
   // init the orig ptr
@@ -1954,25 +1961,23 @@ innermoveobj:
   if(orig->ptr >= gc_cache_revise_infomation.orig_page_end_va) {
        // end of an orig page
        // compute the impact of this page for the new page
-       float tmp_factor = 
-         ((float)(to->ptr-gc_cache_revise_infomation.to_page_start_va))/
-         ((float)(BAMBOO_PAGE_SIZE));
+       int tmp_factor = to->ptr-gc_cache_revise_infomation.to_page_start_va; 
+       int topage=gc_cache_revise_infomation.to_page_index;
+       int oldpage = gc_cache_revise_infomation.orig_page_index;
+       int * newtable=&gccachesamplingtbl_r[topage];
+       int * oldtable=&gccachesamplingtbl[oldpage];
+       
        for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-         ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-                )[gc_cache_revise_infomation.to_page_index] += (int)(
-                  ((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-               gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-         // TODO
-/*       if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
-               tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
-         }*/
+         (*newtable) += (*oldtable)*tmp_factor;
+         newtable=(int*)(((char *)newtable)+size_cachesamplingtbl_local_r);
+         oldtable=(int*)(((char *)oldtable)+size_cachesamplingtbl_local);
        }
        // prepare for an new orig page
+       int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
        gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
        gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-         (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-       gc_cache_revise_infomation.orig_page_index = 
-         (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+         (BAMBOO_PAGE_SIZE)*(tmp_index+1);
+       gc_cache_revise_infomation.orig_page_index = tmp_index;
        gc_cache_revise_infomation.to_page_start_va = to->ptr;
   }
 #endif
@@ -2039,32 +2044,30 @@ innermoveobj:
       nextBlock(to);
 #ifdef GC_CACHE_ADAPT
          if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
-         // end of an to page, wrap up its information
-         float tmp_factor = 
-               ((float)(tmp_ptr-gc_cache_revise_infomation.to_page_start_va))/
-               ((float)(BAMBOO_PAGE_SIZE));
-         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-               ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-                )[gc_cache_revise_infomation.to_page_index] += (int)(
-                 ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-                 gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-               // TODO
-/*       if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
-               tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
-         }*/
-         }
-         // prepare for an new to page
-         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
-         gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.orig_page_index = 
-               (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-         gc_cache_revise_infomation.to_page_start_va = to->ptr;
-         gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.to_page_index = 
-               (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-       }
+               // end of a to page, wrap up its information
+               int tmp_factor = tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
+               int topage=gc_cache_revise_infomation.to_page_index;
+               int oldpage = gc_cache_revise_infomation.orig_page_index;
+               int * newtable=&gccachesamplingtbl_r[topage];
+               int * oldtable=&gccachesamplingtbl[oldpage];
+         
+               for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
+                 (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
+                 newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
+                 oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
+               }
+               // prepare for a new to page
+               int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+               gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
+               gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*(tmp_index+1);
+               gc_cache_revise_infomation.orig_page_index = tmp_index;
+               gc_cache_revise_infomation.to_page_start_va = to->ptr;
+               gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
+               gc_cache_revise_infomation.to_page_index = 
+                 (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+      }
 #endif // GC_CACHE_ADAPT
       if(stopblock == to->numblocks) {
                // already fulfilled the block
@@ -2111,6 +2114,7 @@ innermoveobj:
     to->ptr += isize;
     to->offset += isize;
     to->top += isize;
+#if 0
 #ifdef GC_CACHE_ADAPT
        int tmp_ptr = to->ptr;
 #endif // GC_CACHE_ADAPT
@@ -2119,36 +2123,36 @@ innermoveobj:
       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
       (*((int*)(to->base))) = to->offset;
       nextBlock(to);
-    }
 #ifdef GC_CACHE_ADAPT
-       if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
-         // end of an to page, wrap up its information
-         float tmp_factor = 
-               ((float)(tmp_ptr-gc_cache_revise_infomation.to_page_start_va))/
-               ((float)(BAMBOO_PAGE_SIZE));
-         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-               ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-                )[gc_cache_revise_infomation.to_page_index] += (int)(
-                 ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-                 gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-      // TODO
-/*       if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
-               tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
-         }*/
+         if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
+               // end of a to page, wrap up its information
+               int tmp_factor = tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
+               int topage=gc_cache_revise_infomation.to_page_index;
+               int oldpage = gc_cache_revise_infomation.orig_page_index;
+               int * newtable=&gccachesamplingtbl_r[topage];
+               int * oldtable=&gccachesamplingtbl[oldpage];
+         
+               for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
+                 (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
+                 newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
+                 oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
+               }
+               // prepare for a new to page
+               int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+               gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
+               gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
+               gc_cache_revise_infomation.orig_page_index = 
+                 (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+               gc_cache_revise_infomation.to_page_start_va = to->ptr;
+               gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
+               gc_cache_revise_infomation.to_page_index = 
+                 (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
          }
-         // prepare for an new to page
-         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
-         gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.orig_page_index = 
-               (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-         gc_cache_revise_infomation.to_page_start_va = to->ptr;
-         gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.to_page_index = 
-               (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-       }
 #endif // GC_CACHE_ADAPT
+    }
+#endif
   } // if(mark == 1)
 #ifdef DEBUG
   BAMBOO_DEBUGPRINT(0xe205);
@@ -2245,16 +2249,18 @@ innercompact:
     }
   }
 #ifdef GC_CACHE_ADAPT
-       // end of an to page, wrap up its information
-       float tmp_factor = 
-         ((float)(to->ptr-gc_cache_revise_infomation.to_page_start_va))/
-         ((float)(BAMBOO_PAGE_SIZE));
-       for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-         ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-          )[gc_cache_revise_infomation.to_page_index] += (int)(
-               ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-               gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-       }
+  // end of a to page, wrap up its information
+  int tmp_factor = to->ptr-gc_cache_revise_infomation.to_page_start_va;
+  int topage=gc_cache_revise_infomation.to_page_index;
+  int oldpage = gc_cache_revise_infomation.orig_page_index;
+  int * newtable=&gccachesamplingtbl_r[topage];
+  int * oldtable=&gccachesamplingtbl[oldpage];
+  
+  for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
+    (*newtable) = ((*newtable)+(*oldtable)*tmp_factor);
+    newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
+    oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
+  }
 #endif // GC_CACHE_ADAPT
   // if no objs have been compact, do nothing,
   // otherwise, fill the header of this block
@@ -2378,7 +2384,7 @@ innercompact:
        gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
          (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
        gc_cache_revise_infomation.orig_page_index = 
-         orig->blockbase/(BAMBOO_PAGE_SIZE);
+         (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
 #endif // GC_CACHE_ADAPT
     goto innercompact;
   }
@@ -2417,8 +2423,8 @@ inline void compact() {
   gc_cache_revise_infomation.orig_page_end_va = gcbaseva +  
        (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
   gc_cache_revise_infomation.orig_page_index = 
-       orig->blockbase/(BAMBOO_PAGE_SIZE);
-#endif
+       (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
+#endif // GC_CACHE_ADAPT
 
   int filledblocks = 0;
   INTPTR heaptopptr = 0;
@@ -2876,7 +2882,6 @@ void cacheAdapt_gc(bool isgccachestage) {
 
 // the master core decides how to adapt cache strategy for the mutator 
 // according to collected statistic data
-extern int gc_num_sampling;
 
 // make all pages hfh
 int cacheAdapt_policy_h4h(){
@@ -2941,8 +2946,8 @@ int cacheAdapt_policy_hotest(){
 
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          // TODO
          // check the freqency, decide if this page is hot for the core
          if(hotfreq < freq) {
@@ -2995,8 +3000,8 @@ int cacheAdapt_policy_dominate(){
        
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          totalfreq += freq;
          // TODO
          // check the freqency, decide if this page is hot for the core
@@ -3034,7 +3039,7 @@ int cacheAdapt_policy_dominate(){
   return numchanged;
 } // int cacheAdapt_policy_dominate()
 
-#define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 1000
+#define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 20000
 
 void gc_quicksort(int *array, 
                      int left,
@@ -3047,10 +3052,10 @@ void gc_quicksort(int *array,
        pivot = (left+right)/2;
        while((leftIdx <= pivot) && (rightIdx >= pivot)) {
          int pivotValue = array[pivot*3-offset];
-         while((array[leftIdx*3-offset] < pivotValue) && (leftIdx <= pivot)) {
+         while((array[leftIdx*3-offset] > pivotValue) && (leftIdx <= pivot)) {
                leftIdx++;
          }
-         while((array[rightIdx*3-offset] > pivotValue) && (rightIdx >= pivot)) {
+         while((array[rightIdx*3-offset] < pivotValue) && (rightIdx >= pivot)) {
                rightIdx--;
          }
          // swap [leftIdx] & [rightIdx]
@@ -3084,8 +3089,9 @@ int cacheAdapt_policy_overload(){
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
   int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
-  int workload[NUMCORESACTIVE];
-  memset(workload, 0, NUMCORESACTIVE*sizeof(int));
+  unsigned long long workload[NUMCORESACTIVE];
+  memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
+  unsigned long long total_workload = 0;
   int core2heavypages[NUMCORESACTIVE][page_num*3+1];
   memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
   for(page_index = 0; page_index < page_num; page_index++) {
@@ -3097,8 +3103,8 @@ int cacheAdapt_policy_overload(){
        
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          totalfreq += freq;
          // TODO
          // check the freqency, decide if this page is hot for the core
@@ -3106,6 +3112,10 @@ int cacheAdapt_policy_overload(){
                hotfreq = freq;
                hotestcore = i;
          }
+         // TODO
+         /*if(page_sva == 0x10e90000) {
+               if(freq != 0) tprintf("0x10e90000 core %d, %d\n", i, freq);
+         }*/
        }
        // TODO
        // Decide the cache strategy for this page
@@ -3128,6 +3138,7 @@ int cacheAdapt_policy_overload(){
        tmp_p++;
        numchanged++;
        workload[hotestcore] += totalfreq;
+       total_workload += totalfreq;
        // insert into core2heavypages using quicksort
        int remoteaccess = totalfreq - hotfreq;
        int index = core2heavypages[hotestcore][0];
@@ -3135,16 +3146,26 @@ int cacheAdapt_policy_overload(){
        core2heavypages[hotestcore][3*index+2] = totalfreq;
        core2heavypages[hotestcore][3*index+1] = tmp_p-1;
        core2heavypages[hotestcore][0]++;
+       // TODO
+       /*if(page_sva == 0x10f10000) {
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       int coord_x =  bamboo_cpu2coords[2*coren]+1;
+       int coord_y = bamboo_cpu2coords[2*coren+1]+1;
+         tprintf("+++ %x(%d-%d,%d) hotcore %d, total %d, hot %d, remote %d, index %d p %x\n", (int)page_sva, coren, coord_x, coord_y, hotestcore, totalfreq, hotfreq, remoteaccess, index, (int)(tmp_p-1));
+       }*/
   }
 
+  int workload_threshold = total_workload / 10;
   // Check the workload of each core
   for(int i = 0; i < NUMCORESACTIVE; i++) {
        int j = 1;
        int index = core2heavypages[i][0];
-       if(workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) {
+       if(workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) {
          // sort according to the remoteaccess
          gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-         while((workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) && (j<index*3)) {
+         while((workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) && (j<index*3)) {
                // hfh those pages with more remote accesses 
                bamboo_cache_policy_t policy = {0};
                policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
@@ -3159,7 +3180,7 @@ int cacheAdapt_policy_overload(){
 } // int cacheAdapt_policy_overload()
 
 #define GC_CACHE_ADAPT_ACCESS_THRESHOLD 70
-#define GC_CACHE_ADAPT_CROWD_THRESHOLD  10
+#define GC_CACHE_ADAPT_CROWD_THRESHOLD  20
 // Every page cached on the core that accesses it the most. 
 // Check to see if any core's pages total more accesses than threshold 
 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
@@ -3177,8 +3198,9 @@ int cacheAdapt_policy_crowd(){
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
   int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
-  int workload[NUMCORESACTIVE];
-  memset(workload, 0, NUMCORESACTIVE*sizeof(int));
+  unsigned long long workload[NUMCORESACTIVE];
+  memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
+  unsigned long long total_workload = 0;
   int core2heavypages[NUMCORESACTIVE][page_num*3+1];
   memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
   for(page_index = 0; page_index < page_num; page_index++) {
@@ -3190,8 +3212,8 @@ int cacheAdapt_policy_crowd(){
        
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          totalfreq += freq;
          // TODO
          // check the freqency, decide if this page is hot for the core
@@ -3199,6 +3221,10 @@ int cacheAdapt_policy_crowd(){
                hotfreq = freq;
                hotestcore = i;
          }
+         // TODO
+         /*if(page_sva == 0x10e90000) {
+               if(freq != 0) tprintf("0x10e90000 core %d, %d\n", i, freq);
+         }*/
        }
        // TODO
        // Decide the cache strategy for this page
@@ -3221,6 +3247,7 @@ int cacheAdapt_policy_crowd(){
        tmp_p++;
        numchanged++;
        workload[hotestcore] += totalfreq;
+       total_workload += totalfreq;
        // insert into core2heavypages using quicksort
        int remoteaccess = totalfreq - hotfreq;
        int index = core2heavypages[hotestcore][0];
@@ -3228,16 +3255,26 @@ int cacheAdapt_policy_crowd(){
        core2heavypages[hotestcore][3*index+2] = totalfreq;
        core2heavypages[hotestcore][3*index+1] = tmp_p-1;
        core2heavypages[hotestcore][0]++;
+       // TODO
+       /*if(page_sva == 0x10f10000) {
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       int coord_x =  bamboo_cpu2coords[2*coren]+1;
+       int coord_y = bamboo_cpu2coords[2*coren+1]+1;
+         tprintf("+++ %x(%d-%d,%d) hotcore %d, total %d, hot %d, remote %d, index %d p %x\n", (int)page_sva, coren, coord_x, coord_y, hotestcore, totalfreq, hotfreq, remoteaccess, index, (int)(tmp_p-1));
+       }*/
   }
 
+  int workload_threshold = total_workload / 10;
   // Check the workload of each core
   for(int i = 0; i < NUMCORESACTIVE; i++) {
        int j = 1;
        int index = core2heavypages[i][0];
-       if(workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) {
-         // sort according to the remote access
+       if(workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) {
+         // sort according to the remoteaccess
          gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-         while((workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) && (j<index*3)) {
+         while((workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) && (j<index*3)) {
                // hfh those pages with more remote accesses 
                bamboo_cache_policy_t policy = {0};
                policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
@@ -3246,46 +3283,47 @@ int cacheAdapt_policy_crowd(){
                j += 3;
          }
        }
-       
+
        // Check if the accesses are crowded on few pages
        // sort according to the total access
+inner_crowd:
        gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
        int threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
        int num_crowded = 0;
        int t_workload = 0;
-       for(;(num_crowded<GC_CACHE_ADAPT_CROWD_THRESHOLD)
-               &&(num_crowded<(index-j/3)); num_crowded++) {
+       do {
          t_workload += core2heavypages[i][j+num_crowded*3+1];
-       }
+         num_crowded++;
+       } while(t_workload < threshold);
        // num_crowded <= GC_CACHE_ADAPT_CROWD_THRESHOLD and if there are enough 
        // items, it is always == GC_CACHE_ADAPT_CROWD_THRESHOLD
-       if(t_workload > threshold) {
-inner_crowd:
+       if(num_crowded > GC_CACHE_ADAPT_CROWD_THRESHOLD) {
+//inner_crowd:
          // need to hfh these pages
          // sort the pages according to remote access
          gc_quicksort(&core2heavypages[i][0], j/3+1, j/3+num_crowded, 0);
-         while((num_crowded--) && (j < index*3)) {
+         //while((num_crowded--) && (j < index*3)) {
                // h4h those pages with more remote accesses 
                bamboo_cache_policy_t policy = {0};
                policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
                *((int*)core2heavypages[i][j]) = policy.word;
                workload[i] -= core2heavypages[i][j+1];
                t_workload -= core2heavypages[i][j+1];
-               if((j/3+GC_CACHE_ADAPT_CROWD_THRESHOLD) < index) {
+               /*if((j/3+GC_CACHE_ADAPT_CROWD_THRESHOLD) < index) {
                  t_workload += 
                        core2heavypages[i][j+GC_CACHE_ADAPT_CROWD_THRESHOLD*3+1];
-               }
+               }*/
                j += 3;
                threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
-               if(t_workload <= threshold) {
+               /*if(t_workload <= threshold) {
                  break;
                }
          }
          if((j < index*3) && (t_workload > threshold)) {
                num_crowded = ((index-j/3) > GC_CACHE_ADAPT_CROWD_THRESHOLD) ?
-                 (GC_CACHE_ADAPT_CROWD_THRESHOLD) : (index-j/3);
+                 (GC_CACHE_ADAPT_CROWD_THRESHOLD) : (index-j/3);*/
                goto inner_crowd;
-         }
+//       }
        }
   }
 
@@ -3293,14 +3331,18 @@ inner_crowd:
 } // int cacheAdapt_policy_overload()
 
 void cacheAdapt_master() {
+#ifdef GC_CACHE_ADAPT
+  //gc_output_cache_sampling_r();
+#endif // GC_CACHE_ADAPT
+  int numchanged = 0;
   // check the statistic data
   // for each page, decide the new cache strategy
-  //int numchanged = cacheAdapt_policy_h4h();
-  //int numchanged = cacheAdapt_policy_local();
-  //int numchanged = cacheAdapt_policy_hotest();
-  //int numchanged = cacheAdapt_policy_dominate();
-  int numchanged = cacheAdapt_policy_overload();
-  //int numchanged = cacheAdapt_policy_crowd();
+  numchanged = cacheAdapt_policy_h4h();
+  //numchanged = cacheAdapt_policy_local();
+  //numchanged = cacheAdapt_policy_hotest();
+  //numchanged = cacheAdapt_policy_dominate();
+  //numchanged = cacheAdapt_policy_overload();
+  //numchanged = cacheAdapt_policy_crowd();
   *gccachepolicytbl = numchanged;
   // TODO
   //if(numchanged > 0) tprintf("=================\n");
@@ -3329,6 +3371,50 @@ void cacheAdapt_mutator() {
   }
   //if(BAMBOO_NUM_OF_CORE == 0) tprintf("=================\n"); // TODO
 }
+
+void gc_output_cache_sampling() {
+  unsigned int page_index = 0;
+  VA page_sva = 0;
+  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  for(page_index = 0; page_index < page_num; page_index++) {
+       page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       tprintf("va: %x page_index: %d host: %d\n", 
+               (int)page_sva, page_index, coren);
+       for(int i = 0; i < NUMCORESACTIVE; i++) {
+         int * local_tbl = (int *)((void *)gccachesamplingtbl
+                 +size_cachesamplingtbl_local*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+         printf("%8d ",freq);
+       }
+       printf("\n");
+  }
+  printf("=================\n");
+} // gc_output_cache_sampling
+
+void gc_output_cache_sampling_r() {
+  unsigned int page_index = 0;
+  VA page_sva = 0;
+  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  for(page_index = 0; page_index < page_num; page_index++) {
+       page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       tprintf("va: %x page_index: %d host: %d\n", 
+               (int)page_sva, page_index, coren);
+       for(int i = 0; i < NUMCORESACTIVE; i++) {
+         int * local_tbl = (int *)((void *)gccachesamplingtbl_r
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+         printf("%8d ",freq);
+       }
+       printf("\n");
+  }
+  printf("=================\n");
+} // gc_output_cache_sampling
 #endif // GC_CACHE_ADAPT
 
 inline void gc_collect(struct garbagelist * stackptr) {
@@ -3572,6 +3658,9 @@ inline void gc_master(struct garbagelist * stackptr) {
 #ifdef GC_PROFILE
   gc_profileItem();
 #endif
+#ifdef GC_CACHE_ADAPT
+  //gc_output_cache_sampling();
+#endif // GC_CACHE_ADAPT
 #ifdef RAWPATH // TODO GC_DEBUG
   printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(), 
                 udn_tile_coord_y());
@@ -3882,6 +3971,9 @@ inline void gc_master(struct garbagelist * stackptr) {
 #endif
   // cache adapt phase
   cacheAdapt_mutator();
+#ifdef GC_CACHE_ADAPT_OUTPUT
+  bamboo_output_cache_policy();
+#endif
   cacheAdapt_gc(false);
   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
   while(PREFINISHPHASE == gcphase) {
@@ -4024,10 +4116,12 @@ pregccheck:
        }
 #endif
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
     // disable the timer interrupt
     bamboo_mask_timer_intr();
     // get the sampling data 
     bamboo_output_dtlb_sampling();
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
        gcprocessing = true;
        gc_master(stackptr);
@@ -4050,12 +4144,14 @@ pregccheck:
        }
 #endif
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
        // disable the timer interrupt
        bamboo_mask_timer_intr();
        if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
          // get the sampling data 
          bamboo_output_dtlb_sampling();
        }
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
     gcprocessing = true;
     gc_collect(stackptr);
@@ -4085,12 +4181,14 @@ pregccheck:
        }
 #endif
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
        // disable the timer interrupt
        bamboo_mask_timer_intr();
        if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
          // get the sampling data 
          bamboo_output_dtlb_sampling();
        }
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
     // not a gc core, should wait for gcfinish msg
     gcprocessing = true;
@@ -4104,8 +4202,10 @@ pregccheck:
     gcprocessing = false;
   }
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
   // reset the sampling arrays
   bamboo_dtlb_sampling_reset();
+#endif // GC_CACHE_SAMPLING
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
        // zero out the gccachesamplingtbl
        BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
@@ -4115,9 +4215,11 @@ pregccheck:
          BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
        }
   }
+#ifdef GC_CACHE_SAMPLING
   // enable the timer interrupt
   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING); 
   bamboo_unmask_timer_intr();
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
   return true;
 } // void gc(struct garbagelist * stackptr)