Fix performance bug in GC cache-adapt sampling: replace per-page floating-point factor computation with integer arithmetic, walk the sampling tables with incremented pointers instead of recomputing indexed offsets per core, and switch the overload/crowd policies to a total-workload-derived threshold.
[IRC.git] / Robust / src / Runtime / bamboo / multicoregarbage.c
index b5897919e609a498f463158cf7b7ab8ac4bd2ff5..9f6f714fedba8c816445a4a035f36d0d3842ae97 100644 (file)
@@ -395,6 +395,13 @@ inline int hostcore(void * ptr) {
   return host;
 } // int hostcore(void * ptr)
 
+inline void cpu2coords(int coren,
+                          int * x,
+                                          int * y) {
+  *x = bamboo_cpu2coords[2*coren];
+  *y = bamboo_cpu2coords[2*coren+1];
+} // void cpu2coords(...)
+
 inline bool isLocal(void * ptr) {
   // check if a pointer is in shared heap on this core
   return hostcore(ptr) == BAMBOO_NUM_OF_CORE;
@@ -1871,7 +1878,7 @@ inline bool initOrig_Dst(struct moveHelper * orig,
        ((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
   gc_cache_revise_infomation.to_page_index = 
        (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
-  gc_cache_revise_infomation.orig_page_start_va = -1; 
+  gc_cache_revise_infomation.orig_page_start_va = -1;
 #endif // GC_CACHE_ADAPT
 
   // init the orig ptr
@@ -1954,25 +1961,23 @@ innermoveobj:
   if(orig->ptr >= gc_cache_revise_infomation.orig_page_end_va) {
        // end of an orig page
        // compute the impact of this page for the new page
-       float tmp_factor = 
-         ((float)(to->ptr-gc_cache_revise_infomation.to_page_start_va))/
-         ((float)(BAMBOO_PAGE_SIZE));
+       int tmp_factor = to->ptr-gc_cache_revise_infomation.to_page_start_va; 
+       int topage=gc_cache_revise_infomation.to_page_index;
+       int oldpage = gc_cache_revise_infomation.orig_page_index;
+       int * newtable=&gccachesamplingtbl_r[topage];
+       int * oldtable=&gccachesamplingtbl[oldpage];
+       
        for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-         ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-                )[gc_cache_revise_infomation.to_page_index] += (int)(
-                  ((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-               gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-         // TODO
-/*       if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
-               tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
-         }*/
+         (*newtable) += (*oldtable)*tmp_factor;
+         newtable=(int*)(((char *)newtable)+size_cachesamplingtbl_local_r);
+         oldtable=(int*)(((char *)oldtable)+size_cachesamplingtbl_local);
        }
        // prepare for an new orig page
+       int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
        gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
        gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-         (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-       gc_cache_revise_infomation.orig_page_index = 
-         (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+         (BAMBOO_PAGE_SIZE)*(tmp_index+1);
+       gc_cache_revise_infomation.orig_page_index = tmp_index;
        gc_cache_revise_infomation.to_page_start_va = to->ptr;
   }
 #endif
@@ -2039,32 +2044,30 @@ innermoveobj:
       nextBlock(to);
 #ifdef GC_CACHE_ADAPT
          if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
-         // end of an to page, wrap up its information
-         float tmp_factor = 
-               ((float)(tmp_ptr-gc_cache_revise_infomation.to_page_start_va))/
-               ((float)(BAMBOO_PAGE_SIZE));
-         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-               ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-                )[gc_cache_revise_infomation.to_page_index] += (int)(
-                 ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-                 gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-               // TODO
-/*       if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
-               tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
-         }*/
-         }
-         // prepare for an new to page
-         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
-         gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.orig_page_index = 
-               (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-         gc_cache_revise_infomation.to_page_start_va = to->ptr;
-         gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.to_page_index = 
-               (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-       }
+               // end of a to page, wrap up its information
+               int tmp_factor = tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
+               int topage=gc_cache_revise_infomation.to_page_index;
+               int oldpage = gc_cache_revise_infomation.orig_page_index;
+               int * newtable=&gccachesamplingtbl_r[topage];
+               int * oldtable=&gccachesamplingtbl[oldpage];
+         
+               for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
+                 (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
+                 newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
+                 oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
+               }
+               // prepare for a new to page
+               int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+               gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
+               gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*(tmp_index+1);
+               gc_cache_revise_infomation.orig_page_index = tmp_index;
+               gc_cache_revise_infomation.to_page_start_va = to->ptr;
+               gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
+               gc_cache_revise_infomation.to_page_index = 
+                 (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+      }
 #endif // GC_CACHE_ADAPT
       if(stopblock == to->numblocks) {
                // already fulfilled the block
@@ -2111,6 +2114,7 @@ innermoveobj:
     to->ptr += isize;
     to->offset += isize;
     to->top += isize;
+#if 0
 #ifdef GC_CACHE_ADAPT
        int tmp_ptr = to->ptr;
 #endif // GC_CACHE_ADAPT
@@ -2119,36 +2123,36 @@ innermoveobj:
       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
       (*((int*)(to->base))) = to->offset;
       nextBlock(to);
-    }
 #ifdef GC_CACHE_ADAPT
-       if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
-         // end of an to page, wrap up its information
-         float tmp_factor = 
-               ((float)(tmp_ptr-gc_cache_revise_infomation.to_page_start_va))/
-               ((float)(BAMBOO_PAGE_SIZE));
-         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-               ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-                )[gc_cache_revise_infomation.to_page_index] += (int)(
-                 ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-                 gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-      // TODO
-/*       if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
-               tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
-         }*/
+         if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
+               // end of a to page, wrap up its information
+               int tmp_factor = tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
+               int topage=gc_cache_revise_infomation.to_page_index;
+               int oldpage = gc_cache_revise_infomation.orig_page_index;
+               int * newtable=&gccachesamplingtbl_r[topage];
+               int * oldtable=&gccachesamplingtbl[oldpage];
+         
+               for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
+                 (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
+                 newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
+                 oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
+               }
+               // prepare for a new to page
+               int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+               gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
+               gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
+               gc_cache_revise_infomation.orig_page_index = 
+                 (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
+               gc_cache_revise_infomation.to_page_start_va = to->ptr;
+               gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
+                 (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
+               gc_cache_revise_infomation.to_page_index = 
+                 (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
          }
-         // prepare for an new to page
-         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
-         gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.orig_page_index = 
-               (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-         gc_cache_revise_infomation.to_page_start_va = to->ptr;
-         gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
-               (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
-         gc_cache_revise_infomation.to_page_index = 
-               (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
-       }
 #endif // GC_CACHE_ADAPT
+    }
+#endif
   } // if(mark == 1)
 #ifdef DEBUG
   BAMBOO_DEBUGPRINT(0xe205);
@@ -2245,16 +2249,18 @@ innercompact:
     }
   }
 #ifdef GC_CACHE_ADAPT
-       // end of an to page, wrap up its information
-       float tmp_factor = 
-         ((float)(to->ptr-gc_cache_revise_infomation.to_page_start_va))/
-         ((float)(BAMBOO_PAGE_SIZE));
-       for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
-         ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
-          )[gc_cache_revise_infomation.to_page_index] += (int)(
-               ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
-               gc_cache_revise_infomation.orig_page_index]*tmp_factor);
-       }
+  // end of a to page, wrap up its information
+  int tmp_factor = to->ptr-gc_cache_revise_infomation.to_page_start_va;
+  int topage=gc_cache_revise_infomation.to_page_index;
+  int oldpage = gc_cache_revise_infomation.orig_page_index;
+  int * newtable=&gccachesamplingtbl_r[topage];
+  int * oldtable=&gccachesamplingtbl[oldpage];
+  
+  for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
+    (*newtable) = ((*newtable)+(*oldtable)*tmp_factor);
+    newtable=(int*) (((char *)newtable)+size_cachesamplingtbl_local_r);
+    oldtable=(int*) (((char *)oldtable)+size_cachesamplingtbl_local);
+  }
 #endif // GC_CACHE_ADAPT
   // if no objs have been compact, do nothing,
   // otherwise, fill the header of this block
@@ -2378,7 +2384,7 @@ innercompact:
        gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
          (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
        gc_cache_revise_infomation.orig_page_index = 
-         orig->blockbase/(BAMBOO_PAGE_SIZE);
+         (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
 #endif // GC_CACHE_ADAPT
     goto innercompact;
   }
@@ -2417,8 +2423,8 @@ inline void compact() {
   gc_cache_revise_infomation.orig_page_end_va = gcbaseva +  
        (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
   gc_cache_revise_infomation.orig_page_index = 
-       orig->blockbase/(BAMBOO_PAGE_SIZE);
-#endif
+       (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
+#endif // GC_CACHE_ADAPT
 
   int filledblocks = 0;
   INTPTR heaptopptr = 0;
@@ -2876,7 +2882,6 @@ void cacheAdapt_gc(bool isgccachestage) {
 
 // the master core decides how to adapt cache strategy for the mutator 
 // according to collected statistic data
-extern int gc_num_sampling;
 
 // make all pages hfh
 int cacheAdapt_policy_h4h(){
@@ -2941,8 +2946,8 @@ int cacheAdapt_policy_hotest(){
 
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          // TODO
          // check the freqency, decide if this page is hot for the core
          if(hotfreq < freq) {
@@ -2995,8 +3000,8 @@ int cacheAdapt_policy_dominate(){
        
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          totalfreq += freq;
          // TODO
          // check the freqency, decide if this page is hot for the core
@@ -3034,7 +3039,7 @@ int cacheAdapt_policy_dominate(){
   return numchanged;
 } // int cacheAdapt_policy_dominate()
 
-#define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 1000
+#define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 20000
 
 void gc_quicksort(int *array, 
                      int left,
@@ -3047,10 +3052,10 @@ void gc_quicksort(int *array,
        pivot = (left+right)/2;
        while((leftIdx <= pivot) && (rightIdx >= pivot)) {
          int pivotValue = array[pivot*3-offset];
-         while((array[leftIdx*3-offset] < pivotValue) && (leftIdx <= pivot)) {
+         while((array[leftIdx*3-offset] > pivotValue) && (leftIdx <= pivot)) {
                leftIdx++;
          }
-         while((array[rightIdx*3-offset] > pivotValue) && (rightIdx >= pivot)) {
+         while((array[rightIdx*3-offset] < pivotValue) && (rightIdx >= pivot)) {
                rightIdx--;
          }
          // swap [leftIdx] & [rightIdx]
@@ -3084,8 +3089,9 @@ int cacheAdapt_policy_overload(){
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
   int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
-  int workload[NUMCORESACTIVE];
-  memset(workload, 0, NUMCORESACTIVE*sizeof(int));
+  unsigned long long workload[NUMCORESACTIVE];
+  memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
+  unsigned long long total_workload = 0;
   int core2heavypages[NUMCORESACTIVE][page_num*3+1];
   memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
   for(page_index = 0; page_index < page_num; page_index++) {
@@ -3097,8 +3103,8 @@ int cacheAdapt_policy_overload(){
        
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          totalfreq += freq;
          // TODO
          // check the freqency, decide if this page is hot for the core
@@ -3106,6 +3112,10 @@ int cacheAdapt_policy_overload(){
                hotfreq = freq;
                hotestcore = i;
          }
+         // TODO
+         /*if(page_sva == 0x10e90000) {
+               if(freq != 0) tprintf("0x10e90000 core %d, %d\n", i, freq);
+         }*/
        }
        // TODO
        // Decide the cache strategy for this page
@@ -3128,6 +3138,7 @@ int cacheAdapt_policy_overload(){
        tmp_p++;
        numchanged++;
        workload[hotestcore] += totalfreq;
+       total_workload += totalfreq;
        // insert into core2heavypages using quicksort
        int remoteaccess = totalfreq - hotfreq;
        int index = core2heavypages[hotestcore][0];
@@ -3135,16 +3146,26 @@ int cacheAdapt_policy_overload(){
        core2heavypages[hotestcore][3*index+2] = totalfreq;
        core2heavypages[hotestcore][3*index+1] = tmp_p-1;
        core2heavypages[hotestcore][0]++;
+       // TODO
+       /*if(page_sva == 0x10f10000) {
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       int coord_x =  bamboo_cpu2coords[2*coren]+1;
+       int coord_y = bamboo_cpu2coords[2*coren+1]+1;
+         tprintf("+++ %x(%d-%d,%d) hotcore %d, total %d, hot %d, remote %d, index %d p %x\n", (int)page_sva, coren, coord_x, coord_y, hotestcore, totalfreq, hotfreq, remoteaccess, index, (int)(tmp_p-1));
+       }*/
   }
 
+  int workload_threshold = total_workload / 10;
   // Check the workload of each core
   for(int i = 0; i < NUMCORESACTIVE; i++) {
        int j = 1;
        int index = core2heavypages[i][0];
-       if(workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) {
+       if(workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) {
          // sort according to the remoteaccess
          gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-         while((workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) && (j<index*3)) {
+         while((workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) && (j<index*3)) {
                // hfh those pages with more remote accesses 
                bamboo_cache_policy_t policy = {0};
                policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
@@ -3159,7 +3180,7 @@ int cacheAdapt_policy_overload(){
 } // int cacheAdapt_policy_overload()
 
 #define GC_CACHE_ADAPT_ACCESS_THRESHOLD 70
-#define GC_CACHE_ADAPT_CROWD_THRESHOLD  10
+#define GC_CACHE_ADAPT_CROWD_THRESHOLD  20
 // Every page cached on the core that accesses it the most. 
 // Check to see if any core's pages total more accesses than threshold 
 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
@@ -3177,8 +3198,9 @@ int cacheAdapt_policy_crowd(){
   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
   int numchanged = 0;
   int * tmp_p = gccachepolicytbl+1;
-  int workload[NUMCORESACTIVE];
-  memset(workload, 0, NUMCORESACTIVE*sizeof(int));
+  unsigned long long workload[NUMCORESACTIVE];
+  memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
+  unsigned long long total_workload = 0;
   int core2heavypages[NUMCORESACTIVE][page_num*3+1];
   memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
   for(page_index = 0; page_index < page_num; page_index++) {
@@ -3190,8 +3212,8 @@ int cacheAdapt_policy_crowd(){
        
        for(int i = 0; i < NUMCORESACTIVE; i++) {
          int * local_tbl = (int *)((void *)gccachesamplingtbl_r
-                 +page_num*sizeof(float)*i);
-         int freq = local_tbl[page_index];
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
          totalfreq += freq;
          // TODO
          // check the freqency, decide if this page is hot for the core
@@ -3199,6 +3221,10 @@ int cacheAdapt_policy_crowd(){
                hotfreq = freq;
                hotestcore = i;
          }
+         // TODO
+         /*if(page_sva == 0x10e90000) {
+               if(freq != 0) tprintf("0x10e90000 core %d, %d\n", i, freq);
+         }*/
        }
        // TODO
        // Decide the cache strategy for this page
@@ -3221,6 +3247,7 @@ int cacheAdapt_policy_crowd(){
        tmp_p++;
        numchanged++;
        workload[hotestcore] += totalfreq;
+       total_workload += totalfreq;
        // insert into core2heavypages using quicksort
        int remoteaccess = totalfreq - hotfreq;
        int index = core2heavypages[hotestcore][0];
@@ -3228,16 +3255,26 @@ int cacheAdapt_policy_crowd(){
        core2heavypages[hotestcore][3*index+2] = totalfreq;
        core2heavypages[hotestcore][3*index+1] = tmp_p-1;
        core2heavypages[hotestcore][0]++;
+       // TODO
+       /*if(page_sva == 0x10f10000) {
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       int coord_x =  bamboo_cpu2coords[2*coren]+1;
+       int coord_y = bamboo_cpu2coords[2*coren+1]+1;
+         tprintf("+++ %x(%d-%d,%d) hotcore %d, total %d, hot %d, remote %d, index %d p %x\n", (int)page_sva, coren, coord_x, coord_y, hotestcore, totalfreq, hotfreq, remoteaccess, index, (int)(tmp_p-1));
+       }*/
   }
 
+  int workload_threshold = total_workload / 10;
   // Check the workload of each core
   for(int i = 0; i < NUMCORESACTIVE; i++) {
        int j = 1;
        int index = core2heavypages[i][0];
-       if(workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) {
-         // sort according to the remote access
+       if(workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) {
+         // sort according to the remoteaccess
          gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-         while((workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) && (j<index*3)) {
+         while((workload[i] > workload_threshold/*GC_CACHE_ADAPT_OVERLOAD_THRESHOLD*/) && (j<index*3)) {
                // hfh those pages with more remote accesses 
                bamboo_cache_policy_t policy = {0};
                policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
@@ -3246,46 +3283,47 @@ int cacheAdapt_policy_crowd(){
                j += 3;
          }
        }
-       
+
        // Check if the accesses are crowded on few pages
        // sort according to the total access
+inner_crowd:
        gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
        int threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
        int num_crowded = 0;
        int t_workload = 0;
-       for(;(num_crowded<GC_CACHE_ADAPT_CROWD_THRESHOLD)
-               &&(num_crowded<(index-j/3)); num_crowded++) {
+       do {
          t_workload += core2heavypages[i][j+num_crowded*3+1];
-       }
+         num_crowded++;
+       } while(t_workload < threshold);
        // num_crowded <= GC_CACHE_ADAPT_CROWD_THRESHOLD and if there are enough 
        // items, it is always == GC_CACHE_ADAPT_CROWD_THRESHOLD
-       if(t_workload > threshold) {
-inner_crowd:
+       if(num_crowded > GC_CACHE_ADAPT_CROWD_THRESHOLD) {
+//inner_crowd:
          // need to hfh these pages
          // sort the pages according to remote access
          gc_quicksort(&core2heavypages[i][0], j/3+1, j/3+num_crowded, 0);
-         while((num_crowded--) && (j < index*3)) {
+         //while((num_crowded--) && (j < index*3)) {
                // h4h those pages with more remote accesses 
                bamboo_cache_policy_t policy = {0};
                policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
                *((int*)core2heavypages[i][j]) = policy.word;
                workload[i] -= core2heavypages[i][j+1];
                t_workload -= core2heavypages[i][j+1];
-               if((j/3+GC_CACHE_ADAPT_CROWD_THRESHOLD) < index) {
+               /*if((j/3+GC_CACHE_ADAPT_CROWD_THRESHOLD) < index) {
                  t_workload += 
                        core2heavypages[i][j+GC_CACHE_ADAPT_CROWD_THRESHOLD*3+1];
-               }
+               }*/
                j += 3;
                threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
-               if(t_workload <= threshold) {
+               /*if(t_workload <= threshold) {
                  break;
                }
          }
          if((j < index*3) && (t_workload > threshold)) {
                num_crowded = ((index-j/3) > GC_CACHE_ADAPT_CROWD_THRESHOLD) ?
-                 (GC_CACHE_ADAPT_CROWD_THRESHOLD) : (index-j/3);
+                 (GC_CACHE_ADAPT_CROWD_THRESHOLD) : (index-j/3);*/
                goto inner_crowd;
-         }
+//       }
        }
   }
 
@@ -3293,14 +3331,18 @@ inner_crowd:
 } // int cacheAdapt_policy_overload()
 
 void cacheAdapt_master() {
+#ifdef GC_CACHE_ADAPT
+  //gc_output_cache_sampling_r();
+#endif // GC_CACHE_ADAPT
+  int numchanged = 0;
   // check the statistic data
   // for each page, decide the new cache strategy
-  //int numchanged = cacheAdapt_policy_h4h();
-  //int numchanged = cacheAdapt_policy_local();
-  //int numchanged = cacheAdapt_policy_hotest();
-  //int numchanged = cacheAdapt_policy_dominate();
-  int numchanged = cacheAdapt_policy_overload();
-  //int numchanged = cacheAdapt_policy_crowd();
+  numchanged = cacheAdapt_policy_h4h();
+  //numchanged = cacheAdapt_policy_local();
+  //numchanged = cacheAdapt_policy_hotest();
+  //numchanged = cacheAdapt_policy_dominate();
+  //numchanged = cacheAdapt_policy_overload();
+  //numchanged = cacheAdapt_policy_crowd();
   *gccachepolicytbl = numchanged;
   // TODO
   //if(numchanged > 0) tprintf("=================\n");
@@ -3329,6 +3371,50 @@ void cacheAdapt_mutator() {
   }
   //if(BAMBOO_NUM_OF_CORE == 0) tprintf("=================\n"); // TODO
 }
+
+void gc_output_cache_sampling() {
+  unsigned int page_index = 0;
+  VA page_sva = 0;
+  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  for(page_index = 0; page_index < page_num; page_index++) {
+       page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       tprintf("va: %x page_index: %d host: %d\n", 
+               (int)page_sva, page_index, coren);
+       for(int i = 0; i < NUMCORESACTIVE; i++) {
+         int * local_tbl = (int *)((void *)gccachesamplingtbl
+                 +size_cachesamplingtbl_local*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+         printf("%8d ",freq);
+       }
+       printf("\n");
+  }
+  printf("=================\n");
+} // gc_output_cache_sampling
+
+void gc_output_cache_sampling_r() {
+  unsigned int page_index = 0;
+  VA page_sva = 0;
+  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  for(page_index = 0; page_index < page_num; page_index++) {
+       page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
+       int block = 0;
+       BLOCKINDEX(page_sva, &block);
+       int coren = gc_block2core[block%(NUMCORES4GC*2)];
+       tprintf("va: %x page_index: %d host: %d\n", 
+               (int)page_sva, page_index, coren);
+       for(int i = 0; i < NUMCORESACTIVE; i++) {
+         int * local_tbl = (int *)((void *)gccachesamplingtbl_r
+                 +size_cachesamplingtbl_local_r*i);
+         int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+         printf("%8d ",freq);
+       }
+       printf("\n");
+  }
+  printf("=================\n");
+} // gc_output_cache_sampling
 #endif // GC_CACHE_ADAPT
 
 inline void gc_collect(struct garbagelist * stackptr) {
@@ -3572,6 +3658,9 @@ inline void gc_master(struct garbagelist * stackptr) {
 #ifdef GC_PROFILE
   gc_profileItem();
 #endif
+#ifdef GC_CACHE_ADAPT
+  //gc_output_cache_sampling();
+#endif // GC_CACHE_ADAPT
 #ifdef RAWPATH // TODO GC_DEBUG
   printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(), 
                 udn_tile_coord_y());
@@ -3882,6 +3971,9 @@ inline void gc_master(struct garbagelist * stackptr) {
 #endif
   // cache adapt phase
   cacheAdapt_mutator();
+#ifdef GC_CACHE_ADAPT_OUTPUT
+  bamboo_output_cache_policy();
+#endif
   cacheAdapt_gc(false);
   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
   while(PREFINISHPHASE == gcphase) {
@@ -4024,10 +4116,12 @@ pregccheck:
        }
 #endif
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
     // disable the timer interrupt
     bamboo_mask_timer_intr();
     // get the sampling data 
     bamboo_output_dtlb_sampling();
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
        gcprocessing = true;
        gc_master(stackptr);
@@ -4050,12 +4144,14 @@ pregccheck:
        }
 #endif
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
        // disable the timer interrupt
        bamboo_mask_timer_intr();
        if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
          // get the sampling data 
          bamboo_output_dtlb_sampling();
        }
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
     gcprocessing = true;
     gc_collect(stackptr);
@@ -4085,12 +4181,14 @@ pregccheck:
        }
 #endif
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
        // disable the timer interrupt
        bamboo_mask_timer_intr();
        if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
          // get the sampling data 
          bamboo_output_dtlb_sampling();
        }
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
     // not a gc core, should wait for gcfinish msg
     gcprocessing = true;
@@ -4104,8 +4202,10 @@ pregccheck:
     gcprocessing = false;
   }
 #ifdef GC_CACHE_ADAPT
+#ifdef GC_CACHE_SAMPLING
   // reset the sampling arrays
   bamboo_dtlb_sampling_reset();
+#endif // GC_CACHE_SAMPLING
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
        // zero out the gccachesamplingtbl
        BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
@@ -4115,9 +4215,11 @@ pregccheck:
          BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
        }
   }
+#ifdef GC_CACHE_SAMPLING
   // enable the timer interrupt
   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING); 
   bamboo_unmask_timer_intr();
+#endif // GC_CACHE_SAMPLING
 #endif // GC_CACHE_ADAPT
   return true;
 } // void gc(struct garbagelist * stackptr)