code changes...fix all of the statistics collection...clean up collection code
[IRC.git] / Robust / src / Runtime / bamboo / multicorecache.c
index a897765ca4acb348db8055149f7d33fa70e87afc..e5415f6959ee39be28b56c7fd2d7f55001ed490c 100644 (file)
@@ -3,7 +3,99 @@
 #include "multicoremsg.h"
 #include "multicoregcprofile.h"
 
-gc_cache_revise_info_t gc_cache_revise_information;
+void cacheadapt_finish_compact(void *toptr) { // scale down by 64 the revised-table row of the page holding toptr
+  unsigned int dstpage=(toptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS; // page index of toptr relative to gcbaseva
+  unsigned int * newtable=&gccachesamplingtbl_r[dstpage*NUMCORESACTIVE]; // row start: NUMCORESACTIVE entries per page
+  // NOTE(review): the >>6 presumably undoes the 64ths-of-a-page weighting (see page64th in cacheadapt_finish_src_page) -- confirm
+  for(int core = 0; core < NUMCORESACTIVE; core++) {
+    (*newtable)=(*newtable)>>6; // divide this core's entry by 64
+    newtable++;
+  }  
+}
+
+void cacheadapt_finish_src_page(void *srcptr, void *tostart, void *tofinish) { // add the src page's per-core samples to the dst page's revised row, weighted by tofinish-tostart
+  unsigned int srcpage=(srcptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS; // page index of srcptr relative to gcbaseva
+  unsigned int dstpage=(tostart-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS; // page index of tostart relative to gcbaseva
+  unsigned int numbytes=tofinish-tostart; // size in bytes of the range [tostart,tofinish)
+  // each page owns a row of NUMCORESACTIVE entries; the source row is only read, the revised (_r) row is accumulated into
+  unsigned int * oldtable=&gccachesamplingtbl[srcpage*NUMCORESACTIVE];
+  unsigned int * newtable=&gccachesamplingtbl_r[dstpage*NUMCORESACTIVE];
+  // numbytes in 64ths of a page, rounded down (assumes BAMBOO_PAGE_SIZE == 1<<BAMBOO_PAGE_SIZE_BITS)
+  unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
+
+  for(int core = 0; core < NUMCORESACTIVE; core++) {
+    (*newtable)+=page64th*(*oldtable); // weighted accumulate; no >>6 is applied here
+    newtable++;
+    oldtable++;
+  }  
+}
+
+/* bytesneeded == 0 is a special case: it means that we should just finish (close out) the dst page. */
+
+void cacheadapt_finish_dst_page(void *origptr, void *tostart, void *toptr, unsigned int bytesneeded) { // step through bytesneeded bytes page by page, folding orig-page samples into the revised (_r) table
+  unsigned int numbytes=toptr-tostart; // bytes in [tostart,toptr); only the first loop pass uses it (reset to 0 at the loop end)
+  // tobound: end of the page containing toptr-1; origbound: end of the page containing origptr
+  void *tobound=(void *)((((unsigned INTPTR)toptr-1)&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
+  void *origbound=(void *)((((unsigned INTPTR)origptr)&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
+  // page indices relative to gcbaseva; the to-side uses toptr-1, so a toptr sitting on a page boundary maps to the preceding page
+  unsigned int topage=(toptr-1-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS; 
+  unsigned int origpage=(origptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;
+  // each page owns a row of NUMCORESACTIVE entries in both tables
+  unsigned int * totable=&gccachesamplingtbl_r[topage*NUMCORESACTIVE];
+  unsigned int * origtable=&gccachesamplingtbl[origpage*NUMCORESACTIVE];
+
+  //bytesneeded==0 forces remaintobytes to 0, so the first pass takes the close-out branch below
+  unsigned int remaintobytes=(bytesneeded==0)?0:(tobound-toptr);
+  unsigned int remainorigbytes=origbound-origptr;
+
+  do {
+    //cap the source bytes at bytesneeded....don't want to close out page if not necessary
+    remainorigbytes=(remainorigbytes>bytesneeded)?bytesneeded:remainorigbytes;
+
+    if (remaintobytes<=remainorigbytes) {
+      //The to page fills up first (or at the same time): close it out
+
+      numbytes+=remaintobytes;
+      unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6); // weight in 64ths of a page, rounded down
+
+      for(int core = 0; core < NUMCORESACTIVE; core++) {
+       (*totable)=(*totable+page64th*(*origtable))>>6; // add the weighted sample, then divide the entry by 64
+       totable++;
+       origtable++;
+      }
+      toptr+=remaintobytes;
+      origptr+=remaintobytes;
+      bytesneeded-=remaintobytes;
+      topage++;//to page is definitely done
+      tobound+=BAMBOO_PAGE_SIZE;
+      origpage=(origptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;//recomputed because origptr may have landed exactly on a page boundary
+      origbound=(void *) ((((unsigned INTPTR)origptr)&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
+    } else {
+      //Finishing off orig page; the to page stays open, so no >>6 here
+
+      numbytes+=remainorigbytes;
+      unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6); // weight in 64ths of a page, rounded down
+      
+      for(int core = 0; core < NUMCORESACTIVE; core++) {
+       (*totable)+=page64th*(*origtable);
+       totable++;
+       origtable++;
+      }
+      toptr+=remainorigbytes;
+      origptr+=remainorigbytes;
+      bytesneeded-=remainorigbytes;
+      origpage++;//just orig page is done
+      origbound+=BAMBOO_PAGE_SIZE;
+    }
+    totable=&gccachesamplingtbl_r[topage*NUMCORESACTIVE]; // re-point both rows at the (possibly advanced) pages
+    origtable=&gccachesamplingtbl[origpage*NUMCORESACTIVE];
+    
+    remaintobytes=tobound-toptr;
+    remainorigbytes=origbound-origptr;
+    
+    numbytes=0; // later passes carry no [tostart,toptr) prefix
+  } while(bytesneeded!=0);
+}
 
 // prepare for cache adaption:
 //   -- flush the shared heap
@@ -16,8 +108,11 @@ void cacheAdapt_gc(bool isgccachestage) {
   // clean the dtlb entries
   BAMBOO_CLEAN_DTLB();
 
-  // change the cache strategy
-  gccachestage = isgccachestage;
+  if(isgccachestage) {
+    bamboo_install_dtlb_handler_for_gc();
+  } else {
+    bamboo_install_dtlb_handler_for_mutator();
+  }
 } 
 
 // the master core decides how to adapt cache strategy for the mutator 
@@ -26,10 +121,10 @@ void cacheAdapt_gc(bool isgccachestage) {
 // find the core that accesses the page #page_index most
 #define CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq) \
   { \
-    int *local_tbl=&gccachesamplingtbl_r[page_index]; \
+    unsigned int *local_tbl=&gccachesamplingtbl_r[page_index*NUMCORESACTIVE];  \
     for(int i = 0; i < NUMCORESACTIVE; i++) { \
       int freq = *local_tbl; \
-      local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r); \
+      local_tbl++; \
       if(hotfreq < freq) { \
         hotfreq = freq; \
         hottestcore = i; \
@@ -40,10 +135,10 @@ void cacheAdapt_gc(bool isgccachestage) {
 // access time of the page at the same time
 #define CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq) \
   { \
-    int *local_tbl=&gccachesamplingtbl_r[page_index]; \
+    unsigned int *local_tbl=&gccachesamplingtbl_r[page_index*NUMCORESACTIVE];  \
     for(int i = 0; i < NUMCORESACTIVE; i++) { \
       int freq = *local_tbl; \
-      local_tbl=(int *)(((char *)local_tbl)+size_cachesamplingtbl_local_r); \
+      local_tbl++; \
       totalfreq += freq; \
       if(hotfreq < freq) { \
         hotfreq = freq; \
@@ -67,12 +162,12 @@ void cacheAdapt_gc(bool isgccachestage) {
 
 // make all pages hfh
 void cacheAdapt_policy_h4h(int coren){
-  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
   unsigned int page_gap=page_num/NUMCORESACTIVE;
   unsigned int page_index=page_gap*coren;
   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
-  int * tmp_p = gccachepolicytbl;
+  unsigned int * tmp_p = gccachepolicytbl;
   for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
@@ -83,16 +178,16 @@ void cacheAdapt_policy_h4h(int coren){
 
 // make all pages local as non-cache-adaptable gc local mode
 void cacheAdapt_policy_local(int coren){
-  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
   unsigned int page_gap=page_num/NUMCORESACTIVE;
   unsigned int page_index=page_gap*coren;
   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
-  int * tmp_p = gccachepolicytbl;
+  unsigned int * tmp_p = gccachepolicytbl;
   for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int block = 0;
-    BLOCKINDEX((void *) page_sva, block);
+    BLOCKINDEX(block, (void *) page_sva);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
     CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
@@ -101,12 +196,12 @@ void cacheAdapt_policy_local(int coren){
 } 
 
 void cacheAdapt_policy_hottest(int coren){
-  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
   unsigned int page_gap=page_num/NUMCORESACTIVE;
   unsigned int page_index=page_gap*coren;
   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
-  int * tmp_p = gccachepolicytbl;
+  unsigned int * tmp_p = gccachepolicytbl;
   for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int hottestcore = 0;
@@ -127,21 +222,21 @@ void cacheAdapt_policy_hottest(int coren){
   }
 } 
 
-#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  64
+#define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  1
 // cache the page on the core that accesses it the most if that core accesses 
 // it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total.  Otherwise,
 // h4h the page.
 void cacheAdapt_policy_dominate(int coren){
-  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_PAGE_SIZE);
+  unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
   unsigned int page_gap=page_num/NUMCORESACTIVE;
   unsigned int page_index=page_gap*coren;
   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
-  int * tmp_p = gccachepolicytbl;
+  unsigned int * tmp_p = gccachepolicytbl;
   for(; page_index < page_index_end; page_index++) {
     bamboo_cache_policy_t policy = {0};
     unsigned int hottestcore = 0;
-    unsigned long long totalfreq = 0;
+    unsigned int totalfreq = 0;
     unsigned int hotfreq = 0;
     CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
     // Decide the cache strategy for this page
@@ -149,10 +244,14 @@ void cacheAdapt_policy_dominate(int coren){
     // the gcpolicytbl 
     // Format: page start va + cache policy
     if(hotfreq != 0) {
-      totalfreq=(totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)>>7;
-      if(hotfreq < totalfreq) {
+      totalfreq=totalfreq>>GC_CACHE_ADAPT_DOMINATE_THRESHOLD;
+      if((unsigned int)hotfreq < (unsigned int)totalfreq) {
         // use hfh
         policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
+        /*unsigned int block = 0;
+        BLOCKINDEX(block, (void *) page_sva);
+        unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
+        CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);*/
       } else {
         // locally cache the page in the hottest core
         CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
@@ -163,204 +262,6 @@ void cacheAdapt_policy_dominate(int coren){
   }
 }
 
-#if 0
-#define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 10
-// record the worklocad of the hottestcore into core2heavypages
-#define CACHEADAPT_RECORD_PAGE_WORKLOAD(hottestcore,totalfreq,hotfreq,remoteaccess,tmp_p) \
-  { \
-    workload[hottestcore] += (totalfreq); \
-    total_workload += (totalfreq); \
-    unsigned long long remoteaccess = (totalfreq) - (hotfreq); \
-    unsigned int index = (unsigned int)core2heavypages[hottestcore][0]; \
-    core2heavypages[hottestcore][3*index+3] = (remoteaccess); \
-    core2heavypages[hottestcore][3*index+2] = (totalfreq); \
-    core2heavypages[hottestcore][3*index+1] = (unsigned long long)((tmp_p)-1); \
-    core2heavypages[hottestcore][0]++; \
-  }
-
-void gc_quicksort(unsigned long long *array,unsigned int left,unsigned int right,unsigned int offset) {
-  unsigned int pivot = 0;;
-  unsigned int leftIdx = left;
-  unsigned int rightIdx = right;
-  if((right-left+1) >= 1) {
-    pivot = (left+right)/2;
-    while((leftIdx <= pivot) && (rightIdx >= pivot)) {
-      unsigned long long pivotValue = array[pivot*3-offset];
-      while((array[leftIdx*3-offset] > pivotValue) && (leftIdx <= pivot)) {
-        leftIdx++;
-      }
-      while((array[rightIdx*3-offset] < pivotValue) && (rightIdx >= pivot)) {
-        rightIdx--;
-      }
-      // swap [leftIdx] & [rightIdx]
-      for(int k = 0; k < 3; k++) {
-        unsigned long long tmp = array[3*rightIdx-k];
-        array[3*rightIdx-k] = array[3*leftIdx-k];
-        array[3*leftIdx-k] = tmp;
-      }
-      leftIdx++;
-      rightIdx--;
-      if((leftIdx-1) == pivot) {
-        pivot = rightIdx = rightIdx + 1;
-      } else if((leftIdx+1) == pivot) {
-        pivot = leftIdx = leftIdx-1;
-      }
-    }
-    gc_quicksort(array, left, pivot-1, offset);
-    gc_quicksort(array, pivot+1, right, offset);
-  }
-  return;
-}
-
-INLINE int cacheAdapt_h4h_remote_accesses(unsigned long long workload_threshold,unsigned long long ** core2heavypages, unsigned long long * workload,int i) {
-  int j = 1;
-  unsigned int index = (unsigned int)core2heavypages[i][0];
-  if(workload[i] > workload_threshold) {
-    // sort according to the remoteaccess
-    gc_quicksort(&core2heavypages[i][0], 1, index, 0);
-    while((workload[i] > workload_threshold) && (j<index*3)) {
-      // hfh those pages with more remote accesses 
-      bamboo_cache_policy_t policy = {0};
-      policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-      *((unsigned int*)core2heavypages[i][j]) = policy.word;
-      workload[i] -= core2heavypages[i][j+1];
-      j += 3;
-    }
-  }
-  return j;
-}
-
-// Every page cached on the core that accesses it the most. 
-// Check to see if any core's pages total more accesses than threshold 
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
-// most remote accesses and hash for home them until we get below 
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD
-int cacheAdapt_policy_overload(int coren){
-  unsigned int page_index = 0;
-  VA page_sva = gcbaseva;
-  unsigned int page_num = BAMBOO_SHARED_MEM_SIZE/BAMBOO_PAGE_SIZE;
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  unsigned long long workload[NUMCORESACTIVE];
-  memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
-  unsigned long long total_workload = 0;
-  unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
-  memset(core2heavypages,0,sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
-  for(page_index = 0; page_sva < gctopva; page_index++) {
-    bamboo_cache_policy_t policy = {0};
-    unsigned int hottestcore = 0;
-    unsigned long long totalfreq = 0;
-    unsigned int hotfreq = 0;
-    CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
-    // Decide the cache strategy for this page
-    // If decide to adapt a new cache strategy, write into the shared block of
-    // the gcsharedsamplingtbl. The mem recording information that has been 
-    // written is enough to hold the information.
-    // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq != 0) {
-      totalfreq/=BAMBOO_PAGE_SIZE;
-      hotfreq/=BAMBOO_PAGE_SIZE;
-      // locally cache the page in the hottest core
-      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
-      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
-      CACHEADAPT_RECORD_PAGE_WORKLOAD(hottestcore,totalfreq,hotfreq,remoteaccess,tmp_p);    
-    }
-    page_sva += BAMBOO_PAGE_SIZE;
-  }
-
-  unsigned long long workload_threshold=total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
-  // Check the workload of each core
-  for(int i = 0; i < NUMCORESACTIVE; i++) {
-    cacheAdapt_h4h_remote_accesses(workload_threshold,core2heavypages,workload,i);
-  }
-
-  return numchanged;
-}
-
-#define GC_CACHE_ADAPT_ACCESS_THRESHOLD 70
-#define GC_CACHE_ADAPT_CROWD_THRESHOLD  20
-// Every page cached on the core that accesses it the most. 
-// Check to see if any core's pages total more accesses than threshold 
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
-// most remote accesses and hash for home them until we get below 
-// GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  
-// Sort pages based on activity.... 
-// If more then GC_CACHE_ADAPT_ACCESS_THRESHOLD% of the accesses for a
-// core's pages are from more than GC_CACHE_ADAPT_CROWD_THRESHOLD pages, 
-// then start hfh these pages(selecting the ones with the most remote 
-// accesses first or fewest local accesses) until we get below 
-// GC_CACHE_ADAPT_CROWD_THRESHOLD pages.
-int cacheAdapt_policy_crowd(int coren){
-  unsigned int page_index = 0;
-  VA page_sva = gcbaseva;
-  unsigned int page_num = BAMBOO_SHARED_MEM_SIZE/BAMBOO_PAGE_SIZE;
-  unsigned int numchanged = 0;
-  int * tmp_p = gccachepolicytbl+1;
-  unsigned long long workload[NUMCORESACTIVE];
-  memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
-  unsigned long long total_workload = 0;
-  unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
-  memset(core2heavypages,0,sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
-  for(page_index = 0; page_sva < gctopva; page_index++) {
-    bamboo_cache_policy_t policy = {0};
-    unsigned int hottestcore = 0;
-    unsigned long long totalfreq = 0;
-    unsigned int hotfreq = 0;
-    CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
-    // Decide the cache strategy for this page
-    // If decide to adapt a new cache strategy, write into the shared block of
-    // the gcsharedsamplingtbl. The mem recording information that has been 
-    // written is enough to hold the information.
-    // Format: page start va + cache strategy(hfh/(host core+[x,y]))
-    if(hotfreq != 0) {
-      totalfreq/=BAMBOO_PAGE_SIZE;
-      hotfreq/=BAMBOO_PAGE_SIZE;
-      // locally cache the page in the hottest core
-      CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
-      CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy,numchanged);
-      CACHEADAPT_RECORD_PAGE_WORKLOAD(hottestcore,totalfreq,hotfreq,remoteaccess,tmp_p);
-    }
-    page_sva += BAMBOO_PAGE_SIZE;
-  }
-
-  unsigned long long workload_threshold=total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
-  // Check the workload of each core
-  for(int i = 0; i < NUMCORESACTIVE; i++) {
-    unsigned int index=(unsigned int)core2heavypages[i][0];
-    int j=cacheAdapt_h4h_remote_accesses(workload_threshold,core2heavypages,workload,i);
-    // Check if the accesses are crowded on few pages
-    // sort according to the total access
-inner_crowd:
-    gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
-    unsigned long long threshold=GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
-    int num_crowded = 0;
-    unsigned long long t_workload = 0;
-    do {
-      t_workload += core2heavypages[i][j+num_crowded*3+1];
-      num_crowded++;
-    } while(t_workload < threshold);
-    // num_crowded <= GC_CACHE_ADAPT_CROWD_THRESHOLD and if there are enough 
-    // items, it is always == GC_CACHE_ADAPT_CROWD_THRESHOLD
-    if(num_crowded > GC_CACHE_ADAPT_CROWD_THRESHOLD) {
-      // need to hfh these pages
-      // sort the pages according to remote access
-      gc_quicksort(&core2heavypages[i][0], j/3+1, j/3+num_crowded, 0);
-      // h4h those pages with more remote accesses 
-      bamboo_cache_policy_t policy = {0};
-      policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
-      *((unsigned int*)core2heavypages[i][j]) = policy.word;
-      workload[i] -= core2heavypages[i][j+1];
-      t_workload -= core2heavypages[i][j+1];
-      j += 3;
-      threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
-      goto inner_crowd;
-    }
-  }
-
-  return numchanged;
-} 
-#endif
-
 unsigned int cacheAdapt_decision(int coren) {
   BAMBOO_CACHE_MF();
   // check the statistic data
@@ -373,10 +274,6 @@ unsigned int cacheAdapt_decision(int coren) {
   cacheAdapt_policy_hottest(coren);
 #elif defined GC_CACHE_ADAPT_POLICY4
   cacheAdapt_policy_dominate(coren);
-//#elif defined GC_CACHE_ADAPT_POLICY5
-//  cacheAdapt_policy_overload(coren);
-//#elif defined GC_CACHE_ADAPT_POLICY6
-//  cacheAdapt_policy_crowd(coren);
 #endif
 }
 
@@ -384,7 +281,7 @@ unsigned int cacheAdapt_decision(int coren) {
 void cacheAdapt_mutator() {
   BAMBOO_CACHE_MF();
   // check the changes and adapt them
-  int * tmp_p = gccachepolicytbl;
+  unsigned int * tmp_p = gccachepolicytbl;
   unsigned int page_sva = gcbaseva;
   for(; page_sva<gctopva; page_sva+=BAMBOO_PAGE_SIZE) {
     // read out the policy
@@ -397,6 +294,7 @@ void cacheAdapt_mutator() {
   }
 }
 
+// Cache adapt phase process for clients
 void cacheAdapt_phase_client() {
   WAITFORGCPHASE(CACHEPOLICYPHASE);
   GC_PRINTF("Start cachepolicy phase\n");
@@ -413,7 +311,7 @@ void cacheAdapt_phase_client() {
   //send init finish msg to core coordinator
   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE);
   GC_PRINTF("Finish prefinish phase\n");
-  CACHEADAPT_SAMPING_RESET();
+  CACHEADAPT_SAMPLING_RESET();
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
     // zero out the gccachesamplingtbl
     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);  
@@ -423,6 +321,7 @@ void cacheAdapt_phase_client() {
 
 extern unsigned long long gc_output_cache_policy_time;
 
+// Cache adapt phase process for the master
 void cacheAdapt_phase_master() {
   GCPROFILE_ITEM();
   unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
@@ -435,7 +334,7 @@ void cacheAdapt_phase_master() {
   GC_PRINTF("Start cachepolicy phase \n");
   // cache adapt phase
   cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
-  GC_CHECK_ALL_CORE_STATUS(CACHEPOLICYPHASE==gc_status_info.gcphase);
+  GC_CHECK_ALL_CORE_STATUS();
   BAMBOO_CACHE_MF();
 
   // let all cores to adopt new policies
@@ -446,9 +345,9 @@ void cacheAdapt_phase_master() {
   // cache adapt phase
   cacheAdapt_mutator();
   cacheAdapt_gc(false);
-  GC_CHECK_ALL_CORE_STATUS(PREFINISHPHASE==gc_status_info.gcphase);
-
-  CACHEADAPT_SAMPING_RESET();
+  GC_CHECK_ALL_CORE_STATUS();
+  
+  CACHEADAPT_SAMPLING_RESET();
   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
     // zero out the gccachesamplingtbl
     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
@@ -457,29 +356,48 @@ void cacheAdapt_phase_master() {
   }
 }
 
+// output original cache sampling data for each page
 void gc_output_cache_sampling() {
+  extern volatile bool gc_profile_flag;
+  if(!gc_profile_flag) return;
   unsigned int page_index = 0;
   VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) >> (BAMBOO_PAGE_SIZE_BITS);
   for(page_index = 0; page_index < page_num; page_index++) {
     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
     unsigned int block = 0;
-    BLOCKINDEX((void *) page_sva, block);
+    BLOCKINDEX(block, (void *) page_sva);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
+    //printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
+    unsigned int * local_tbl = &gccachesamplingtbl[page_index*NUMCORESACTIVE];
+    int accesscore = 0;
     for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int * local_tbl = (int *)((void *)gccachesamplingtbl+size_cachesamplingtbl_local*i);
-      int freq = local_tbl[page_index];
-      //if(freq != 0) {
-        printf("%d,  ", freq);
-      //}
+      int freq = *local_tbl;
+      local_tbl++;
+      if(freq != 0) {
+        accesscore++;
+        //printf("%d,  ", freq);
+      }
     }
-    printf("\n");
+    if(accesscore!=0) {
+      printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
+      unsigned int * local_tbl = &gccachesamplingtbl[page_index*NUMCORESACTIVE];
+      for(int i = 0; i < NUMCORESACTIVE; i++) {
+        unsigned int freq = *local_tbl;
+        local_tbl++;
+        printf("%u,  ", freq);
+      }
+      printf("\n");
+    }
+    //printf("\n");
   }
   printf("=================\n");
 } 
 
+// output revised cache sampling data for each page after compaction
 void gc_output_cache_sampling_r() {
+  extern volatile bool gc_profile_flag;
+  if(!gc_profile_flag) return;
   // TODO summary data
   unsigned int sumdata[NUMCORESACTIVE][NUMCORESACTIVE];
   for(int i = 0; i < NUMCORESACTIVE; i++) {
@@ -490,37 +408,42 @@ void gc_output_cache_sampling_r() {
   tprintf("cache sampling_r \n");
   unsigned int page_index = 0;
   VA page_sva = 0;
-  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
+  unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) >> (BAMBOO_PAGE_SIZE_BITS);
   for(page_index = 0; page_index < page_num; page_index++) {
     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
     unsigned int block = 0;
-    BLOCKINDEX((void *)page_sva, block);
+    BLOCKINDEX(block, (void *)page_sva);
     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
-    printf(" %x,  %d,  %d,  ",(int)page_sva,page_index,coren);
+    //printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
     int accesscore = 0; // TODO
+    unsigned int * local_tbl = &gccachesamplingtbl_r[page_index*NUMCORESACTIVE];
     for(int i = 0; i < NUMCORESACTIVE; i++) {
-      int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
-      int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
-      printf("%d,  ", freq);
+      unsigned int freq = *local_tbl; 
+      //printf("%d,  ", freq);
       if(freq != 0) {
         accesscore++;// TODO
       }
+      local_tbl++;
     }
     if(accesscore!=0) {
+      printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
+      unsigned int * local_tbl = &gccachesamplingtbl_r[page_index*NUMCORESACTIVE];
       for(int i = 0; i < NUMCORESACTIVE; i++) {
-        int * local_tbl = (int *)((void *)gccachesamplingtbl_r+size_cachesamplingtbl_local_r*i);
-        int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
+        unsigned int freq = *local_tbl;
+        printf("%u,  ", freq);
         sumdata[accesscore-1][i]+=freq;
+        local_tbl++;
       }
-    }
-  
-    printf("\n");
+      printf("\n");
+    }  
+    //printf("\n");
   }
+  printf("+++++\n");
   // TODO printout the summary data
   for(int i = 0; i < NUMCORESACTIVE; i++) {
     printf("%d  ", i);
     for(int j = 0; j < NUMCORESACTIVE; j++) {
-      printf(" %d  ", sumdata[j][i]);
+      printf(" %u  ", sumdata[j][i]);
     }
     printf("\n");
   }