Robust/src/Runtime/bamboo/multicoregarbage.c

   1 #ifdef MULTICORE_GC
   2 #include "runtime.h"
   3 #include "multicoregarbage.h"
   4 #include "multicoreruntime.h"
   5 #include "runtime_arch.h"
   6 #include "SimpleHash.h"
   7 #include "GenericHashtable.h"
   8 #include "ObjectHash.h"
   9 #include "GCSharedHash.h"
  10
  11 extern int corenum;
  12 extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
  13 extern int numqueues[][NUMCLASSES];
  14
  15 extern struct genhashtable * activetasks;
  16 extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
  17 extern struct taskparamdescriptor *currtpd;
  18
  19 extern struct LockValue runtime_locks[MAXTASKPARAMS];
  20 extern int runtime_locklen;
  21
  22 #ifdef SMEMM
  23 extern unsigned int gcmem_mixed_threshold;
  24 extern unsigned int gcmem_mixed_usedmem;
  25 #endif
  26
  27 struct pointerblock {
  28   void * ptrs[NUMPTRS];
  29   struct pointerblock *next;
  30 };
  31
  32 struct pointerblock *gchead=NULL;
  33 int gcheadindex=0;
  34 struct pointerblock *gctail=NULL;
  35 int gctailindex=0;
  36 struct pointerblock *gctail2=NULL;
  37 int gctailindex2=0;
  38 struct pointerblock *gcspare=NULL;
  39
  40 #define NUMLOBJPTRS 20
  41
  42 struct lobjpointerblock {
  43   void * lobjs[NUMLOBJPTRS];
  44   int lengths[NUMLOBJPTRS];
  45   int hosts[NUMLOBJPTRS];
  46   struct lobjpointerblock *next;
  47   struct lobjpointerblock *prev;
  48 };
  49
  50 struct lobjpointerblock *gclobjhead=NULL;
  51 int gclobjheadindex=0;
  52 struct lobjpointerblock *gclobjtail=NULL;
  53 int gclobjtailindex=0;
  54 struct lobjpointerblock *gclobjtail2=NULL;
  55 int gclobjtailindex2=0;
  56 struct lobjpointerblock *gclobjspare=NULL;
  57
  58 #ifdef GC_CACHE_ADAPT
  59 typedef struct gc_cache_revise_info {
  60   int orig_page_start_va;
  61   int orig_page_end_va;
  62   int orig_page_index;
  63   int to_page_start_va;
  64   int to_page_end_va;
  65   int to_page_index;
  66   unsigned int revised_sampling[NUMCORESACTIVE];
  67 } gc_cache_revise_info_t;
  68 gc_cache_revise_info_t gc_cache_revise_infomation;
  69 #endif// GC_CACHE_ADAPT
  70
  71 #ifdef GC_DEBUG
  72 // dump whole mem in blocks
  73 inline void dumpSMem() {
  74   int block = 0;
  75   int sblock = 0;
  76   int j = 0;
  77   int i = 0;
  78   int coren = 0;
  79   int x = 0;
  80   int y = 0;
  81   printf("(%x,%x) Dump shared mem: \n", udn_tile_coord_x(),
  82              udn_tile_coord_y());
  83   // reserved blocks for sblocktbl
  84   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(),
  85              udn_tile_coord_y());
  86   for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
  87     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
  88                    udn_tile_coord_x(), udn_tile_coord_y(),
  89            *((int *)(i)), *((int *)(i + 4)),
  90            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
  91            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
  92            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
  93            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
  94            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
  95            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
  96            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
  97   }
  98   sblock = gcreservedsb;
  99   bool advanceblock = false;
 100   // remaining memory
 101   for(i=gcbaseva; i<gcbaseva+BAMBOO_SHARED_MEM_SIZE; i+=4*16) {
 102     advanceblock = false;
 103     // computing sblock # and block #, core coordinate (x,y) also
 104     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
 105       // finished a sblock
 106       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
 107                 if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
 108                   // finished a block
 109                   block++;
 110                   advanceblock = true;
 111                 }
 112       } else {
 113                 // finished a block
 114                 block++;
 115                 advanceblock = true;
 116       }
 117       // compute core #
 118       if(advanceblock) {
 119                 coren = gc_block2core[block%(NUMCORES4GC*2)];
 120       }
 121       // compute core coordinate
 122       BAMBOO_COORDS(coren, &x, &y);
 123       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
 124                      udn_tile_coord_x(), udn_tile_coord_y(),
 125              block, sblock++, x, y,
 126              (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
 127     }
 128     j++;
 129     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
 130                    udn_tile_coord_x(), udn_tile_coord_y(),
 131            *((int *)(i)), *((int *)(i + 4)),
 132            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
 133            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
 134            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
 135            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
 136            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
 137            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
 138            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
 139   }
 140   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
 141 }
 142 #endif
 143
 144 // should be invoked with interruption closed
 145 inline void gc_enqueue_I(void *ptr) {
 146 #ifdef DEBUG
 147   BAMBOO_DEBUGPRINT(0xe601);
 148   BAMBOO_DEBUGPRINT_REG(ptr);
 149 #endif
 150   if (gcheadindex==NUMPTRS) {
 151     struct pointerblock * tmp;
 152     if (gcspare!=NULL) {
 153       tmp=gcspare;
 154       gcspare=NULL;
 155     } else {
 156       tmp=RUNMALLOC_I(sizeof(struct pointerblock));
 157     }  // if (gcspare!=NULL)
 158     gchead->next=tmp;
 159     gchead=tmp;
 160     gcheadindex=0;
 161   } // if (gcheadindex==NUMPTRS)
 162   gchead->ptrs[gcheadindex++]=ptr;
 163 #ifdef DEBUG
 164   BAMBOO_DEBUGPRINT(0xe602);
 165 #endif
 166 } // void gc_enqueue_I(void *ptr)
 167
 168 // dequeue and destroy the queue
 169 inline void * gc_dequeue_I() {
 170   if (gctailindex==NUMPTRS) {
 171     struct pointerblock *tmp=gctail;
 172     gctail=gctail->next;
 173     gctailindex=0;
 174     if (gcspare!=NULL) {
 175       RUNFREE(tmp);
 176     } else {
 177       gcspare=tmp;
 178     }  // if (gcspare!=NULL)
 179   } // if (gctailindex==NUMPTRS)
 180   return gctail->ptrs[gctailindex++];
 181 } // void * gc_dequeue()
 182
 183 // dequeue and do not destroy the queue
 184 inline void * gc_dequeue2_I() {
 185   if (gctailindex2==NUMPTRS) {
 186     struct pointerblock *tmp=gctail2;
 187     gctail2=gctail2->next;
 188     gctailindex2=0;
 189   } // if (gctailindex2==NUMPTRS)
 190   return gctail2->ptrs[gctailindex2++];
 191 } // void * gc_dequeue2()
 192
 193 inline int gc_moreItems_I() {
 194   if ((gchead==gctail)&&(gctailindex==gcheadindex))
 195     return 0;
 196   return 1;
 197 } // int gc_moreItems()
 198
 199 inline int gc_moreItems2_I() {
 200   if ((gchead==gctail2)&&(gctailindex2==gcheadindex))
 201     return 0;
 202   return 1;
 203 } // int gc_moreItems2()
 204
 205 // should be invoked with interruption closed
 206 // enqueue a large obj: start addr & length
 207 inline void gc_lobjenqueue_I(void *ptr,
 208                              int length,
 209                              int host) {
 210 #ifdef DEBUG
 211   BAMBOO_DEBUGPRINT(0xe901);
 212 #endif
 213   if (gclobjheadindex==NUMLOBJPTRS) {
 214     struct lobjpointerblock * tmp;
 215     if (gclobjspare!=NULL) {
 216       tmp=gclobjspare;
 217       gclobjspare=NULL;
 218     } else {
 219       tmp=RUNMALLOC_I(sizeof(struct lobjpointerblock));
 220     }  // if (gclobjspare!=NULL)
 221     gclobjhead->next=tmp;
 222     tmp->prev = gclobjhead;
 223     gclobjhead=tmp;
 224     gclobjheadindex=0;
 225   } // if (gclobjheadindex==NUMLOBJPTRS)
 226   gclobjhead->lobjs[gclobjheadindex]=ptr;
 227   gclobjhead->lengths[gclobjheadindex]=length;
 228   gclobjhead->hosts[gclobjheadindex++]=host;
 229 #ifdef DEBUG
 230   BAMBOO_DEBUGPRINT_REG(gclobjhead->lobjs[gclobjheadindex-1]);
 231   BAMBOO_DEBUGPRINT_REG(gclobjhead->lengths[gclobjheadindex-1]);
 232   BAMBOO_DEBUGPRINT_REG(gclobjhead->hosts[gclobjheadindex-1]);
 233 #endif
 234 } // void gc_lobjenqueue_I(void *ptr...)
 235
 236 // dequeue and destroy the queue
 237 inline void * gc_lobjdequeue_I(int * length,
 238                                int * host) {
 239   if (gclobjtailindex==NUMLOBJPTRS) {
 240     struct lobjpointerblock *tmp=gclobjtail;
 241     gclobjtail=gclobjtail->next;
 242     gclobjtailindex=0;
 243     gclobjtail->prev = NULL;
 244     if (gclobjspare!=NULL) {
 245       RUNFREE(tmp);
 246     } else {
 247       gclobjspare=tmp;
 248       tmp->next = NULL;
 249       tmp->prev = NULL;
 250     }  // if (gclobjspare!=NULL)
 251   } // if (gclobjtailindex==NUMLOBJPTRS)
 252   if(length != NULL) {
 253     *length = gclobjtail->lengths[gclobjtailindex];
 254   }
 255   if(host != NULL) {
 256     *host = (int)(gclobjtail->hosts[gclobjtailindex]);
 257   }
 258   return gclobjtail->lobjs[gclobjtailindex++];
 259 } // void * gc_lobjdequeue()
 260
 261 inline int gc_lobjmoreItems_I() {
 262   if ((gclobjhead==gclobjtail)&&(gclobjtailindex==gclobjheadindex))
 263     return 0;
 264   return 1;
 265 } // int gc_lobjmoreItems()
 266
 267 // dequeue and don't destroy the queue
 268 inline void gc_lobjdequeue2_I() {
 269   if (gclobjtailindex2==NUMLOBJPTRS) {
 270     gclobjtail2=gclobjtail2->next;
 271     gclobjtailindex2=1;
 272   } else {
 273     gclobjtailindex2++;
 274   }  // if (gclobjtailindex2==NUMLOBJPTRS)
 275 } // void * gc_lobjdequeue2()
 276
 277 inline int gc_lobjmoreItems2_I() {
 278   if ((gclobjhead==gclobjtail2)&&(gclobjtailindex2==gclobjheadindex))
 279     return 0;
 280   return 1;
 281 } // int gc_lobjmoreItems2()
 282
 283 // 'reversly' dequeue and don't destroy the queue
 284 inline void gc_lobjdequeue3_I() {
 285   if (gclobjtailindex2==0) {
 286     gclobjtail2=gclobjtail2->prev;
 287     gclobjtailindex2=NUMLOBJPTRS-1;
 288   } else {
 289     gclobjtailindex2--;
 290   }  // if (gclobjtailindex2==NUMLOBJPTRS)
 291 } // void * gc_lobjdequeue3()
 292
 293 inline int gc_lobjmoreItems3_I() {
 294   if ((gclobjtail==gclobjtail2)&&(gclobjtailindex2==gclobjtailindex))
 295     return 0;
 296   return 1;
 297 } // int gc_lobjmoreItems3()
 298
 299 inline void gc_lobjqueueinit4_I() {
 300   gclobjtail2 = gclobjtail;
 301   gclobjtailindex2 = gclobjtailindex;
 302 } // void gc_lobjqueueinit2()
 303
 304 inline void * gc_lobjdequeue4_I(int * length,
 305                                 int * host) {
 306   if (gclobjtailindex2==NUMLOBJPTRS) {
 307     gclobjtail2=gclobjtail2->next;
 308     gclobjtailindex2=0;
 309   } // if (gclobjtailindex==NUMLOBJPTRS)
 310   if(length != NULL) {
 311     *length = gclobjtail2->lengths[gclobjtailindex2];
 312   }
 313   if(host != NULL) {
 314     *host = (int)(gclobjtail2->hosts[gclobjtailindex2]);
 315   }
 316   return gclobjtail2->lobjs[gclobjtailindex2++];
 317 } // void * gc_lobjdequeue()
 318
 319 inline int gc_lobjmoreItems4_I() {
 320   if ((gclobjhead==gclobjtail2)&&(gclobjtailindex2==gclobjheadindex))
 321     return 0;
 322   return 1;
 323 } // int gc_lobjmoreItems(
 324
 325 INTPTR gccurr_heapbound = 0;
 326
 327 inline void gettype_size(void * ptr,
 328                          int * ttype,
 329                          int * tsize) {
 330   int type = ((int *)ptr)[0];
 331   int size = 0;
 332   if(type < NUMCLASSES) {
 333     // a normal object
 334     size = classsize[type];
 335   } else {
 336     // an array
 337     struct ArrayObject *ao=(struct ArrayObject *)ptr;
 338     int elementsize=classsize[type];
 339     int length=ao->___length___;
 340     size=sizeof(struct ArrayObject)+length*elementsize;
 341   }  // if(type < NUMCLASSES)
 342   *ttype = type;
 343   *tsize = size;
 344 }
 345
 346 inline bool isLarge(void * ptr,
 347                     int * ttype,
 348                     int * tsize) {
 349 #ifdef DEBUG
 350   BAMBOO_DEBUGPRINT(0xe701);
 351   BAMBOO_DEBUGPRINT_REG(ptr);
 352 #endif
 353   // check if a pointer is referring to a large object
 354   gettype_size(ptr, ttype, tsize);
 355 #ifdef DEBUG
 356   BAMBOO_DEBUGPRINT(*tsize);
 357 #endif
 358   int bound = (BAMBOO_SMEM_SIZE);
 359   if(((int)ptr-gcbaseva) < (BAMBOO_LARGE_SMEM_BOUND)) {
 360     bound = (BAMBOO_SMEM_SIZE_L);
 361   }
 362   if((((int)ptr-gcbaseva)%(bound))==0) {
 363     // ptr is a start of a block
 364 #ifdef DEBUG
 365     BAMBOO_DEBUGPRINT(0xe702);
 366     BAMBOO_DEBUGPRINT(1);
 367 #endif
 368     return true;
 369   }
 370   if((bound-(((int)ptr-gcbaseva)%bound)) < (*tsize)) {
 371     // it acrosses the boundary of current block
 372 #ifdef DEBUG
 373     BAMBOO_DEBUGPRINT(0xe703);
 374     BAMBOO_DEBUGPRINT(1);
 375 #endif
 376     return true;
 377   }
 378 #ifdef DEBUG
 379   BAMBOO_DEBUGPRINT(0);
 380 #endif
 381   return false;
 382 } // bool isLarge(void * ptr, int * ttype, int * tsize)
 383
 384 inline int hostcore(void * ptr) {
 385   // check the host core of ptr
 386   int host = 0;
 387   RESIDECORE(ptr, &host);
 388 #ifdef DEBUG
 389   BAMBOO_DEBUGPRINT(0xedd0);
 390   BAMBOO_DEBUGPRINT_REG(ptr);
 391   BAMBOO_DEBUGPRINT_REG(host);
 392 #endif
 393   return host;
 394 } // int hostcore(void * ptr)
 395
 396 inline void cpu2coords(int coren,
 397                            int * x,
 398                                            int * y) {
 399   *x = bamboo_cpu2coords[2*coren];
 400   *y = bamboo_cpu2coords[2*coren+1];
 401 } // void cpu2coords(...)
 402
 403 inline bool isLocal(void * ptr) {
 404   // check if a pointer is in shared heap on this core
 405   return hostcore(ptr) == BAMBOO_NUM_OF_CORE;
 406 } // bool isLocal(void * ptr)
 407
 408 inline bool gc_checkCoreStatus_I() {
 409   bool allStall = true;
 410   for(int i = 0; i < NUMCORES4GC; ++i) {
 411     if(gccorestatus[i] != 0) {
 412       allStall = false;
 413       break;
 414     }  // if(gccorestatus[i] != 0)
 415   }  // for(i = 0; i < NUMCORES4GC; ++i)
 416   return allStall;
 417 }
 418
 419 inline bool gc_checkAllCoreStatus_I() {
 420   bool allStall = true;
 421   for(int i = 0; i < NUMCORESACTIVE; ++i) {
 422     if(gccorestatus[i] != 0) {
 423       allStall = false;
 424       break;
 425     }  // if(gccorestatus[i] != 0)
 426   }  // for(i = 0; i < NUMCORESACTIVE; ++i)
 427   return allStall;
 428 }
 429
 430 inline void checkMarkStatue() {
 431 #ifdef DEBUG
 432   BAMBOO_DEBUGPRINT(0xee01);
 433 #endif
 434   int i;
 435   if((!waitconfirm) ||
 436      (waitconfirm && (numconfirm == 0))) {
 437 #ifdef DEBUG
 438     BAMBOO_DEBUGPRINT(0xee02);
 439 #endif
 440         int entry_index = 0;
 441         if(waitconfirm) {
 442           // phase 2
 443           entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 444         } else {
 445           // phase 1
 446           entry_index = gcnumsrobjs_index;
 447         }
 448     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 449     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
 450     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
 451     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
 452     // check the status of all cores
 453     bool allStall = gc_checkAllCoreStatus_I();
 454 #ifdef DEBUG
 455     BAMBOO_DEBUGPRINT(0xee03);
 456 #endif
 457     if(allStall) {
 458 #ifdef DEBUG
 459       BAMBOO_DEBUGPRINT(0xee04);
 460 #endif
 461       // ask for confirm
 462       if(!waitconfirm) {
 463 #ifdef DEBUG
 464                 BAMBOO_DEBUGPRINT(0xee05);
 465 #endif
 466                 // the first time found all cores stall
 467                 // send out status confirm msg to all other cores
 468                 // reset the corestatus array too
 469                 gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
 470                 waitconfirm = true;
 471                 numconfirm = NUMCORESACTIVE - 1;
 472                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 473                 for(i = 1; i < NUMCORESACTIVE; ++i) {
 474                   gccorestatus[i] = 1;
 475                   // send mark phase finish confirm request msg to core i
 476                   send_msg_1(i, GCMARKCONFIRM, false);
 477                 }  // for(i = 1; i < NUMCORESACTIVE; ++i)
 478       } else {
 479                 // Phase 2
 480                 // check if the sum of send objs and receive obj are the same
 481                 // yes->check if the info is the latest; no->go on executing
 482                 int sumsendobj = 0;
 483                 for(i = 0; i < NUMCORESACTIVE; ++i) {
 484                   sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
 485                 }  // for(i = 0; i < NUMCORESACTIVE; ++i)
 486 #ifdef DEBUG
 487                 BAMBOO_DEBUGPRINT(0xee06);
 488                 BAMBOO_DEBUGPRINT_REG(sumsendobj);
 489 #endif
 490                 for(i = 0; i < NUMCORESACTIVE; ++i) {
 491                   sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
 492                 }  // for(i = 0; i < NUMCORESACTIVE; ++i)
 493 #ifdef DEBUG
 494                 BAMBOO_DEBUGPRINT(0xee07);
 495                 BAMBOO_DEBUGPRINT_REG(sumsendobj);
 496 #endif
 497                 if(0 == sumsendobj) {
 498                   // Check if there are changes of the numsendobjs or numreceiveobjs on
 499                   // each core
 500                   bool ischanged = false;
 501                   for(i = 0; i < NUMCORESACTIVE; ++i) {
 502                         if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) ||
 503                                 (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
 504                           ischanged = true;
 505                           break;
 506                         }
 507                   }  // for(i = 0; i < NUMCORESACTIVE; ++i)
 508 #ifdef DEBUG
 509                   BAMBOO_DEBUGPRINT(0xee08);
 510                   BAMBOO_DEBUGPRINT_REG(ischanged);
 511 #endif
 512                   if(!ischanged) {
 513 #ifdef DEBUG
 514                         BAMBOO_DEBUGPRINT(0xee09);
 515 #endif
 516                         // all the core status info are the latest
 517                         // stop mark phase
 518                         gcphase = COMPACTPHASE;
 519                         // restore the gcstatus for all cores
 520                         for(i = 0; i < NUMCORESACTIVE; ++i) {
 521                           gccorestatus[i] = 1;
 522                         }  // for(i = 0; i < NUMCORESACTIVE; ++i)
 523                   } else {
 524                         waitconfirm = false;
 525                         gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 526                   } // if(!ischanged)
 527                 } else {
 528                   // There were changes between phase 1 and phase 2, can not decide
 529                   // whether the mark phase has been finished
 530                   waitconfirm = false;
 531                   // As it fails in phase 2, flip the entries
 532                   gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
 533                 } // if(0 == sumsendobj) else ...
 534                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 535       } // if(!gcwaitconfirm) else()
 536     } else {
 537       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 538     } // if(allStall)
 539   }  // if((!waitconfirm)...
 540 #ifdef DEBUG
 541   BAMBOO_DEBUGPRINT(0xee0a);
 542 #endif
 543 } // void checkMarkStatue()
 544
 545 inline void initGC() {
 546   int i;
 547   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
 548     for(i = 0; i < NUMCORES4GC; ++i) {
 549       gccorestatus[i] = 1;
 550       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 551       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 552       gcloads[i] = 0;
 553       gcrequiredmems[i] = 0;
 554       gcfilledblocks[i] = 0;
 555       gcstopblock[i] = 0;
 556     } // for(i = 0; i < NUMCORES4GC; ++i)
 557     for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
 558       gccorestatus[i] = 1;
 559       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
 560       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
 561     }
 562     gcheaptop = 0;
 563     gctopcore = 0;
 564     gctopblock = 0;
 565   } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
 566   gcself_numsendobjs = 0;
 567   gcself_numreceiveobjs = 0;
 568   gcmarkedptrbound = 0;
 569   gcobj2map = 0;
 570   gcmappedobj = 0;
 571   gcnumlobjs = 0;
 572   gcmovestartaddr = 0;
 573   gctomove = false;
 574   gcblock2fill = 0;
 575   gcmovepending = 0;
 576   gccurr_heaptop = 0;
 577   gcdstcore = 0;
 578
 579   // initialize queue
 580   if (gchead==NULL) {
 581     gcheadindex=gctailindex=gctailindex2 = 0;
 582     gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
 583   } else {
 584     gctailindex = gctailindex2 = gcheadindex;
 585     gctail = gctail2 = gchead;
 586   }
 587
 588   // initialize the large obj queues
 589   if (gclobjhead==NULL) {
 590     gclobjheadindex=0;
 591     gclobjtailindex=0;
 592     gclobjtailindex2 = 0;
 593     gclobjhead=gclobjtail=gclobjtail2=
 594           RUNMALLOC(sizeof(struct lobjpointerblock));
 595   } else {
 596     gclobjtailindex = gclobjtailindex2 = gclobjheadindex = 0;
 597     gclobjtail = gclobjtail2 = gclobjhead;
 598   }
 599   gclobjhead->next = gclobjhead->prev = NULL;
 600
 601 #ifdef LOCALHASHTBL_TEST
 602   freeRuntimeHash(gcpointertbl);
 603   gcpointertbl = allocateRuntimeHash(20);
 604 #else
 605   mgchashreset(gcpointertbl);
 606 #endif
 607
 608   freeMGCHash(gcforwardobjtbl);
 609   gcforwardobjtbl = allocateMGCHash(20, 3);
 610
 611   // initialize the mapping info related structures
 612   if((BAMBOO_NUM_OF_CORE < NUMCORES4GC) && (gcsharedptbl != NULL)) {
 613         // Never free the shared hash table, just reset it
 614         mgcsharedhashReset(gcsharedptbl);
 615   }
 616 #ifdef GC_PROFILE
 617   gc_num_livespace = 0;
 618   gc_num_freespace = 0;
 619   gc_num_lobj = 0;
 620   gc_num_lobjspace = 0;
 621   gc_num_liveobj = 0;
 622   gc_num_forwardobj = 0;
 623   gc_num_profiles = NUMCORESACTIVE - 1;
 624 #endif
 625 } // void initGC()
 626
 627 // compute load balance for all cores
 628 inline int loadbalance(int * heaptop) {
 629   // compute load balance
 630   int i;
 631
 632   // get the total loads
 633   int tloads = gcloads[STARTUPCORE];
 634   for(i = 1; i < NUMCORES4GC; i++) {
 635     tloads += gcloads[i];
 636   }
 637   *heaptop = gcbaseva + tloads;
 638
 639 #ifdef DEBUG
 640   BAMBOO_DEBUGPRINT(0xdddd);
 641   BAMBOO_DEBUGPRINT_REG(tloads);
 642   BAMBOO_DEBUGPRINT_REG(*heaptop);
 643 #endif
 644   int b = 0;
 645   BLOCKINDEX(*heaptop, &b);
 646   int numbpc = b / NUMCORES4GC;       // num of blocks per core
 647 #ifdef DEBUG
 648   BAMBOO_DEBUGPRINT_REG(b);
 649   BAMBOO_DEBUGPRINT_REG(numbpc);
 650 #endif
 651   gctopblock = b;
 652   RESIDECORE(heaptop, &gctopcore);
 653 #ifdef DEBUG
 654   BAMBOO_DEBUGPRINT_REG(gctopcore);
 655 #endif
 656   return numbpc;
 657 } // void loadbalance(int * heaptop)
 658
 659 inline bool cacheLObjs() {
 660   // check the total mem size need for large objs
 661   unsigned long long sumsize = 0;
 662   int size = 0;
 663 #ifdef DEBUG
 664   BAMBOO_DEBUGPRINT(0xe801);
 665 #endif
 666   gclobjtail2 = gclobjtail;
 667   gclobjtailindex2 = gclobjtailindex;
 668   int tmp_lobj = 0;
 669   int tmp_len = 0;
 670   int tmp_host = 0;
 671   // compute total mem size required and sort the lobjs in ascending order
 672   while(gc_lobjmoreItems2_I()) {
 673     gc_lobjdequeue2_I();
 674     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
 675     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
 676     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
 677     sumsize += tmp_len;
 678 #ifdef GC_PROFILE
 679         gc_num_lobj++;
 680 #endif
 681 #ifdef DEBUG
 682     BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2-1]);
 683     BAMBOO_DEBUGPRINT_REG(tmp_len);
 684     BAMBOO_DEBUGPRINT_REG(sumsize);
 685 #endif
 686     int i = gclobjtailindex2-1;
 687     struct lobjpointerblock * tmp_block = gclobjtail2;
 688     // find the place to insert
 689     while(true) {
 690       if(i == 0) {
 691                 if(tmp_block->prev == NULL) {
 692                   break;
 693                 }
 694                 if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
 695                   tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
 696                   tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
 697                   tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
 698                   tmp_block = tmp_block->prev;
 699                   i = NUMLOBJPTRS-1;
 700                 } else {
 701                   break;
 702                 }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
 703           } else {
 704                 if(tmp_block->lobjs[i-1] > tmp_lobj) {
 705                   tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
 706                   tmp_block->lengths[i] = tmp_block->lengths[i-1];
 707                   tmp_block->hosts[i] = tmp_block->hosts[i-1];
 708                   i--;
 709                 } else {
 710                   break;
 711                 }  // if(tmp_block->lobjs[i-1] < tmp_lobj)
 712       }  // if(i ==0 ) else {}
 713     }   // while(true)
 714     // insert it
 715     if(i != gclobjtailindex2 - 1) {
 716       tmp_block->lobjs[i] = tmp_lobj;
 717       tmp_block->lengths[i] = tmp_len;
 718       tmp_block->hosts[i] = tmp_host;
 719     }
 720   }  // while(gc_lobjmoreItems2())
 721
 722 #ifdef GC_PROFILE
 723   gc_num_lobjspace = sumsize;
 724 #endif
 725   // check if there are enough space to cache these large objs
 726   INTPTR dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
 727   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
 728     // do not have enough room to cache large objs
 729 #ifdef DEBUG
 730     BAMBOO_DEBUGPRINT(0xe802);
 731     BAMBOO_DEBUGPRINT_REG(dst);
 732     BAMBOO_DEBUGPRINT_REG(gcheaptop);
 733         BAMBOO_DEBUGPRINT_REG(sumsize);
 734 #endif
 735     return false;
 736   }
 737 #ifdef DEBUG
 738   BAMBOO_DEBUGPRINT(0xe803);
 739   BAMBOO_DEBUGPRINT_REG(dst);
 740   BAMBOO_DEBUGPRINT_REG(gcheaptop);
 741 #endif
 742
 743   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
 744   // cache the largeObjs to the top of the shared heap
 745   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
 746   while(gc_lobjmoreItems3_I()) {
 747     gc_lobjdequeue3_I();
 748     size = gclobjtail2->lengths[gclobjtailindex2];
 749     // set the mark field to , indicating that this obj has been moved
 750     // and need to be flushed
 751     ((int *)(gclobjtail2->lobjs[gclobjtailindex2]))[6] = COMPACTED;
 752     dst -= size;
 753     if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) {
 754       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 755     } else {
 756       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
 757     }
 758 #ifdef DEBUG
 759     BAMBOO_DEBUGPRINT(0x804);
 760     BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2]);
 761     BAMBOO_DEBUGPRINT(dst);
 762     BAMBOO_DEBUGPRINT_REG(size);
 763     BAMBOO_DEBUGPRINT_REG(*((int*)gclobjtail2->lobjs[gclobjtailindex2]));
 764     BAMBOO_DEBUGPRINT_REG(*((int*)(dst)));
 765 #endif
 766   }
 767   return true;
 768 } // void cacheLObjs()
 769
 770 // update the bmmboo_smemtbl to record current shared mem usage
 771 void updateSmemTbl(int coren,
 772                    int localtop) {
 773   int ltopcore = 0;
 774   int bound = BAMBOO_SMEM_SIZE_L;
 775   BLOCKINDEX(localtop, &ltopcore);
 776   if(localtop >= (gcbaseva+(BAMBOO_LARGE_SMEM_BOUND))) {
 777     bound = BAMBOO_SMEM_SIZE;
 778   }
 779   int load = (localtop-gcbaseva)%bound;
 780   int i = 0;
 781   int j = 0;
 782   int toset = 0;
 783   do {
 784     toset = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
 785     if(toset < ltopcore) {
 786       bamboo_smemtbl[toset]=
 787         (toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 788 #ifdef SMEMM
 789           gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 790 #endif
 791     } else if(toset == ltopcore) {
 792       bamboo_smemtbl[toset] = load;
 793 #ifdef SMEMM
 794           gcmem_mixed_usedmem += bamboo_smemtbl[toset];
 795 #endif
 796       break;
 797     } else {
 798       break;
 799     }
 800     i++;
 801     if(i == 2) {
 802       i = 0;
 803       j++;
 804     }
 805   } while(true);
 806 } // void updateSmemTbl(int, int)
 807
 808 inline void moveLObjs() {
 809 #ifdef DEBUG
 810   BAMBOO_DEBUGPRINT(0xea01);
 811 #endif
 812 #ifdef SMEMM
 813   // update the gcmem_mixed_usedmem
 814   gcmem_mixed_usedmem = 0;
 815 #endif
 816   // zero out the smemtbl
 817   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
 818   // find current heap top
 819   // flush all gcloads to indicate the real heap top on one core
 820   // previous it represents the next available ptr on a core
 821   if((gcloads[0] > (gcbaseva+(BAMBOO_SMEM_SIZE_L)))
 822      && ((gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
 823     // edge of a block, check if this is exactly the heaptop
 824     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
 825     gcloads[0]+=(gcfilledblocks[0]>1 ?
 826                  (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
 827   }
 828   updateSmemTbl(0, gcloads[0]);
 829 #ifdef DEBUG
 830   BAMBOO_DEBUGPRINT(0xea02);
 831   BAMBOO_DEBUGPRINT_REG(gcloads[0]);
 832   BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]);
 833 #endif
 834   for(int i = 1; i < NUMCORES4GC; i++) {
 835     int tmptop = 0;
 836 #ifdef DEBUG
 837     BAMBOO_DEBUGPRINT(0xf000+i);
 838     BAMBOO_DEBUGPRINT_REG(gcloads[i]);
 839     BAMBOO_DEBUGPRINT_REG(gcfilledblocks[i]);
 840 #endif
 841     if((gcfilledblocks[i] > 0)
 842        && ((gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
 843       // edge of a block, check if this is exactly the heaptop
 844       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
 845       gcloads[i] +=
 846                 (gcfilledblocks[i]>1 ? (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
 847       tmptop = gcloads[i];
 848     }
 849     updateSmemTbl(i, gcloads[i]);
 850 #ifdef DEBUG
 851     BAMBOO_DEBUGPRINT_REG(gcloads[i]);
 852 #endif
 853   } // for(int i = 1; i < NUMCORES4GC; i++) {
 854
 855   // find current heap top
 856   // TODO
 857   // a bug here: when using local allocation, directly move large objects
 858   // to the highest free chunk might not be memory efficient
 859   int tmpheaptop = 0;
 860   int size = 0;
 861   int bound = 0;
 862   int i = 0;
 863   for(i = gcnumblock-1; i >= 0; i--) {
 864     if(bamboo_smemtbl[i] > 0) {
 865       break;
 866     }
 867   }
 868   if(i == -1) {
 869     tmpheaptop = gcbaseva;
 870   } else {
 871     tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
 872                 (BAMBOO_SMEM_SIZE_L*i) :
 873         (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
 874   }
 875
 876   // move large objs from gcheaptop to tmpheaptop
 877   // write the header first
 878   unsigned int tomove = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -gcheaptop;
 879 #ifdef SMEMM
 880   gcmem_mixed_usedmem += tomove;
 881 #endif
 882 #ifdef DEBUG
 883   BAMBOO_DEBUGPRINT(0xea03);
 884   BAMBOO_DEBUGPRINT_REG(tomove);
 885   BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 886   BAMBOO_DEBUGPRINT_REG(gcheaptop);
 887 #endif
 888   // flush the sbstartbl
 889   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
 890           (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-gcreservedsb)*sizeof(INTPTR));
 891   if(tomove == 0) {
 892     gcheaptop = tmpheaptop;
 893   } else {
 894     // check how many blocks it acrosses
 895     int remain = tmpheaptop-gcbaseva;
 896     int sb = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb;//number of the sblock
 897     int b = 0;  // number of the block
 898     BLOCKINDEX(tmpheaptop, &b);
 899     // check the remaining space in this block
 900     bound = (BAMBOO_SMEM_SIZE);
 901     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
 902       bound = (BAMBOO_SMEM_SIZE_L);
 903     }
 904     remain = bound - remain%bound;
 905
 906 #ifdef DEBUG
 907     BAMBOO_DEBUGPRINT(0xea04);
 908 #endif
 909     size = 0;
 910     int isize = 0;
 911     int host = 0;
 912     int ptr = 0;
 913     int base = tmpheaptop;
 914     int cpysize = 0;
 915     remain -= BAMBOO_CACHE_LINE_SIZE;
 916     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 917     gc_lobjqueueinit4_I();
 918     while(gc_lobjmoreItems4_I()) {
 919       ptr = (int)(gc_lobjdequeue4_I(&size, &host));
 920       ALIGNSIZE(size, &isize);
 921       if(remain < isize) {
 922                 // this object acrosses blocks
 923                 if(cpysize > 0) {
 924                   // close current block, fill its header
 925                   BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
 926                   *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
 927                   bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE;//add the size of header
 928                   cpysize = 0;
 929                   base = tmpheaptop;
 930                   if(remain == 0) {
 931                         remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 932                                          BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 933                   }
 934                   remain -= BAMBOO_CACHE_LINE_SIZE;
 935                   tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
 936                   BLOCKINDEX(tmpheaptop, &b);
 937                   sb = (tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE) + gcreservedsb;
 938                 }  // if(cpysize > 0)
 939
 940                 // move the large obj
 941                 if((int)gcheaptop < (int)(tmpheaptop)+size) {
 942                   memmove(tmpheaptop, gcheaptop, size);
 943                 } else {
 944                   //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
 945                   memcpy(tmpheaptop, gcheaptop, size);
 946                 }
 947                 // fill the remaining space with -2 padding
 948                 BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
 949 #ifdef DEBUG
 950                 BAMBOO_DEBUGPRINT(0xea05);
 951                 BAMBOO_DEBUGPRINT_REG(gcheaptop);
 952                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 953                 BAMBOO_DEBUGPRINT_REG(size);
 954                 BAMBOO_DEBUGPRINT_REG(isize);
 955                 BAMBOO_DEBUGPRINT_REG(base);
 956 #endif
 957                 gcheaptop += size;
 958                 // cache the mapping info anyway
 959                 BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
 960 #ifdef LOCALHASHTBL_TEST
 961                 RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
 962 #else
 963                 mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
 964 #endif
 965                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
 966 #ifdef DEBUG
 967                 BAMBOO_DEBUGPRINT(0xcdca);
 968                 BAMBOO_DEBUGPRINT_REG(ptr);
 969                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 970 #endif
 971                 if(host != BAMBOO_NUM_OF_CORE) {
 972                   // send the original host core with the mapping info
 973                   send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
 974 #ifdef DEBUG
 975                   BAMBOO_DEBUGPRINT(0xcdcb);
 976                   BAMBOO_DEBUGPRINT_REG(ptr);
 977                   BAMBOO_DEBUGPRINT_REG(tmpheaptop);
 978 #endif
 979                 } // if(host != BAMBOO_NUM_OF_CORE)
 980                 tmpheaptop += isize;
 981
 982                 // set the gcsbstarttbl and bamboo_smemtbl
 983                 int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
 984                 for(int k = 1; k < tmpsbs; k++) {
 985                   gcsbstarttbl[sb+k] = (INTPTR)(-1);
 986                 }
 987                 sb += tmpsbs;
 988                 bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
 989                 BLOCKINDEX(tmpheaptop-1, &tmpsbs);
 990                 for(; b < tmpsbs; b++) {
 991                   bamboo_smemtbl[b] = bound;
 992                   if(b==NUMCORES4GC-1) {
 993                         bound = BAMBOO_SMEM_SIZE;
 994                   }
 995                 }
 996                 if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
 997                   gcsbstarttbl[sb] = (INTPTR)(-1);
 998                   remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
 999                                    BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
1000                   bamboo_smemtbl[b] = bound;
1001                 } else {
1002                   gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
1003                   remain = tmpheaptop-gcbaseva;
1004                   bamboo_smemtbl[b] = remain%bound;
1005                   remain = bound - bamboo_smemtbl[b];
1006                 } // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
1007
1008                 // close current block and fill the header
1009                 BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
1010                 *((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
1011                 cpysize = 0;
1012                 base = tmpheaptop;
1013                 if(remain == BAMBOO_CACHE_LINE_SIZE) {
1014                   // fill with 0 in case
1015                   BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
1016                 }
1017                 remain -= BAMBOO_CACHE_LINE_SIZE;
1018                 tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
1019       } else {
1020                 remain -= isize;
1021                 // move the large obj
1022                 if((int)gcheaptop < (int)(tmpheaptop)+size) {
1023                   memmove(tmpheaptop, gcheaptop, size);
1024                 } else {
1025                   memcpy(tmpheaptop, gcheaptop, size);
1026                 }
1027                 // fill the remaining space with -2 padding
1028                 BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
1029 #ifdef DEBUG
1030                 BAMBOO_DEBUGPRINT(0xea06);
1031                 BAMBOO_DEBUGPRINT_REG(gcheaptop);
1032                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1033                 BAMBOO_DEBUGPRINT_REG(size);
1034                 BAMBOO_DEBUGPRINT_REG(isize);
1035 #endif
1036
1037                 gcheaptop += size;
1038                 cpysize += isize;
1039                 // cache the mapping info anyway
1040                 BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1041 #ifdef LOCALHASHTBL_TEST
1042                 RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
1043 #else
1044                 mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
1045 #endif
1046                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1047 #ifdef DEBUG
1048                 BAMBOO_DEBUGPRINT(0xcdcc);
1049                 BAMBOO_DEBUGPRINT_REG(ptr);
1050                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1051                 BAMBOO_DEBUGPRINT_REG(*((int*)tmpheaptop));
1052 #endif
1053                 if(host != BAMBOO_NUM_OF_CORE) {
1054                   // send the original host core with the mapping info
1055                   send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
1056 #ifdef DEBUG
1057                   BAMBOO_DEBUGPRINT(0xcdcd);
1058                   BAMBOO_DEBUGPRINT_REG(ptr);
1059                   BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1060 #endif
1061                 }  // if(host != BAMBOO_NUM_OF_CORE)
1062                 tmpheaptop += isize;
1063
1064                 // update bamboo_smemtbl
1065                 bamboo_smemtbl[b] += isize;
1066           }  // if(remain < isize) else ...
1067     }  // while(gc_lobjmoreItems())
1068     if(cpysize > 0) {
1069       // close current block, fill the header
1070       BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
1071       *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
1072       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;// add the size of the header
1073     } else {
1074       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
1075     }
1076     gcheaptop = tmpheaptop;
1077
1078   } // if(tomove == 0)
1079
1080 #ifdef DEBUG
1081   BAMBOO_DEBUGPRINT(0xea07);
1082   BAMBOO_DEBUGPRINT_REG(gcheaptop);
1083 #endif
1084
1085   bamboo_free_block = 0;
1086   int tbound = 0;
1087   do {
1088     tbound = (bamboo_free_block<NUMCORES4GC) ?
1089              BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
1090     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
1091       bamboo_free_block++;
1092     } else {
1093       // the first non-full partition
1094       break;
1095     }
1096   } while(true);
1097
1098 #ifdef GC_PROFILE
1099   // check how many live space there are
1100   gc_num_livespace = 0;
1101   for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
1102         gc_num_livespace += bamboo_smemtbl[tmpi];
1103   }
1104   gc_num_freespace = (BAMBOO_SHARED_MEM_SIZE) - gc_num_livespace;
1105 #endif
1106 #ifdef DEBUG
1107   BAMBOO_DEBUGPRINT(0xea08);
1108   BAMBOO_DEBUGPRINT_REG(gcheaptop);
1109 #endif
1110 } // void moveLObjs()
1111
1112 inline void markObj(void * objptr) {
1113   if(objptr == NULL) {
1114     return;
1115   }
1116   if(ISSHAREDOBJ(objptr)) {
1117     int host = hostcore(objptr);
1118     if(BAMBOO_NUM_OF_CORE == host) {
1119       // on this core
1120       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1121       if(((int *)objptr)[6] == INIT) {
1122                 // this is the first time that this object is discovered,
1123                 // set the flag as DISCOVERED
1124                 ((int *)objptr)[6] |= DISCOVERED;
1125                 BAMBOO_CACHE_FLUSH_LINE(objptr);
1126                 gc_enqueue_I(objptr);
1127           }
1128       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1129     } else {
1130 #ifdef DEBUG
1131       BAMBOO_DEBUGPRINT(0xbbbb);
1132       BAMBOO_DEBUGPRINT_REG(host);
1133       BAMBOO_DEBUGPRINT_REG(objptr);
1134 #endif
1135       // check if this obj has been forwarded
1136       if(!MGCHashcontains(gcforwardobjtbl, (int)objptr)) {
1137                 // send a msg to host informing that objptr is active
1138                 send_msg_2(host, GCMARKEDOBJ, objptr, false);
1139 #ifdef GC_PROFILE
1140                 gc_num_forwardobj++;
1141 #endif // GC_PROFILE
1142                 gcself_numsendobjs++;
1143                 MGCHashadd(gcforwardobjtbl, (int)objptr);
1144       }
1145     }
1146   } else {
1147     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1148     gc_enqueue_I(objptr);
1149     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1150   }  // if(ISSHAREDOBJ(objptr))
1151 } // void markObj(void * objptr)
1152
1153 // enqueue root objs
1154 inline void tomark(struct garbagelist * stackptr) {
1155   if(MARKPHASE != gcphase) {
1156 #ifdef DEBUG
1157     BAMBOO_DEBUGPRINT_REG(gcphase);
1158 #endif
1159     BAMBOO_EXIT(0xb101);
1160   }
1161   gcbusystatus = true;
1162   gcnumlobjs = 0;
1163
1164   int i,j;
1165   // enqueue current stack
1166   while(stackptr!=NULL) {
1167 #ifdef DEBUG
1168     BAMBOO_DEBUGPRINT(0xe501);
1169     BAMBOO_DEBUGPRINT_REG(stackptr->size);
1170     BAMBOO_DEBUGPRINT_REG(stackptr->next);
1171     BAMBOO_DEBUGPRINT_REG(stackptr->array[0]);
1172 #endif
1173     for(i=0; i<stackptr->size; i++) {
1174       if(stackptr->array[i] != NULL) {
1175                 markObj(stackptr->array[i]);
1176       }
1177     }
1178     stackptr=stackptr->next;
1179   }
1180
1181 #ifdef DEBUG
1182   BAMBOO_DEBUGPRINT(0xe503);
1183 #endif
1184   // enqueue objectsets
1185   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
1186     for(i=0; i<NUMCLASSES; i++) {
1187       struct parameterwrapper ** queues =
1188         objectqueues[BAMBOO_NUM_OF_CORE][i];
1189       int length = numqueues[BAMBOO_NUM_OF_CORE][i];
1190       for(j = 0; j < length; ++j) {
1191                 struct parameterwrapper * parameter = queues[j];
1192                 struct ObjectHash * set=parameter->objectset;
1193                 struct ObjectNode * ptr=set->listhead;
1194                 while(ptr!=NULL) {
1195                   markObj((void *)ptr->key);
1196                   ptr=ptr->lnext;
1197                 }
1198       }
1199     }
1200   }
1201
1202   // euqueue current task descriptor
1203   if(currtpd != NULL) {
1204 #ifdef DEBUG
1205     BAMBOO_DEBUGPRINT(0xe504);
1206 #endif
1207     for(i=0; i<currtpd->numParameters; i++) {
1208       markObj(currtpd->parameterArray[i]);
1209     }
1210   }
1211
1212 #ifdef DEBUG
1213   BAMBOO_DEBUGPRINT(0xe505);
1214 #endif
1215   // euqueue active tasks
1216   if(activetasks != NULL) {
1217     struct genpointerlist * ptr=activetasks->list;
1218     while(ptr!=NULL) {
1219       struct taskparamdescriptor *tpd=ptr->src;
1220       int i;
1221       for(i=0; i<tpd->numParameters; i++) {
1222                 markObj(tpd->parameterArray[i]);
1223       }
1224       ptr=ptr->inext;
1225     }
1226   }
1227
1228 #ifdef DEBUG
1229   BAMBOO_DEBUGPRINT(0xe506);
1230 #endif
1231   // enqueue cached transferred obj
1232   struct QueueItem * tmpobjptr =  getHead(&objqueue);
1233   while(tmpobjptr != NULL) {
1234     struct transObjInfo * objInfo =
1235       (struct transObjInfo *)(tmpobjptr->objectptr);
1236     markObj(objInfo->objptr);
1237     tmpobjptr = getNextQueueItem(tmpobjptr);
1238   }
1239
1240 #ifdef DEBUG
1241   BAMBOO_DEBUGPRINT(0xe507);
1242 #endif
1243   // enqueue cached objs to be transferred
1244   struct QueueItem * item = getHead(totransobjqueue);
1245   while(item != NULL) {
1246     struct transObjInfo * totransobj =
1247       (struct transObjInfo *)(item->objectptr);
1248     markObj(totransobj->objptr);
1249     item = getNextQueueItem(item);
1250   } // while(item != NULL)
1251
1252 #ifdef DEBUG
1253   BAMBOO_DEBUGPRINT(0xe508);
1254 #endif
1255   // enqueue lock related info
1256   for(i = 0; i < runtime_locklen; ++i) {
1257     markObj((void *)(runtime_locks[i].redirectlock));
1258     if(runtime_locks[i].value != NULL) {
1259       markObj((void *)(runtime_locks[i].value));
1260     }
1261   }
1262
1263 } // void tomark(struct garbagelist * stackptr)
1264
1265 inline void mark(bool isfirst,
1266                  struct garbagelist * stackptr) {
1267 #ifdef DEBUG
1268   if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed01);
1269 #endif
1270   if(isfirst) {
1271 #ifdef DEBUG
1272     if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed02);
1273 #endif
1274     // enqueue root objs
1275     tomark(stackptr);
1276     gccurr_heaptop = 0; // record the size of all active objs in this core
1277                         // aligned but does not consider block boundaries
1278     gcmarkedptrbound = 0;
1279   }
1280 #ifdef DEBUG
1281   if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed03);
1282 #endif
1283   int isize = 0;
1284   bool checkfield = true;
1285   bool sendStall = false;
1286   // mark phase
1287   while(MARKPHASE == gcphase) {
1288 #ifdef DEBUG
1289     if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed04);
1290 #endif
1291     while(true) {
1292       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1293       bool hasItems = gc_moreItems2_I();
1294       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1295 #ifdef DEBUG
1296       BAMBOO_DEBUGPRINT(0xed05);
1297 #endif
1298       if(!hasItems) {
1299                 break;
1300       }
1301       sendStall = false;
1302       gcbusystatus = true;
1303       checkfield = true;
1304       void * ptr = gc_dequeue2_I();
1305
1306 #ifdef DEBUG
1307       BAMBOO_DEBUGPRINT_REG(ptr);
1308 #endif
1309       int size = 0;
1310       int isize = 0;
1311       int type = 0;
1312       // check if it is a shared obj
1313       if(ISSHAREDOBJ(ptr)) {
1314                 // a shared obj, check if it is a local obj on this core
1315                 int host = hostcore(ptr);
1316                 bool islocal = (host == BAMBOO_NUM_OF_CORE);
1317                 if(islocal) {
1318                   bool isnotmarked = ((((int *)ptr)[6] & DISCOVERED) != 0);
1319                   if(isLarge(ptr, &type, &size) && isnotmarked) {
1320                         // ptr is a large object and not marked or enqueued
1321 #ifdef DEBUG
1322                         BAMBOO_DEBUGPRINT(0xecec);
1323                         BAMBOO_DEBUGPRINT_REG(ptr);
1324                         BAMBOO_DEBUGPRINT_REG(*((int*)ptr));
1325 #endif
1326                         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1327                         gc_lobjenqueue_I(ptr, size, BAMBOO_NUM_OF_CORE);
1328                         gcnumlobjs++;
1329                         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1330                         // mark this obj
1331                         ((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
1332                         BAMBOO_CACHE_FLUSH_LINE(ptr);
1333                   } else if(isnotmarked) {
1334                         // ptr is an unmarked active object on this core
1335                         ALIGNSIZE(size, &isize);
1336                         gccurr_heaptop += isize;
1337 #ifdef DEBUG
1338                         BAMBOO_DEBUGPRINT(0xaaaa);
1339                         BAMBOO_DEBUGPRINT_REG(ptr);
1340                         BAMBOO_DEBUGPRINT_REG(isize);
1341                         BAMBOO_DEBUGPRINT(((int *)(ptr))[0]);
1342 #endif
1343                         // mark this obj
1344                         ((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
1345                         BAMBOO_CACHE_FLUSH_LINE(ptr);
1346
1347                         if(ptr + size > gcmarkedptrbound) {
1348                           gcmarkedptrbound = ptr + size;
1349                         } // if(ptr + size > gcmarkedptrbound)
1350                   } else {
1351                         // ptr is not an active obj or has been marked
1352                         checkfield = false;
1353                   } // if(isLarge(ptr, &type, &size)) else ...
1354                 }  /* can never reach here
1355                 else {
1356 #ifdef DEBUG
1357                   if(BAMBOO_NUM_OF_CORE == 0) {
1358                         BAMBOO_DEBUGPRINT(0xbbbb);
1359                         BAMBOO_DEBUGPRINT_REG(host);
1360                         BAMBOO_DEBUGPRINT_REG(ptr);
1361                   }
1362 #endif
1363                   // check if this obj has been forwarded
1364                   if(!MGCHashcontains(gcforwardobjtbl, (int)ptr)) {
1365                         // send a msg to host informing that ptr is active
1366                         send_msg_2(host, GCMARKEDOBJ, ptr, false);
1367                         gcself_numsendobjs++;
1368                         MGCHashadd(gcforwardobjtbl, (int)ptr);
1369                   }
1370                         checkfield = false;
1371                 }// if(isLocal(ptr)) else ...*/
1372           }   // if(ISSHAREDOBJ(ptr))
1373 #ifdef DEBUG
1374       BAMBOO_DEBUGPRINT(0xed06);
1375 #endif
1376
1377       if(checkfield) {
1378                 // scan all pointers in ptr
1379                 unsigned INTPTR * pointer;
1380                 pointer=pointerarray[type];
1381                 if (pointer==0) {
1382                   /* Array of primitives */
1383                   /* Do nothing */
1384                 } else if (((INTPTR)pointer)==1) {
1385                   /* Array of pointers */
1386                   struct ArrayObject *ao=(struct ArrayObject *) ptr;
1387                   int length=ao->___length___;
1388                   int j;
1389                   for(j=0; j<length; j++) {
1390                         void *objptr =
1391                           ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
1392                         markObj(objptr);
1393                   }
1394                 } else {
1395                   INTPTR size=pointer[0];
1396                   int i;
1397                   for(i=1; i<=size; i++) {
1398                         unsigned int offset=pointer[i];
1399                         void * objptr=*((void **)(((char *)ptr)+offset));
1400                         markObj(objptr);
1401                   }
1402                 }     // if (pointer==0) else if ... else ...
1403       }   // if(checkfield)
1404     }     // while(gc_moreItems2())
1405 #ifdef DEBUG
1406     BAMBOO_DEBUGPRINT(0xed07);
1407 #endif
1408     gcbusystatus = false;
1409     // send mark finish msg to core coordinator
1410     if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
1411 #ifdef DEBUG
1412       BAMBOO_DEBUGPRINT(0xed08);
1413 #endif
1414       gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
1415       gcnumsendobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=gcself_numsendobjs;
1416       gcnumreceiveobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=
1417                 gcself_numreceiveobjs;
1418       gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
1419     } else {
1420       if(!sendStall) {
1421 #ifdef DEBUG
1422                 BAMBOO_DEBUGPRINT(0xed09);
1423 #endif
1424                 send_msg_4(STARTUPCORE, GCFINISHMARK, BAMBOO_NUM_OF_CORE,
1425                                    gcself_numsendobjs, gcself_numreceiveobjs, false);
1426                 sendStall = true;
1427       }
1428     }  // if(STARTUPCORE == BAMBOO_NUM_OF_CORE) ...
1429 #ifdef DEBUG
1430     BAMBOO_DEBUGPRINT(0xed0a);
1431 #endif
1432
1433     if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {
1434 #ifdef DEBUG
1435       BAMBOO_DEBUGPRINT(0xed0b);
1436 #endif
1437       return;
1438     }
1439   } // while(MARKPHASE == gcphase)
1440
1441   BAMBOO_CACHE_MF();
1442 } // mark()
1443
1444 inline void compact2Heaptophelper_I(int coren,
1445                                     int* p,
1446                                     int* numblocks,
1447                                     int* remain) {
1448   int b;
1449   int memneed = gcrequiredmems[coren] + BAMBOO_CACHE_LINE_SIZE;
1450   if(STARTUPCORE == coren) {
1451     gctomove = true;
1452     gcmovestartaddr = *p;
1453     gcdstcore = gctopcore;
1454     gcblock2fill = *numblocks + 1;
1455   } else {
1456     send_msg_4(coren, GCMOVESTART, gctopcore, *p, (*numblocks) + 1, false);
1457   }
1458 #ifdef DEBUG
1459   BAMBOO_DEBUGPRINT_REG(coren);
1460   BAMBOO_DEBUGPRINT_REG(gctopcore);
1461   BAMBOO_DEBUGPRINT_REG(*p);
1462   BAMBOO_DEBUGPRINT_REG(*numblocks+1);
1463 #endif
1464   if(memneed < *remain) {
1465 #ifdef DEBUG
1466     BAMBOO_DEBUGPRINT(0xd104);
1467 #endif
1468     *p = *p + memneed;
1469     gcrequiredmems[coren] = 0;
1470     gcloads[gctopcore] += memneed;
1471     *remain = *remain - memneed;
1472   } else {
1473 #ifdef DEBUG
1474     BAMBOO_DEBUGPRINT(0xd105);
1475 #endif
1476     // next available block
1477     *p = *p + *remain;
1478     gcfilledblocks[gctopcore] += 1;
1479     int newbase = 0;
1480     BASEPTR(gctopcore, gcfilledblocks[gctopcore], &newbase);
1481     gcloads[gctopcore] = newbase;
1482     gcrequiredmems[coren] -= *remain - BAMBOO_CACHE_LINE_SIZE;
1483     gcstopblock[gctopcore]++;
1484     gctopcore = NEXTTOPCORE(gctopblock);
1485     gctopblock++;
1486     *numblocks = gcstopblock[gctopcore];
1487     *p = gcloads[gctopcore];
1488     BLOCKINDEX(*p, &b);
1489     *remain=(b<NUMCORES4GC) ?
1490              ((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
1491              : ((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
1492 #ifdef DEBUG
1493     BAMBOO_DEBUGPRINT(0xd106);
1494     BAMBOO_DEBUGPRINT_REG(gctopcore);
1495     BAMBOO_DEBUGPRINT_REG(*p);
1496     BAMBOO_DEBUGPRINT_REG(b);
1497     BAMBOO_DEBUGPRINT_REG(*remain);
1498 #endif
1499   }  // if(memneed < remain)
1500   gcmovepending--;
1501 } // void compact2Heaptophelper_I(int, int*, int*, int*)
1502
1503 inline void compact2Heaptop() {
1504   // no cores with spare mem and some cores are blocked with pending move
1505   // find the current heap top and make them move to the heap top
1506   int p;
1507   int numblocks = gcfilledblocks[gctopcore];
1508   p = gcloads[gctopcore];
1509   int b;
1510   BLOCKINDEX(p, &b);
1511   int remain = (b<NUMCORES4GC) ?
1512                ((BAMBOO_SMEM_SIZE_L)-(p%(BAMBOO_SMEM_SIZE_L)))
1513                : ((BAMBOO_SMEM_SIZE)-(p%(BAMBOO_SMEM_SIZE)));
1514   // check if the top core finishes
1515   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1516   if(gccorestatus[gctopcore] != 0) {
1517 #ifdef DEBUG
1518     BAMBOO_DEBUGPRINT(0xd101);
1519     BAMBOO_DEBUGPRINT_REG(gctopcore);
1520 #endif
1521     // let the top core finishes its own work first
1522     compact2Heaptophelper_I(gctopcore, &p, &numblocks, &remain);
1523     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1524     return;
1525   }
1526   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1527
1528 #ifdef DEBUG
1529   BAMBOO_DEBUGPRINT(0xd102);
1530   BAMBOO_DEBUGPRINT_REG(gctopcore);
1531   BAMBOO_DEBUGPRINT_REG(p);
1532   BAMBOO_DEBUGPRINT_REG(b);
1533   BAMBOO_DEBUGPRINT_REG(remain);
1534 #endif
1535   for(int i = 0; i < NUMCORES4GC; i++) {
1536     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1537     if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {
1538 #ifdef DEBUG
1539       BAMBOO_DEBUGPRINT(0xd103);
1540 #endif
1541       compact2Heaptophelper_I(i, &p, &numblocks, &remain);
1542       if(gccorestatus[gctopcore] != 0) {
1543 #ifdef DEBUG
1544                 BAMBOO_DEBUGPRINT(0xd101);
1545                 BAMBOO_DEBUGPRINT_REG(gctopcore);
1546 #endif
1547                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1548                 // the top core is not free now
1549                 return;
1550       }
1551     }  // if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0))
1552     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1553   }   // for(i = 0; i < NUMCORES4GC; i++)
1554 #ifdef DEBUG
1555   BAMBOO_DEBUGPRINT(0xd106);
1556 #endif
1557 } // void compact2Heaptop()
1558
1559 inline void resolvePendingMoveRequest() {
1560 #ifdef DEBUG
1561   BAMBOO_DEBUGPRINT(0xeb01);
1562 #endif
1563 #ifdef DEBUG
1564   BAMBOO_DEBUGPRINT(0xeeee);
1565   for(int k = 0; k < NUMCORES4GC; k++) {
1566     BAMBOO_DEBUGPRINT(0xf000+k);
1567     BAMBOO_DEBUGPRINT_REG(gccorestatus[k]);
1568     BAMBOO_DEBUGPRINT_REG(gcloads[k]);
1569     BAMBOO_DEBUGPRINT_REG(gcfilledblocks[k]);
1570     BAMBOO_DEBUGPRINT_REG(gcstopblock[k]);
1571   }
1572   BAMBOO_DEBUGPRINT(0xffff);
1573 #endif
1574   int i;
1575   int j;
1576   bool nosparemem = true;
1577   bool haspending = false;
1578   bool hasrunning = false;
1579   bool noblock = false;
1580   int dstcore = 0;       // the core who need spare mem
1581   int sourcecore = 0;       // the core who has spare mem
1582   for(i = j = 0; (i < NUMCORES4GC) && (j < NUMCORES4GC); ) {
1583     if(nosparemem) {
1584       // check if there are cores with spare mem
1585       if(gccorestatus[i] == 0) {
1586                 // finished working, check if it still have spare mem
1587                 if(gcfilledblocks[i] < gcstopblock[i]) {
1588                   // still have spare mem
1589                   nosparemem = false;
1590                   sourcecore = i;
1591                 }  // if(gcfilledblocks[i] < gcstopblock[i]) else ...
1592       }
1593       i++;
1594     }  // if(nosparemem)
1595     if(!haspending) {
1596       if(gccorestatus[j] != 0) {
1597                 // not finished, check if it has pending move requests
1598                 if((gcfilledblocks[j]==gcstopblock[j])&&(gcrequiredmems[j]>0)) {
1599                   dstcore = j;
1600                   haspending = true;
1601                 } else {
1602                   hasrunning = true;
1603                 }  // if((gcfilledblocks[i] == gcstopblock[i])...) else ...
1604       }  // if(gccorestatus[i] == 0) else ...
1605       j++;
1606     }  // if(!haspending)
1607     if(!nosparemem && haspending) {
1608       // find match
1609       int tomove = 0;
1610       int startaddr = 0;
1611       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1612       gcrequiredmems[dstcore] = assignSpareMem_I(sourcecore,
1613                                                  gcrequiredmems[dstcore],
1614                                                  &tomove,
1615                                                  &startaddr);
1616       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1617 #ifdef DEBUG
1618       BAMBOO_DEBUGPRINT(0xeb02);
1619       BAMBOO_DEBUGPRINT_REG(sourcecore);
1620       BAMBOO_DEBUGPRINT_REG(dstcore);
1621       BAMBOO_DEBUGPRINT_REG(startaddr);
1622       BAMBOO_DEBUGPRINT_REG(tomove);
1623 #endif
1624       if(STARTUPCORE == dstcore) {
1625 #ifdef DEBUG
1626                 BAMBOO_DEBUGPRINT(0xeb03);
1627 #endif
1628                 gcdstcore = sourcecore;
1629                 gctomove = true;
1630                 gcmovestartaddr = startaddr;
1631                 gcblock2fill = tomove;
1632       } else {
1633 #ifdef DEBUG
1634                 BAMBOO_DEBUGPRINT(0xeb04);
1635 #endif
1636                 send_msg_4(dstcore, GCMOVESTART, sourcecore,
1637                                    startaddr, tomove, false);
1638       }
1639       gcmovepending--;
1640       nosparemem = true;
1641       haspending = false;
1642       noblock = true;
1643     }
1644   }   // for(i = 0; i < NUMCORES4GC; i++)
1645 #ifdef DEBUG
1646   BAMBOO_DEBUGPRINT(0xcccc);
1647   BAMBOO_DEBUGPRINT_REG(hasrunning);
1648   BAMBOO_DEBUGPRINT_REG(haspending);
1649   BAMBOO_DEBUGPRINT_REG(noblock);
1650 #endif
1651
1652   if(!hasrunning && !noblock) {
1653     gcphase = SUBTLECOMPACTPHASE;
1654     compact2Heaptop();
1655   }
1656
1657 } // void resovePendingMoveRequest()
1658
1659 struct moveHelper {
1660   int numblocks;       // block num for heap
1661   INTPTR base;       // base virtual address of current heap block
1662   INTPTR ptr;       // virtual address of current heap top
1663   int offset;       // offset in current heap block
1664   int blockbase;       // virtual address of current small block to check
1665   int blockbound;       // bound virtual address of current small blcok
1666   int sblockindex;       // index of the small blocks
1667   int top;       // real size of current heap block to check
1668   int bound;       // bound size of current heap block to check
1669 }; // struct moveHelper
1670
1671 // If out of boundary of valid shared memory, return false, else return true
1672 inline bool nextSBlock(struct moveHelper * orig) {
1673   orig->blockbase = orig->blockbound;
1674   bool sbchanged = false;
1675 #ifdef DEBUG
1676   BAMBOO_DEBUGPRINT(0xecc0);
1677   BAMBOO_DEBUGPRINT_REG(orig->blockbase);
1678   BAMBOO_DEBUGPRINT_REG(orig->blockbound);
1679   BAMBOO_DEBUGPRINT_REG(orig->bound);
1680   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1681 #endif
1682 outernextSBlock:
1683   // check if across a big block
1684   // TODO now do not zero out the whole memory, maybe the last two conditions
1685   // are useless now
1686   if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)
1687      || ((orig->ptr != NULL) && (*((int*)orig->ptr))==0)
1688      || ((*((int*)orig->blockbase))==0)) {
1689 innernextSBlock:
1690     // end of current heap block, jump to next one
1691     orig->numblocks++;
1692 #ifdef DEBUG
1693     BAMBOO_DEBUGPRINT(0xecc1);
1694     BAMBOO_DEBUGPRINT_REG(orig->numblocks);
1695 #endif
1696     BASEPTR(BAMBOO_NUM_OF_CORE, orig->numblocks, &(orig->base));
1697 #ifdef DEBUG
1698     BAMBOO_DEBUGPRINT(orig->base);
1699 #endif
1700     if(orig->base >= gcbaseva + BAMBOO_SHARED_MEM_SIZE) {
1701       // out of boundary
1702       orig->ptr = orig->base; // set current ptr to out of boundary too
1703       return false;
1704     }
1705     orig->blockbase = orig->base;
1706     orig->sblockindex = (orig->blockbase-gcbaseva)/BAMBOO_SMEM_SIZE;
1707     sbchanged = true;
1708     int blocknum = 0;
1709     BLOCKINDEX(orig->base, &blocknum);
1710     if(bamboo_smemtbl[blocknum] == 0) {
1711       // goto next block
1712       goto innernextSBlock;
1713     }
1714         // check the bamboo_smemtbl to decide the real bound
1715         orig->bound = orig->base + bamboo_smemtbl[blocknum];
1716   } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) {
1717     orig->sblockindex += 1;
1718     sbchanged = true;
1719   }  // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
1720
1721   // check if this sblock should be skipped or have special start point
1722   if(gcsbstarttbl[orig->sblockindex] == -1) {
1723     // goto next sblock
1724 #ifdef DEBUG
1725     BAMBOO_DEBUGPRINT(0xecc2);
1726 #endif
1727     orig->sblockindex += 1;
1728     orig->blockbase += BAMBOO_SMEM_SIZE;
1729     goto outernextSBlock;
1730   } else if((gcsbstarttbl[orig->sblockindex] != 0)
1731             && (sbchanged)) {
1732     // the first time to access this SBlock
1733 #ifdef DEBUG
1734     BAMBOO_DEBUGPRINT(0xecc3);
1735 #endif
1736     // not start from the very beginning
1737     orig->blockbase = gcsbstarttbl[orig->sblockindex];
1738   }  // if(gcsbstarttbl[orig->sblockindex] == -1) else ...
1739
1740   // setup information for this sblock
1741   orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
1742   orig->offset = BAMBOO_CACHE_LINE_SIZE;
1743   orig->ptr = orig->blockbase + orig->offset;
1744 #ifdef DEBUG
1745   BAMBOO_DEBUGPRINT(0xecc4);
1746   BAMBOO_DEBUGPRINT_REG(orig->base);
1747   BAMBOO_DEBUGPRINT_REG(orig->bound);
1748   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1749   BAMBOO_DEBUGPRINT_REG(orig->blockbound);
1750   BAMBOO_DEBUGPRINT_REG(orig->blockbase);
1751   BAMBOO_DEBUGPRINT_REG(orig->offset);
1752 #endif
1753   if(orig->ptr >= orig->bound) {
1754     // met a lobj, move to next block
1755     goto innernextSBlock;
1756   }
1757
1758   return true;
1759 } // bool nextSBlock(struct moveHelper * orig)
1760
1761 // return false if there are no available data to compact
1762 inline bool initOrig_Dst(struct moveHelper * orig,
1763                          struct moveHelper * to) {
1764   // init the dst ptr
1765   to->numblocks = 0;
1766   to->top = to->offset = BAMBOO_CACHE_LINE_SIZE;
1767   to->bound = BAMBOO_SMEM_SIZE_L;
1768   BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
1769
1770 #ifdef DEBUG
1771   BAMBOO_DEBUGPRINT(0xef01);
1772   BAMBOO_DEBUGPRINT_REG(to->base);
1773 #endif
1774   to->ptr = to->base + to->offset;
1775 #ifdef GC_CACHE_ADAPT
1776   // initialize the gc_cache_revise_information
1777   gc_cache_revise_infomation.to_page_start_va = to->ptr;
1778   gc_cache_revise_infomation.to_page_end_va = (BAMBOO_PAGE_SIZE)*
1779         ((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
1780   gc_cache_revise_infomation.to_page_index =
1781         (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
1782   gc_cache_revise_infomation.orig_page_start_va = -1;
1783 #endif // GC_CACHE_ADAPT
1784
1785   // init the orig ptr
1786   orig->numblocks = 0;
1787   orig->base = to->base;
1788   int blocknum = 0;
1789   BLOCKINDEX(orig->base, &blocknum);
1790   // check the bamboo_smemtbl to decide the real bound
1791   orig->bound = orig->base + bamboo_smemtbl[blocknum];
1792   orig->blockbase = orig->base;
1793   orig->sblockindex = (orig->base - gcbaseva) / BAMBOO_SMEM_SIZE;
1794 #ifdef DEBUG
1795   BAMBOO_DEBUGPRINT(0xef02);
1796   BAMBOO_DEBUGPRINT_REG(orig->base);
1797   BAMBOO_DEBUGPRINT_REG(orig->sblockindex);
1798   BAMBOO_DEBUGPRINT_REG(gcsbstarttbl);
1799   BAMBOO_DEBUGPRINT_REG(gcsbstarttbl[orig->sblockindex]);
1800 #endif
1801
1802   if(gcsbstarttbl[orig->sblockindex] == -1) {
1803 #ifdef DEBUG
1804     BAMBOO_DEBUGPRINT(0xef03);
1805 #endif
1806     // goto next sblock
1807     orig->blockbound =
1808       gcbaseva+BAMBOO_SMEM_SIZE*(orig->sblockindex+1);
1809     return nextSBlock(orig);
1810   } else if(gcsbstarttbl[orig->sblockindex] != 0) {
1811 #ifdef DEBUG
1812     BAMBOO_DEBUGPRINT(0xef04);
1813 #endif
1814     orig->blockbase = gcsbstarttbl[orig->sblockindex];
1815   }
1816 #ifdef DEBUG
1817   BAMBOO_DEBUGPRINT(0xef05);
1818 #endif
1819   orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
1820   orig->offset = BAMBOO_CACHE_LINE_SIZE;
1821   orig->ptr = orig->blockbase + orig->offset;
1822 #ifdef DEBUG
1823   BAMBOO_DEBUGPRINT(0xef06);
1824   BAMBOO_DEBUGPRINT_REG(orig->base);
1825 #endif
1826
1827   return true;
1828 } // bool initOrig_Dst(struct moveHelper * orig, struct moveHelper * to)
1829
1830 inline void nextBlock(struct moveHelper * to) {
1831   to->top = to->bound + BAMBOO_CACHE_LINE_SIZE; // header!
1832   to->bound += BAMBOO_SMEM_SIZE;
1833   to->numblocks++;
1834   BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
1835   to->offset = BAMBOO_CACHE_LINE_SIZE;
1836   to->ptr = to->base + to->offset;
1837 } // void nextBlock(struct moveHelper * to)
1838
1839 // endaddr does not contain spaces for headers
1840 inline bool moveobj(struct moveHelper * orig,
1841                     struct moveHelper * to,
1842                     int stopblock) {
1843   if(stopblock == 0) {
1844     return true;
1845   }
1846
1847 #ifdef DEBUG
1848   BAMBOO_DEBUGPRINT(0xe201);
1849   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1850   BAMBOO_DEBUGPRINT_REG(to->ptr);
1851 #endif
1852
1853   int type = 0;
1854   int size = 0;
1855   int mark = 0;
1856   int isize = 0;
1857 innermoveobj:
1858   while((char)(*((int*)(orig->ptr))) == (char)(-2)) {
1859     orig->ptr = (int*)(orig->ptr) + 1;
1860   }
1861 #ifdef GC_CACHE_ADAPT
1862   if(orig->ptr >= gc_cache_revise_infomation.orig_page_end_va) {
1863         // end of an orig page
1864         // compute the impact of this page for the new page
1865         unsigned int tmp_factor=
1866           to->ptr-gc_cache_revise_infomation.to_page_start_va;
1867         int topage=gc_cache_revise_infomation.to_page_index;
1868         int oldpage = gc_cache_revise_infomation.orig_page_index;
1869         unsigned int * newtable=&gccachesamplingtbl_r[topage];
1870         unsigned int * oldtable=&gccachesamplingtbl[oldpage];
1871
1872         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
1873           (*newtable) += (*oldtable)*tmp_factor;
1874           newtable=(unsigned int*)(
1875                   ((char *)newtable)+size_cachesamplingtbl_local_r);
1876           oldtable=(unsigned int*)(((char *)oldtable)+size_cachesamplingtbl_local);
1877         }
1878         // prepare for an new orig page
1879         int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
1880         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
1881         gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
1882           (BAMBOO_PAGE_SIZE)*(tmp_index+1);
1883         gc_cache_revise_infomation.orig_page_index = tmp_index;
1884         gc_cache_revise_infomation.to_page_start_va = to->ptr;
1885   }
1886 #endif
1887   if((orig->ptr >= orig->bound) || (orig->ptr == orig->blockbound)) {
1888     if(!nextSBlock(orig)) {
1889       // finished, no more data
1890       return true;
1891     }
1892     goto innermoveobj;
1893   }
1894 #ifdef DEBUG
1895   BAMBOO_DEBUGPRINT(0xe202);
1896   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1897   BAMBOO_DEBUGPRINT(((int *)(orig->ptr))[0]);
1898 #endif
1899   // check the obj's type, size and mark flag
1900   type = ((int *)(orig->ptr))[0];
1901   size = 0;
1902   if(type == 0) {
1903     // end of this block, go to next one
1904     if(!nextSBlock(orig)) {
1905       // finished, no more data
1906       return true;
1907     }
1908     goto innermoveobj;
1909   } else if(type < NUMCLASSES) {
1910     // a normal object
1911     size = classsize[type];
1912   } else {
1913     // an array
1914     struct ArrayObject *ao=(struct ArrayObject *)(orig->ptr);
1915     int elementsize=classsize[type];
1916     int length=ao->___length___;
1917     size=sizeof(struct ArrayObject)+length*elementsize;
1918   }
1919   mark = ((int *)(orig->ptr))[6];
1920   bool isremote = ((((int *)(orig->ptr))[6] & REMOTEM) != 0);
1921 #ifdef DEBUG
1922   BAMBOO_DEBUGPRINT(0xe203);
1923   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1924   BAMBOO_DEBUGPRINT_REG(size);
1925 #endif
1926   ALIGNSIZE(size, &isize);       // no matter is the obj marked or not
1927                                  // should be able to across it
1928   if((mark & MARKED) != 0) {
1929 #ifdef DEBUG
1930     BAMBOO_DEBUGPRINT(0xe204);
1931 #endif
1932 #ifdef GC_PROFILE
1933         gc_num_liveobj++;
1934 #endif
1935     // marked obj, copy it to current heap top
1936     // check to see if remaining space is enough
1937     if(to->top + isize > to->bound) {
1938       // fill 0 indicating the end of this block
1939       BAMBOO_MEMSET_WH(to->ptr,  '\0', to->bound - to->top);
1940       // fill the header of this block and then go to next block
1941       to->offset += to->bound - to->top;
1942       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
1943       (*((int*)(to->base))) = to->offset;
1944 #ifdef GC_CACHE_ADAPT
1945           int tmp_ptr = to->ptr;
1946 #endif // GC_CACHE_ADAPT
1947       nextBlock(to);
1948 #ifdef GC_CACHE_ADAPT
1949           if((to->ptr) >= gc_cache_revise_infomation.to_page_end_va) {
1950                 // end of an to page, wrap up its information
1951                 unsigned int tmp_factor =
1952                   tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
1953                 int topage=gc_cache_revise_infomation.to_page_index;
1954                 int oldpage = gc_cache_revise_infomation.orig_page_index;
1955                 unsigned int * newtable=&gccachesamplingtbl_r[topage];
1956                 unsigned int * oldtable=&gccachesamplingtbl[oldpage];
1957
1958                 for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
1959                   (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
1960                   newtable=(unsigned int*) (
1961                           ((char *)newtable)+size_cachesamplingtbl_local_r);
1962                   oldtable=(unsigned int*) (
1963                           ((char *)oldtable)+size_cachesamplingtbl_local);
1964                 }
1965                 // prepare for an new to page
1966                 int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
1967                 gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
1968                 gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
1969                   (BAMBOO_PAGE_SIZE)*(tmp_index+1);
1970                 gc_cache_revise_infomation.orig_page_index = tmp_index;
1971                 gc_cache_revise_infomation.to_page_start_va = to->ptr;
1972                 gc_cache_revise_infomation.to_page_end_va = gcbaseva +
1973                   (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
1974                 gc_cache_revise_infomation.to_page_index =
1975                   (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
1976       }
1977 #endif // GC_CACHE_ADAPT
1978       if(stopblock == to->numblocks) {
1979                 // already fulfilled the block
1980                 return true;
1981       }   // if(stopblock == to->numblocks)
1982     }   // if(to->top + isize > to->bound)
1983     // set the mark field to 2, indicating that this obj has been moved
1984     // and need to be flushed
1985     ((int *)(orig->ptr))[6] = COMPACTED;
1986     if(to->ptr != orig->ptr) {
1987       if((int)(orig->ptr) < (int)(to->ptr)+size) {
1988                 memmove(to->ptr, orig->ptr, size);
1989       } else {
1990                 memcpy(to->ptr, orig->ptr, size);
1991       }
1992       // fill the remaining space with -2
1993       BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size);
1994     }
1995     // store mapping info
1996     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1997 #ifdef LOCALHASHTBL_TEST
1998     RuntimeHashadd_I(gcpointertbl, orig->ptr, to->ptr);
1999 #else
2000         mgchashInsert_I(gcpointertbl, orig->ptr, to->ptr);
2001 #endif
2002         if(isremote) {
2003           // add to the sharedptbl
2004           if(gcsharedptbl != NULL) {
2005                 mgcsharedhashInsert_I(gcsharedptbl, orig->ptr, to->ptr);
2006           }
2007         }
2008     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2009 #ifdef DEBUG
2010     BAMBOO_DEBUGPRINT(0xcdce);
2011     BAMBOO_DEBUGPRINT_REG(orig->ptr);
2012     BAMBOO_DEBUGPRINT_REG(to->ptr);
2013     BAMBOO_DEBUGPRINT_REG(isize);
2014 #endif
2015     gccurr_heaptop -= isize;
2016     to->ptr += isize;
2017     to->offset += isize;
2018     to->top += isize;
2019 #ifdef GC_CACHE_ADAPT
2020         int tmp_ptr = to->ptr;
2021 #endif // GC_CACHE_ADAPT
2022     if(to->top == to->bound) {
2023       // fill the header of this block and then go to next block
2024       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
2025       (*((int*)(to->base))) = to->offset;
2026       nextBlock(to);
2027     }
2028 #ifdef GC_CACHE_ADAPT
2029           if((to->ptr) >= gc_cache_revise_infomation.to_page_end_va) {
2030                 // end of an to page, wrap up its information
2031                 unsigned int tmp_factor =
2032                   tmp_ptr-gc_cache_revise_infomation.to_page_start_va;
2033                 int topage=gc_cache_revise_infomation.to_page_index;
2034                 int oldpage = gc_cache_revise_infomation.orig_page_index;
2035                 unsigned int * newtable=&gccachesamplingtbl_r[topage];
2036                 unsigned int * oldtable=&gccachesamplingtbl[oldpage];
2037
2038                 for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
2039                   (*newtable)=((*newtable)+(*oldtable)*tmp_factor);
2040                   newtable=(unsigned int*) (
2041                           ((char *)newtable)+size_cachesamplingtbl_local_r);
2042                   oldtable=(unsigned int*) (
2043                           ((char *)oldtable)+size_cachesamplingtbl_local);
2044                 }
2045                 // prepare for an new to page
2046                 int tmp_index = (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2047                 gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2048                 gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
2049                   (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2050                 gc_cache_revise_infomation.orig_page_index =
2051                   (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2052                 gc_cache_revise_infomation.to_page_start_va = to->ptr;
2053                 gc_cache_revise_infomation.to_page_end_va = gcbaseva +
2054                   (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2055                 gc_cache_revise_infomation.to_page_index =
2056                   (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2057           }
2058 #endif // GC_CACHE_ADAPT
2059   } // if(mark == 1)
2060 #ifdef DEBUG
2061   BAMBOO_DEBUGPRINT(0xe205);
2062 #endif
2063   // move to next obj
2064   orig->ptr += size;
2065
2066 #ifdef DEBUG
2067   BAMBOO_DEBUGPRINT_REG(isize);
2068   BAMBOO_DEBUGPRINT_REG(size);
2069   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2070   BAMBOO_DEBUGPRINT_REG(orig->bound);
2071 #endif
2072   if((orig->ptr > orig->bound) || (orig->ptr == orig->blockbound)) {
2073 #ifdef DEBUG
2074     BAMBOO_DEBUGPRINT(0xe206);
2075 #endif
2076     if(!nextSBlock(orig)) {
2077       // finished, no more data
2078       return true;
2079     }
2080   }
2081 #ifdef DEBUG
2082   BAMBOO_DEBUGPRINT(0xe207);
2083   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2084 #endif
2085   return false;
2086 } //bool moveobj(struct moveHelper* orig,struct moveHelper* to,int* endaddr)
2087
2088 // should be invoked with interrupt closed
2089 inline int assignSpareMem_I(int sourcecore,
2090                             int * requiredmem,
2091                             int * tomove,
2092                             int * startaddr) {
2093   int b = 0;
2094   BLOCKINDEX(gcloads[sourcecore], &b);
2095   int boundptr = (b<NUMCORES4GC) ? ((b+1)*BAMBOO_SMEM_SIZE_L)
2096                  : (BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES4GC+1)*BAMBOO_SMEM_SIZE);
2097   int remain = boundptr - gcloads[sourcecore];
2098   int memneed = requiredmem + BAMBOO_CACHE_LINE_SIZE;
2099   *startaddr = gcloads[sourcecore];
2100   *tomove = gcfilledblocks[sourcecore] + 1;
2101   if(memneed < remain) {
2102     gcloads[sourcecore] += memneed;
2103     return 0;
2104   } else {
2105     // next available block
2106     gcfilledblocks[sourcecore] += 1;
2107     int newbase = 0;
2108     BASEPTR(sourcecore, gcfilledblocks[sourcecore], &newbase);
2109     gcloads[sourcecore] = newbase;
2110     return requiredmem-remain;
2111   }
2112 } // int assignSpareMem_I(int ,int * , int * , int * )
2113
2114 // should be invoked with interrupt closed
2115 inline bool gcfindSpareMem_I(int * startaddr,
2116                              int * tomove,
2117                              int * dstcore,
2118                              int requiredmem,
2119                              int requiredcore) {
2120   for(int k = 0; k < NUMCORES4GC; k++) {
2121     if((gccorestatus[k] == 0) && (gcfilledblocks[k] < gcstopblock[k])) {
2122       // check if this stopped core has enough mem
2123       assignSpareMem_I(k, requiredmem, tomove, startaddr);
2124       *dstcore = k;
2125       return true;
2126     }
2127   }
2128   // if can not find spare mem right now, hold the request
2129   gcrequiredmems[requiredcore] = requiredmem;
2130   gcmovepending++;
2131   return false;
2132 } //bool gcfindSpareMem_I(int* startaddr,int* tomove,int mem,int core)
2133
2134 inline bool compacthelper(struct moveHelper * orig,
2135                           struct moveHelper * to,
2136                           int * filledblocks,
2137                           int * heaptopptr,
2138                           bool * localcompact) {
2139   // scan over all objs in this block, compact the marked objs
2140   // loop stop when finishing either scanning all active objs or
2141   // fulfilled the gcstopblock
2142 #ifdef DEBUG
2143   BAMBOO_DEBUGPRINT(0xe101);
2144   BAMBOO_DEBUGPRINT_REG(gcblock2fill);
2145   BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
2146 #endif
2147 innercompact:
2148   while(orig->ptr < gcmarkedptrbound) {
2149     bool stop = moveobj(orig, to, gcblock2fill);
2150     if(stop) {
2151       break;
2152     }
2153   }
2154 #ifdef GC_CACHE_ADAPT
2155   // end of an to page, wrap up its information
2156   unsigned int tmp_factor =
2157         to->ptr-gc_cache_revise_infomation.to_page_start_va;
2158   int topage=gc_cache_revise_infomation.to_page_index;
2159   int oldpage = gc_cache_revise_infomation.orig_page_index;
2160   unsigned int * newtable=&gccachesamplingtbl_r[topage];
2161   unsigned int * oldtable=&gccachesamplingtbl[oldpage];
2162
2163   for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
2164     (*newtable) = ((*newtable)+(*oldtable)*tmp_factor);
2165     newtable=(unsigned int*)(((char *)newtable)+size_cachesamplingtbl_local_r);
2166     oldtable=(unsigned int*) (((char *)oldtable)+size_cachesamplingtbl_local);
2167   }
2168 #endif // GC_CACHE_ADAPT
2169   // if no objs have been compact, do nothing,
2170   // otherwise, fill the header of this block
2171   if(to->offset > BAMBOO_CACHE_LINE_SIZE) {
2172     BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
2173     (*((int*)(to->base))) = to->offset;
2174   } else {
2175     to->offset = 0;
2176     to->ptr = to->base;
2177     to->top -= BAMBOO_CACHE_LINE_SIZE;
2178   }  // if(to->offset > BAMBOO_CACHE_LINE_SIZE) else ...
2179   if(*localcompact) {
2180     *heaptopptr = to->ptr;
2181     *filledblocks = to->numblocks;
2182   }
2183 #ifdef DEBUG
2184   BAMBOO_DEBUGPRINT(0xe102);
2185   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2186   BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
2187   BAMBOO_DEBUGPRINT_REG(*heaptopptr);
2188   BAMBOO_DEBUGPRINT_REG(*filledblocks);
2189   BAMBOO_DEBUGPRINT_REG(gccurr_heaptop);
2190 #endif
2191
2192   // send msgs to core coordinator indicating that the compact is finishing
2193   // send compact finish message to core coordinator
2194   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
2195     gcfilledblocks[BAMBOO_NUM_OF_CORE] = *filledblocks;
2196     gcloads[BAMBOO_NUM_OF_CORE] = *heaptopptr;
2197     if(orig->ptr < gcmarkedptrbound) {
2198 #ifdef DEBUG
2199       BAMBOO_DEBUGPRINT(0xe103);
2200 #endif
2201       // ask for more mem
2202       gctomove = false;
2203       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2204       if(gcfindSpareMem_I(&gcmovestartaddr, &gcblock2fill, &gcdstcore,
2205                           gccurr_heaptop, BAMBOO_NUM_OF_CORE)) {
2206 #ifdef DEBUG
2207                 BAMBOO_DEBUGPRINT(0xe104);
2208 #endif
2209                 gctomove = true;
2210       } else {
2211                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2212 #ifdef DEBUG
2213                 BAMBOO_DEBUGPRINT(0xe105);
2214 #endif
2215                 return false;
2216       }
2217       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2218     } else {
2219 #ifdef DEBUG
2220       BAMBOO_DEBUGPRINT(0xe106);
2221 #endif
2222       gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
2223       gctomove = false;
2224       return true;
2225     }
2226   } else {
2227     if(orig->ptr < gcmarkedptrbound) {
2228 #ifdef DEBUG
2229       BAMBOO_DEBUGPRINT(0xe107);
2230 #endif
2231       // ask for more mem
2232       gctomove = false;
2233       send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
2234                  *filledblocks, *heaptopptr, gccurr_heaptop, false);
2235     } else {
2236 #ifdef DEBUG
2237       BAMBOO_DEBUGPRINT(0xe108);
2238       BAMBOO_DEBUGPRINT_REG(*heaptopptr);
2239 #endif
2240       // finish compacting
2241       send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
2242                  *filledblocks, *heaptopptr, 0, false);
2243     }
2244   }       // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
2245
2246   if(orig->ptr < gcmarkedptrbound) {
2247 #ifdef DEBUG
2248     BAMBOO_DEBUGPRINT(0xe109);
2249 #endif
2250     // still have unpacked obj
2251     while(true) {
2252       if(gctomove) {
2253                 break;
2254       }
2255     }
2256     ;
2257         gctomove = false;
2258 #ifdef DEBUG
2259     BAMBOO_DEBUGPRINT(0xe10a);
2260 #endif
2261
2262     to->ptr = gcmovestartaddr;
2263     to->numblocks = gcblock2fill - 1;
2264     to->bound = (to->numblocks==0) ?
2265                 BAMBOO_SMEM_SIZE_L :
2266                 BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
2267     BASEPTR(gcdstcore, to->numblocks, &(to->base));
2268     to->offset = to->ptr - to->base;
2269     to->top = (to->numblocks==0) ?
2270               (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
2271     to->base = to->ptr;
2272     to->offset = BAMBOO_CACHE_LINE_SIZE;
2273     to->ptr += to->offset;   // for header
2274     to->top += to->offset;
2275     if(gcdstcore == BAMBOO_NUM_OF_CORE) {
2276       *localcompact = true;
2277     } else {
2278       *localcompact = false;
2279     }
2280 #ifdef GC_CACHE_ADAPT
2281         // initialize the gc_cache_revise_information
2282         gc_cache_revise_infomation.to_page_start_va = to->ptr;
2283         gc_cache_revise_infomation.to_page_end_va = gcbaseva +
2284           (BAMBOO_PAGE_SIZE)*((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2285         gc_cache_revise_infomation.to_page_index =
2286           (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
2287         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2288         gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
2289           (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2290         gc_cache_revise_infomation.orig_page_index =
2291           (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
2292 #endif // GC_CACHE_ADAPT
2293     goto innercompact;
2294   }
2295 #ifdef DEBUG
2296   BAMBOO_DEBUGPRINT(0xe10b);
2297 #endif
2298   return true;
2299 } // void compacthelper()
2300
2301 inline void compact() {
2302   if(COMPACTPHASE != gcphase) {
2303     BAMBOO_EXIT(0xb102);
2304   }
2305
2306   // initialize pointers for comapcting
2307   struct moveHelper * orig =
2308     (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
2309   struct moveHelper * to =
2310     (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
2311
2312   if(!initOrig_Dst(orig, to)) {
2313     // no available data to compact
2314     // send compact finish msg to STARTUP core
2315 #ifdef DEBUG
2316     BAMBOO_DEBUGPRINT(0xe001);
2317     BAMBOO_DEBUGPRINT_REG(to->base);
2318 #endif
2319     send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
2320                0, to->base, 0, false);
2321     RUNFREE(orig);
2322     RUNFREE(to);
2323     return;
2324   }
2325 #ifdef GC_CACHE_ADAPT
2326   gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2327   gc_cache_revise_infomation.orig_page_end_va = gcbaseva +
2328         (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2329   gc_cache_revise_infomation.orig_page_index =
2330         (orig->blockbase-gcbaseva)/(BAMBOO_PAGE_SIZE);
2331 #endif // GC_CACHE_ADAPT
2332
2333   int filledblocks = 0;
2334   INTPTR heaptopptr = 0;
2335   bool localcompact = true;
2336   compacthelper(orig, to, &filledblocks, &heaptopptr, &localcompact);
2337
2338   RUNFREE(orig);
2339   RUNFREE(to);
2340 } // compact()
2341
2342 // if return NULL, means
2343 //   1. objptr is NULL
2344 //   2. objptr is not a shared obj
2345 // in these cases, remain the original value is OK
2346 inline void * flushObj(void * objptr) {
2347 #ifdef DEBUG
2348   BAMBOO_DEBUGPRINT(0xe401);
2349 #endif
2350   if(objptr == NULL) {
2351     return NULL;
2352   }
2353   void * dstptr = NULL;
2354   if(ISSHAREDOBJ(objptr)) {
2355 #ifdef DEBUG
2356     BAMBOO_DEBUGPRINT(0xe402);
2357     BAMBOO_DEBUGPRINT_REG(objptr);
2358 #endif
2359     // a shared obj ptr, change to new address
2360     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2361 #ifdef LOCALHASHTBL_TEST
2362     RuntimeHashget(gcpointertbl, objptr, &dstptr);
2363 #else
2364         dstptr = mgchashSearch(gcpointertbl, objptr);
2365 #endif
2366     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2367 #ifdef DEBUG
2368     BAMBOO_DEBUGPRINT_REG(dstptr);
2369 #endif
2370
2371     if(NULL == dstptr) {
2372       // no mapping info
2373 #ifdef DEBUG
2374       BAMBOO_DEBUGPRINT(0xe403);
2375       BAMBOO_DEBUGPRINT_REG(objptr);
2376       BAMBOO_DEBUGPRINT_REG(hostcore(objptr));
2377 #endif
2378       if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) {
2379                 // error! the obj is right on this core, but cannot find it
2380                 BAMBOO_DEBUGPRINT_REG(objptr);
2381                 BAMBOO_EXIT(0xb103);
2382       } else {
2383                 int hostc = hostcore(objptr);
2384                 // check the corresponsing sharedptbl
2385                 BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2386                 mgcsharedhashtbl_t * sptbl = gcrpointertbls[hostc];
2387                 if(sptbl != NULL) {
2388                   dstptr = mgcsharedhashSearch(sptbl, (int)objptr);
2389                   if(dstptr != NULL) {
2390 #ifdef LOCALHASHTBL_TEST
2391                         RuntimeHashadd_I(gcpointertbl, (int)objptr, (int)dstptr);
2392 #else
2393                         mgchashInsert_I(gcpointertbl, (int)objptr, (int)dstptr);
2394 #endif
2395                   }
2396                 }
2397                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2398
2399                 if(dstptr == NULL) {
2400                   // still can not get the mapping info,
2401                   // send msg to host core for the mapping info
2402                   gcobj2map = (int)objptr;
2403                   gcismapped = false;
2404                   gcmappedobj = NULL;
2405                   // the first time require the mapping, send msg to the hostcore
2406                   // for the mapping info
2407                   send_msg_3(hostc, GCMAPREQUEST, (int)objptr,
2408                           BAMBOO_NUM_OF_CORE, false);
2409                   while(true) {
2410                         if(gcismapped) {
2411                           break;
2412                         }
2413                   }
2414                   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2415 #ifdef LOCALHASHTBL_TEST
2416                   RuntimeHashget(gcpointertbl, objptr, &dstptr);
2417 #else
2418                   dstptr = mgchashSearch(gcpointertbl, objptr);
2419 #endif
2420                   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2421                 } // if(dstptr == NULL)
2422           }    // if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) else ...
2423 #ifdef DEBUG
2424       BAMBOO_DEBUGPRINT_REG(dstptr);
2425 #endif
2426     }  // if(NULL == dstptr)
2427   }   // if(ISSHAREDOBJ(objptr))
2428   // if not a shared obj, return NULL to indicate no need to flush
2429 #ifdef DEBUG
2430   BAMBOO_DEBUGPRINT(0xe404);
2431 #endif
2432   return dstptr;
2433 } // void flushObj(void * objptr)
2434
2435 inline void flushRuntimeObj(struct garbagelist * stackptr) {
2436   int i,j;
2437   // flush current stack
2438   while(stackptr!=NULL) {
2439     for(i=0; i<stackptr->size; i++) {
2440       if(stackptr->array[i] != NULL) {
2441                 void * dst = flushObj(stackptr->array[i]);
2442                 if(dst != NULL) {
2443                   stackptr->array[i] = dst;
2444                 }
2445       }
2446     }
2447     stackptr=stackptr->next;
2448   }
2449
2450   // flush objectsets
2451   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
2452     for(i=0; i<NUMCLASSES; i++) {
2453       struct parameterwrapper ** queues =
2454         objectqueues[BAMBOO_NUM_OF_CORE][i];
2455       int length = numqueues[BAMBOO_NUM_OF_CORE][i];
2456       for(j = 0; j < length; ++j) {
2457                 struct parameterwrapper * parameter = queues[j];
2458                 struct ObjectHash * set=parameter->objectset;
2459                 struct ObjectNode * ptr=set->listhead;
2460                 while(ptr!=NULL) {
2461                   void * dst = flushObj((void *)ptr->key);
2462                   if(dst != NULL) {
2463                         ptr->key = dst;
2464                   }
2465                   ptr=ptr->lnext;
2466                 }
2467                 ObjectHashrehash(set);
2468       }
2469     }
2470   }
2471
2472   // flush current task descriptor
2473   if(currtpd != NULL) {
2474     for(i=0; i<currtpd->numParameters; i++) {
2475       void * dst = flushObj(currtpd->parameterArray[i]);
2476       if(dst != NULL) {
2477                 currtpd->parameterArray[i] = dst;
2478       }
2479     }
2480   }
2481
2482   // flush active tasks
2483   if(activetasks != NULL) {
2484     struct genpointerlist * ptr=activetasks->list;
2485     while(ptr!=NULL) {
2486       struct taskparamdescriptor *tpd=ptr->src;
2487       int i;
2488       for(i=0; i<tpd->numParameters; i++) {
2489                 void * dst = flushObj(tpd->parameterArray[i]);
2490                 if(dst != NULL) {
2491                   tpd->parameterArray[i] = dst;
2492                 }
2493       }
2494       ptr=ptr->inext;
2495     }
2496     genrehash(activetasks);
2497   }
2498
2499   // flush cached transferred obj
2500   struct QueueItem * tmpobjptr =  getHead(&objqueue);
2501   while(tmpobjptr != NULL) {
2502     struct transObjInfo * objInfo =
2503       (struct transObjInfo *)(tmpobjptr->objectptr);
2504     void * dst = flushObj(objInfo->objptr);
2505     if(dst != NULL) {
2506       objInfo->objptr = dst;
2507     }
2508     tmpobjptr = getNextQueueItem(tmpobjptr);
2509   }
2510
2511   // flush cached objs to be transferred
2512   struct QueueItem * item = getHead(totransobjqueue);
2513   while(item != NULL) {
2514     struct transObjInfo * totransobj =
2515       (struct transObjInfo *)(item->objectptr);
2516     void * dst = flushObj(totransobj->objptr);
2517     if(dst != NULL) {
2518       totransobj->objptr = dst;
2519     }
2520     item = getNextQueueItem(item);
2521   }  // while(item != NULL)
2522
2523   // enqueue lock related info
2524   for(i = 0; i < runtime_locklen; ++i) {
2525     void * dst = flushObj(runtime_locks[i].redirectlock);
2526     if(dst != NULL) {
2527       runtime_locks[i].redirectlock = (int)dst;
2528     }
2529     if(runtime_locks[i].value != NULL) {
2530       void * dst=flushObj(runtime_locks[i].value);
2531       if(dst != NULL) {
2532                 runtime_locks[i].value = (int)dst;
2533       }
2534     }
2535   }
2536
2537 } // void flushRuntimeObj(struct garbagelist * stackptr)
2538
2539 inline void transmappinginfo() {
2540   // broadcast the sharedptbl pointer
2541   for(int i = 0; i < NUMCORESACTIVE; i++) {
2542         if(i != BAMBOO_NUM_OF_CORE) {
2543           send_msg_3(i, GCMAPTBL, gcsharedptbl, BAMBOO_NUM_OF_CORE, false);
2544         }
2545   }
2546
2547   if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
2548         send_msg_2(STARTUPCORE, GCFINISHMAPINFO, BAMBOO_NUM_OF_CORE, false);
2549   }
2550 }
2551
2552 inline void flush(struct garbagelist * stackptr) {
2553
2554   flushRuntimeObj(stackptr);
2555
2556   while(true) {
2557     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2558     bool hasItems = gc_moreItems_I();
2559     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2560     if(!hasItems) {
2561       break;
2562     }
2563
2564 #ifdef DEBUG
2565     BAMBOO_DEBUGPRINT(0xe301);
2566 #endif
2567     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2568     void * ptr = gc_dequeue_I();
2569     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2570     if(ISSHAREDOBJ(ptr)) {
2571       // should be a local shared obj and should have mapping info
2572       ptr = flushObj(ptr);
2573 #ifdef DEBUG
2574       BAMBOO_DEBUGPRINT(0xe302);
2575       BAMBOO_DEBUGPRINT_REG(ptr);
2576       BAMBOO_DEBUGPRINT_REG(tptr);
2577       BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
2578 #endif
2579       if(ptr == NULL) {
2580                 BAMBOO_EXIT(0xb105);
2581       }
2582     } // if(ISSHAREDOBJ(ptr))
2583     if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED)) {
2584       int type = ((int *)(ptr))[0];
2585       // scan all pointers in ptr
2586       unsigned INTPTR * pointer;
2587       pointer=pointerarray[type];
2588 #ifdef DEBUG
2589       BAMBOO_DEBUGPRINT(0xe303);
2590       BAMBOO_DEBUGPRINT_REG(pointer);
2591 #endif
2592       if (pointer==0) {
2593                 /* Array of primitives */
2594                 /* Do nothing */
2595       } else if (((INTPTR)pointer)==1) {
2596 #ifdef DEBUG
2597                 BAMBOO_DEBUGPRINT(0xe304);
2598 #endif
2599                 /* Array of pointers */
2600                 struct ArrayObject *ao=(struct ArrayObject *) ptr;
2601                 int length=ao->___length___;
2602                 int j;
2603                 for(j=0; j<length; j++) {
2604 #ifdef DEBUG
2605                   BAMBOO_DEBUGPRINT(0xe305);
2606 #endif
2607                   void *objptr=
2608                         ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
2609 #ifdef DEBUG
2610                   BAMBOO_DEBUGPRINT_REG(objptr);
2611 #endif
2612                   if(objptr != NULL) {
2613                         void * dst = flushObj(objptr);
2614                         if(dst != NULL) {
2615                           ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
2616                         }
2617                   }
2618                 }
2619       } else {
2620 #ifdef DEBUG
2621                 BAMBOO_DEBUGPRINT(0xe306);
2622 #endif
2623                 INTPTR size=pointer[0];
2624                 int i;
2625                 for(i=1; i<=size; i++) {
2626 #ifdef DEBUG
2627                   BAMBOO_DEBUGPRINT(0xe307);
2628 #endif
2629                   unsigned int offset=pointer[i];
2630                   void * objptr=*((void **)(((char *)ptr)+offset));
2631 #ifdef DEBUG
2632                   BAMBOO_DEBUGPRINT_REG(objptr);
2633 #endif
2634                   if(objptr != NULL) {
2635                         void * dst = flushObj(objptr);
2636                         if(dst != NULL) {
2637                           *((void **)(((char *)ptr)+offset)) = dst;
2638                         }
2639                   }
2640                 } // for(i=1; i<=size; i++)
2641       }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
2642       // restore the mark field, indicating that this obj has been flushed
2643       if(ISSHAREDOBJ(ptr)) {
2644                 ((int *)(ptr))[6] = INIT;
2645       }
2646     }  // if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED))
2647   }   // while(gc_moreItems())
2648 #ifdef DEBUG
2649   BAMBOO_DEBUGPRINT(0xe308);
2650 #endif
2651
2652   // TODO bug here: the startup core contains all lobjs' info, thus all the
2653   // lobjs are flushed in sequence.
2654   // flush lobjs
2655   while(gc_lobjmoreItems_I()) {
2656 #ifdef DEBUG
2657     BAMBOO_DEBUGPRINT(0xe309);
2658 #endif
2659     void * ptr = gc_lobjdequeue_I(NULL, NULL);
2660     ptr = flushObj(ptr);
2661 #ifdef DEBUG
2662     BAMBOO_DEBUGPRINT(0xe30a);
2663     BAMBOO_DEBUGPRINT_REG(ptr);
2664     BAMBOO_DEBUGPRINT_REG(tptr);
2665     BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
2666 #endif
2667     if(ptr == NULL) {
2668       BAMBOO_EXIT(0xb106);
2669     }
2670     if(((int *)(ptr))[6] == COMPACTED) {
2671       int type = ((int *)(ptr))[0];
2672       // scan all pointers in ptr
2673       unsigned INTPTR * pointer;
2674       pointer=pointerarray[type];
2675 #ifdef DEBUG
2676       BAMBOO_DEBUGPRINT(0xe30b);
2677       BAMBOO_DEBUGPRINT_REG(pointer);
2678 #endif
2679       if (pointer==0) {
2680                 /* Array of primitives */
2681                 /* Do nothing */
2682       } else if (((INTPTR)pointer)==1) {
2683 #ifdef DEBUG
2684                 BAMBOO_DEBUGPRINT(0xe30c);
2685 #endif
2686                 /* Array of pointers */
2687                 struct ArrayObject *ao=(struct ArrayObject *) ptr;
2688                 int length=ao->___length___;
2689                 int j;
2690                 for(j=0; j<length; j++) {
2691 #ifdef DEBUG
2692                   BAMBOO_DEBUGPRINT(0xe30d);
2693 #endif
2694                   void *objptr=
2695                         ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
2696 #ifdef DEBUG
2697                   BAMBOO_DEBUGPRINT_REG(objptr);
2698 #endif
2699                   if(objptr != NULL) {
2700                         void * dst = flushObj(objptr);
2701                         if(dst != NULL) {
2702                           ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
2703                         }
2704                   }
2705                 }
2706       } else {
2707 #ifdef DEBUG
2708                 BAMBOO_DEBUGPRINT(0xe30e);
2709 #endif
2710                 INTPTR size=pointer[0];
2711                 int i;
2712                 for(i=1; i<=size; i++) {
2713 #ifdef DEBUG
2714                   BAMBOO_DEBUGPRINT(0xe30f);
2715 #endif
2716                   unsigned int offset=pointer[i];
2717                   void * objptr=*((void **)(((char *)ptr)+offset));
2718
2719 #ifdef DEBUG
2720                   BAMBOO_DEBUGPRINT_REG(objptr);
2721 #endif
2722                   if(objptr != NULL) {
2723                         void * dst = flushObj(objptr);
2724                         if(dst != NULL) {
2725                           *((void **)(((char *)ptr)+offset)) = dst;
2726                         }
2727                   }
2728                 }  // for(i=1; i<=size; i++)
2729       }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
2730       // restore the mark field, indicating that this obj has been flushed
2731       ((int *)(ptr))[6] = INIT;
2732     }     // if(((int *)(ptr))[6] == COMPACTED)
2733   }     // while(gc_lobjmoreItems())
2734 #ifdef DEBUG
2735   BAMBOO_DEBUGPRINT(0xe310);
2736 #endif
2737
2738   // send flush finish message to core coordinator
2739   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
2740     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
2741   } else {
2742     send_msg_2(STARTUPCORE, GCFINISHFLUSH, BAMBOO_NUM_OF_CORE, false);
2743   }
2744 #ifdef DEBUG
2745   BAMBOO_DEBUGPRINT(0xe311);
2746 #endif
2747 } // flush()
2748
2749 #ifdef GC_CACHE_ADAPT
2750 // prepare for cache adaption:
2751 //   -- flush the shared heap
2752 //   -- clean dtlb entries
2753 //   -- change cache strategy
2754 void cacheAdapt_gc(bool isgccachestage) {
2755   // flush the shared heap
2756   BAMBOO_CACHE_FLUSH_L2();
2757
2758   // clean the dtlb entries
2759   BAMBOO_CLEAN_DTLB();
2760
2761   // change the cache strategy
2762   gccachestage = isgccachestage;
2763 } // cacheAdapt_gc(bool isgccachestage)
2764
2765 // the master core decides how to adapt cache strategy for the mutator
2766 // according to collected statistic data
2767
2768 // make all pages hfh
2769 int cacheAdapt_policy_h4h(){
2770   unsigned int page_index = 0;
2771   VA page_sva = 0;
2772   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2773   int numchanged = 0;
2774   int * tmp_p = gccachepolicytbl+1;
2775   for(page_index = 0; page_index < page_num; page_index++) {
2776         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2777         bamboo_cache_policy_t policy = {0};
2778         policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
2779         *tmp_p = page_index;
2780         tmp_p++;
2781         *tmp_p = policy.word;
2782         tmp_p++;
2783         numchanged++;
2784   }
2785
2786   return numchanged;
2787 } // int cacheAdapt_policy_hfh()
2788
2789 // make all pages local as non-cache-adaptable gc local mode
2790 int cacheAdapt_policy_local(){
2791   unsigned int page_index = 0;
2792   VA page_sva = 0;
2793   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2794   int numchanged = 0;
2795   int * tmp_p = gccachepolicytbl+1;
2796   for(page_index = 0; page_index < page_num; page_index++) {
2797         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2798         bamboo_cache_policy_t policy = {0};
2799         int block = 0;
2800         BLOCKINDEX(page_sva, &block);
2801         int coren = gc_block2core[block%(NUMCORES4GC*2)];
2802         // locally cache the page in the hotest core
2803         // NOTE: (x,y) should be changed to (x+1, y+1)!!!
2804         policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
2805         policy.lotar_x = bamboo_cpu2coords[2*coren]+1;
2806         policy.lotar_y = bamboo_cpu2coords[2*coren+1]+1;
2807         *tmp_p = page_index;
2808         tmp_p++;
2809         *tmp_p = policy.word;
2810         tmp_p++;
2811         numchanged++;
2812   }
2813
2814   return numchanged;
2815 } // int cacheAdapt_policy_local()
2816
2817 int cacheAdapt_policy_hotest(){
2818   unsigned int page_index = 0;
2819   VA page_sva = 0;
2820   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2821   int numchanged = 0;
2822   int * tmp_p = gccachepolicytbl+1;
2823   for(page_index = 0; page_index < page_num; page_index++) {
2824         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2825         bamboo_cache_policy_t policy = {0};
2826         int hotestcore = 0;
2827         unsigned int hotfreq = 0;
2828
2829         unsigned int *local_tbl=&gccachesamplingtbl_r[page_index];
2830         for(int i = 0; i < NUMCORESACTIVE; i++) {
2831           unsigned int freq = *local_tbl;
2832           local_tbl=(unsigned int *)(
2833                   ((char *)local_tbl)+size_cachesamplingtbl_local_r);
2834
2835           // check the freqency, decide if this page is hot for the core
2836           if(hotfreq < freq) {
2837                 hotfreq = freq;
2838                 hotestcore = i;
2839           }
2840         }
2841         // TODO
2842         // Decide the cache strategy for this page
2843         // If decide to adapt a new cache strategy, write into the shared block of
2844         // the gcsharedsamplingtbl. The mem recording information that has been
2845         // written is enough to hold the information.
2846         // Format: page start va + cache strategy(hfh/(host core+[x,y]))
2847         if(hotfreq == 0) {
2848           // this page has not been accessed, do not change its cache policy
2849           continue;
2850         } else {
2851           // locally cache the page in the hotest core
2852           // NOTE: (x,y) should be changed to (x+1, y+1)!!!
2853           policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
2854           policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
2855           policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
2856           *tmp_p = page_index;
2857           tmp_p++;
2858           *tmp_p = policy.word;
2859           tmp_p++;
2860           numchanged++;
2861         }
2862   }
2863
2864   return numchanged;
2865 } // int cacheAdapt_policy_hotest()
2866
2867 #define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  50
2868 // cache the page on the core that accesses it the most if that core accesses
2869 // it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total.  Otherwise,
2870 // h4h the page.
2871 int cacheAdapt_policy_dominate(){
2872   unsigned int page_index = 0;
2873   VA page_sva = 0;
2874   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2875   int numchanged = 0;
2876   int * tmp_p = gccachepolicytbl+1;
2877   for(page_index = 0; page_index < page_num; page_index++) {
2878         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2879         bamboo_cache_policy_t policy = {0};
2880         int hotestcore = 0;
2881         unsigned long long totalfreq = 0;
2882         unsigned int hotfreq = 0;
2883
2884         unsigned int *local_tbl=&gccachesamplingtbl_r[page_index];
2885         for(int i = 0; i < NUMCORESACTIVE; i++) {
2886           unsigned int freq = *local_tbl;
2887           local_tbl=(unsigned int *)(
2888                   ((char *)local_tbl)+size_cachesamplingtbl_local_r);
2889           totalfreq += freq;
2890           // check the freqency, decide if this page is hot for the core
2891           if(hotfreq < freq) {
2892                 hotfreq = freq;
2893                 hotestcore = i;
2894           }
2895         }
2896
2897         // Decide the cache strategy for this page
2898         // If decide to adapt a new cache strategy, write into the shared block of
2899         // the gcpolicytbl
2900         // Format: page start va + cache policy
2901         if(hotfreq == 0) {
2902           // this page has not been accessed, do not change its cache policy
2903           continue;
2904         }
2905         totalfreq =
2906           (totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)/100/BAMBOO_PAGE_SIZE;
2907         hotfreq/=BAMBOO_PAGE_SIZE;
2908         if(hotfreq < totalfreq) {
2909           // use hfh
2910           policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
2911         } else {
2912           // locally cache the page in the hotest core
2913           // NOTE: (x,y) should be changed to (x+1, y+1)!!!
2914           policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
2915           policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
2916           policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
2917         }
2918         *tmp_p = page_index;
2919         tmp_p++;
2920         *tmp_p = policy.word;
2921         tmp_p++;
2922         numchanged++;
2923   }
2924
2925   return numchanged;
2926 } // int cacheAdapt_policy_dominate()
2927
2928 #define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 10
2929
2930 void gc_quicksort(unsigned long long *array,
2931                       int left,
2932                                   int right,
2933                                   int offset) {
2934   int pivot = 0;;
2935   int leftIdx = left;
2936   int rightIdx = right;
2937   if((right-left+1) >= 1) {
2938         pivot = (left+right)/2;
2939         while((leftIdx <= pivot) && (rightIdx >= pivot)) {
2940           int pivotValue = array[pivot*3-offset];
2941           while((array[leftIdx*3-offset] > pivotValue) && (leftIdx <= pivot)) {
2942                 leftIdx++;
2943           }
2944           while((array[rightIdx*3-offset] < pivotValue) && (rightIdx >= pivot)) {
2945                 rightIdx--;
2946           }
2947           // swap [leftIdx] & [rightIdx]
2948           for(int k = 0; k < 3; k++) {
2949                 unsigned long long tmp = array[3*rightIdx-k];
2950                 array[3*rightIdx-k] = array[3*leftIdx-k];
2951                 array[3*leftIdx-k] = tmp;
2952           }
2953           leftIdx++;
2954           rightIdx--;
2955           if((leftIdx-1) == pivot) {
2956                 pivot = rightIdx = rightIdx + 1;
2957           } else if((leftIdx+1) == pivot) {
2958                 pivot = leftIdx = leftIdx-1;
2959           }
2960         }
2961         gc_quicksort(array, left, pivot-1, offset);
2962         gc_quicksort(array, pivot+1, right, offset);
2963   }
2964   return;
2965 } // void gc_quicksort(...)
2966
2967 // Every page cached on the core that accesses it the most.
2968 // Check to see if any core's pages total more accesses than threshold
2969 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the
2970 // most remote accesses and hash for home them until we get below
2971 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD
2972 int cacheAdapt_policy_overload(){
2973   unsigned int page_index = 0;
2974   VA page_sva = 0;
2975   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2976   int numchanged = 0;
2977   int * tmp_p = gccachepolicytbl+1;
2978   unsigned long long workload[NUMCORESACTIVE];
2979   memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
2980   unsigned long long total_workload = 0;
2981   unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
2982   memset(core2heavypages,0,
2983           sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
2984   for(page_index = 0; page_index < page_num; page_index++) {
2985         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2986         bamboo_cache_policy_t policy = {0};
2987         int hotestcore = 0;
2988         unsigned long long totalfreq = 0;
2989         unsigned int hotfreq = 0;
2990
2991         unsigned int *local_tbl=&gccachesamplingtbl_r[page_index];
2992         for(int i = 0; i < NUMCORESACTIVE; i++) {
2993           unsigned int freq = *local_tbl;
2994           local_tbl=(unsigned int *)(
2995                   ((char *)local_tbl)+size_cachesamplingtbl_local_r);
2996           totalfreq += freq;
2997           // check the freqency, decide if this page is hot for the core
2998           if(hotfreq < freq) {
2999                 hotfreq = freq;
3000                 hotestcore = i;
3001           }
3002         }
3003         // Decide the cache strategy for this page
3004         // If decide to adapt a new cache strategy, write into the shared block of
3005         // the gcsharedsamplingtbl. The mem recording information that has been
3006         // written is enough to hold the information.
3007         // Format: page start va + cache strategy(hfh/(host core+[x,y]))
3008         if(hotfreq == 0) {
3009           // this page has not been accessed, do not change its cache policy
3010           continue;
3011         }
3012
3013         totalfreq/=BAMBOO_PAGE_SIZE;
3014         hotfreq/=BAMBOO_PAGE_SIZE;
3015         // locally cache the page in the hotest core
3016         // NOTE: (x,y) should be changed to (x+1, y+1)!!!
3017         policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
3018         policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
3019         policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
3020         *tmp_p = page_index;
3021         tmp_p++;
3022         *tmp_p = policy.word;
3023         tmp_p++;
3024         numchanged++;
3025         workload[hotestcore] += totalfreq;
3026         total_workload += totalfreq;
3027         // insert into core2heavypages using quicksort
3028         unsigned long long remoteaccess = totalfreq - hotfreq;
3029         int index = (int)core2heavypages[hotestcore][0];
3030         core2heavypages[hotestcore][3*index+3] = remoteaccess;
3031         core2heavypages[hotestcore][3*index+2] = totalfreq;
3032         core2heavypages[hotestcore][3*index+1] = (unsigned long long)(tmp_p-1);
3033         core2heavypages[hotestcore][0]++;
3034   }
3035
3036   unsigned long long workload_threshold =
3037         total_workload/GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
3038   // Check the workload of each core
3039   for(int i = 0; i < NUMCORESACTIVE; i++) {
3040         int j = 1;
3041         int index = (int)core2heavypages[i][0];
3042         if(workload[i] > workload_threshold) {
3043           // sort according to the remoteaccess
3044           gc_quicksort(&core2heavypages[i][0], 1, index, 0);
3045           while((workload[i] > workload_threshold) && (j<index*3)) {
3046                 // hfh those pages with more remote accesses
3047                 bamboo_cache_policy_t policy = {0};
3048                 policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3049                 *((int*)core2heavypages[i][j]) = policy.word;
3050                 workload[i] -= core2heavypages[i][j+1];
3051                 j += 3;
3052           }
3053         }
3054   }
3055
3056   return numchanged;
3057 } // int cacheAdapt_policy_overload()
3058
3059 #define GC_CACHE_ADAPT_ACCESS_THRESHOLD 70
3060 #define GC_CACHE_ADAPT_CROWD_THRESHOLD  20
3061 // Every page cached on the core that accesses it the most.
3062 // Check to see if any core's pages total more accesses than threshold
3063 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the
3064 // most remote accesses and hash for home them until we get below
3065 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.
3066 // Sort pages based on activity....
3067 // If more then GC_CACHE_ADAPT_ACCESS_THRESHOLD% of the accesses for a
3068 // core's pages are from more than GC_CACHE_ADAPT_CROWD_THRESHOLD pages,
3069 // then start hfh these pages(selecting the ones with the most remote
3070 // accesses first or fewest local accesses) until we get below
3071 // GC_CACHE_ADAPT_CROWD_THRESHOLD pages.
3072 int cacheAdapt_policy_crowd(){
3073   unsigned int page_index = 0;
3074   VA page_sva = 0;
3075   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
3076   int numchanged = 0;
3077   int * tmp_p = gccachepolicytbl+1;
3078   unsigned long long workload[NUMCORESACTIVE];
3079   memset(workload, 0, NUMCORESACTIVE*sizeof(unsigned long long));
3080   unsigned long long total_workload = 0;
3081   unsigned long long core2heavypages[NUMCORESACTIVE][page_num*3+1];
3082   memset(core2heavypages,0,
3083           sizeof(unsigned long long)*(page_num*3+1)*NUMCORESACTIVE);
3084   for(page_index = 0; page_index < page_num; page_index++) {
3085         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
3086         bamboo_cache_policy_t policy = {0};
3087         int hotestcore = 0;
3088         unsigned long long totalfreq = 0;
3089         unsigned int hotfreq = 0;
3090
3091         unsigned int *local_tbl=&gccachesamplingtbl_r[page_index];
3092         for(int i = 0; i < NUMCORESACTIVE; i++) {
3093           unsigned int freq = *local_tbl;
3094           local_tbl=(unsigned int *)(
3095                   ((char *)local_tbl)+size_cachesamplingtbl_local_r);
3096           totalfreq += freq;
3097           // check the freqency, decide if this page is hot for the core
3098           if(hotfreq < freq) {
3099                 hotfreq = freq;
3100                 hotestcore = i;
3101           }
3102         }
3103         // Decide the cache strategy for this page
3104         // If decide to adapt a new cache strategy, write into the shared block of
3105         // the gcsharedsamplingtbl. The mem recording information that has been
3106         // written is enough to hold the information.
3107         // Format: page start va + cache strategy(hfh/(host core+[x,y]))
3108         if(hotfreq == 0) {
3109           // this page has not been accessed, do not change its cache policy
3110           continue;
3111         }
3112         totalfreq/=BAMBOO_PAGE_SIZE;
3113         hotfreq/=BAMBOO_PAGE_SIZE;
3114         // locally cache the page in the hotest core
3115         // NOTE: (x,y) should be changed to (x+1, y+1)!!!
3116         policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
3117         policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
3118         policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
3119         *tmp_p = page_index;
3120         tmp_p++;
3121         *tmp_p = policy.word;
3122         tmp_p++;
3123         numchanged++;
3124         workload[hotestcore] += totalfreq;
3125         total_workload += totalfreq;
3126         // insert into core2heavypages using quicksort
3127         unsigned long long remoteaccess = totalfreq - hotfreq;
3128         int index = (int)core2heavypages[hotestcore][0];
3129         core2heavypages[hotestcore][3*index+3] = remoteaccess;
3130         core2heavypages[hotestcore][3*index+2] = totalfreq;
3131         core2heavypages[hotestcore][3*index+1] = (unsigned long long)(tmp_p-1);
3132         core2heavypages[hotestcore][0]++;
3133   }
3134
3135   unsigned long long workload_threshold =
3136         total_workload / GC_CACHE_ADAPT_OVERLOAD_THRESHOLD;
3137   // Check the workload of each core
3138   for(int i = 0; i < NUMCORESACTIVE; i++) {
3139         int j = 1;
3140         int index = (int)core2heavypages[i][0];
3141         if(workload[i] > workload_threshold) {
3142           // sort according to the remoteaccess
3143           gc_quicksort(&core2heavypages[i][0], 1, index, 0);
3144           while((workload[i] > workload_threshold) && (j<index*3)) {
3145                 // hfh those pages with more remote accesses
3146                 bamboo_cache_policy_t policy = {0};
3147                 policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3148                 *((int*)core2heavypages[i][j]) = policy.word;
3149                 workload[i] -= core2heavypages[i][j+1];
3150                 j += 3;
3151           }
3152         }
3153
3154         // Check if the accesses are crowded on few pages
3155         // sort according to the total access
3156 inner_crowd:
3157         gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
3158         unsigned long long threshold =
3159           GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
3160         int num_crowded = 0;
3161         unsigned long long t_workload = 0;
3162         do {
3163           t_workload += core2heavypages[i][j+num_crowded*3+1];
3164           num_crowded++;
3165         } while(t_workload < threshold);
3166         // num_crowded <= GC_CACHE_ADAPT_CROWD_THRESHOLD and if there are enough
3167         // items, it is always == GC_CACHE_ADAPT_CROWD_THRESHOLD
3168         if(num_crowded > GC_CACHE_ADAPT_CROWD_THRESHOLD) {
3169           // need to hfh these pages
3170           // sort the pages according to remote access
3171           gc_quicksort(&core2heavypages[i][0], j/3+1, j/3+num_crowded, 0);
3172           // h4h those pages with more remote accesses
3173           bamboo_cache_policy_t policy = {0};
3174           policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3175           *((int*)core2heavypages[i][j]) = policy.word;
3176           workload[i] -= core2heavypages[i][j+1];
3177           t_workload -= core2heavypages[i][j+1];
3178           j += 3;
3179           threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
3180           goto inner_crowd;
3181         }
3182   }
3183
3184   return numchanged;
3185 } // int cacheAdapt_policy_overload()
3186
3187 void cacheAdapt_master() {
3188 #ifdef GC_CACHE_ADAPT_SAMPLING_OUTPUT
3189   gc_output_cache_sampling_r();
3190 #endif // GC_CACHE_ADAPT_SAMPLING_OUTPUT
3191   int numchanged = 0;
3192   // check the statistic data
3193   // for each page, decide the new cache strategy
3194 #ifdef GC_CACHE_ADAPT_POLICY1
3195   numchanged = cacheAdapt_policy_h4h();
3196 #elif defined GC_CACHE_ADAPT_POLICY2
3197   numchanged = cacheAdapt_policy_local();
3198 #elif defined GC_CACHE_ADAPT_POLICY3
3199   numchanged = cacheAdapt_policy_hotest();
3200 #elif defined GC_CACHE_ADAPT_POLICY4
3201   numchanged = cacheAdapt_policy_dominate();
3202 #elif defined GC_CACHE_ADAPT_POLICY5
3203   numchanged = cacheAdapt_policy_overload();
3204 #elif defined GC_CACHE_ADAPT_POLICY6
3205   numchanged = cacheAdapt_policy_crowd();
3206 #endif
3207   *gccachepolicytbl = numchanged;
3208 }
3209
3210 // adapt the cache strategy for the mutator
3211 void cacheAdapt_mutator() {
3212   int numchanged = *gccachepolicytbl;
3213   // check the changes and adapt them
3214   int * tmp_p = gccachepolicytbl+1;
3215   while(numchanged--) {
3216         // read out the policy
3217         int page_index = *tmp_p;
3218         bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p+1));
3219         // adapt the policy
3220         bamboo_adapt_cache_policy(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva,
3221                 policy, BAMBOO_PAGE_SIZE);
3222
3223         tmp_p += 2;
3224   }
3225 }
3226
3227 void gc_output_cache_sampling() {
3228   unsigned int page_index = 0;
3229   VA page_sva = 0;
3230   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
3231   for(page_index = 0; page_index < page_num; page_index++) {
3232         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
3233         int block = 0;
3234         BLOCKINDEX(page_sva, &block);
3235         int coren = gc_block2core[block%(NUMCORES4GC*2)];
3236         tprintf("va: %x page_index: %d host: %d\n",
3237                 (int)page_sva, page_index, coren);
3238         for(int i = 0; i < NUMCORESACTIVE; i++) {
3239           unsigned int * local_tbl = (unsigned int *)((void *)gccachesamplingtbl
3240                   +size_cachesamplingtbl_local*i);
3241           unsigned int freq = local_tbl[page_index];
3242           printf("%8d ",freq);
3243         }
3244         printf("\n");
3245   }
3246   printf("=================\n");
3247 } // gc_output_cache_sampling
3248
3249 void gc_output_cache_sampling_r() {
3250   unsigned int page_index = 0;
3251   VA page_sva = 0;
3252   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
3253   for(page_index = 0; page_index < page_num; page_index++) {
3254         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
3255         int block = 0;
3256         BLOCKINDEX(page_sva, &block);
3257         int coren = gc_block2core[block%(NUMCORES4GC*2)];
3258         tprintf("va: %x page_index: %d host: %d\n",
3259                 (int)page_sva, page_index, coren);
3260         for(int i = 0; i < NUMCORESACTIVE; i++) {
3261           unsigned int * local_tbl = (unsigned int *)((void *)gccachesamplingtbl_r
3262                   +size_cachesamplingtbl_local_r*i);
3263           unsigned int freq = local_tbl[page_index]/BAMBOO_PAGE_SIZE;
3264           printf("%8d ",freq);
3265         }
3266         printf("\n");
3267   }
3268   printf("=================\n");
3269 } // gc_output_cache_sampling
3270 #endif // GC_CACHE_ADAPT
3271
3272 inline void gc_collect(struct garbagelist * stackptr) {
3273   // inform the master that this core is at a gc safe point and is ready to
3274   // do gc
3275   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
3276           self_numreceiveobjs, false);
3277
3278   // core collector routine
3279   while(true) {
3280     if(INITPHASE == gcphase) {
3281       break;
3282     }
3283   }
3284 #ifdef RAWPATH // TODO GC_DEBUG
3285   printf("(%X,%X) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
3286 #endif
3287   initGC();
3288 #ifdef GC_CACHE_ADAPT
3289   // prepare for cache adaption:
3290   cacheAdapt_gc(true);
3291 #endif // GC_CACHE_ADAPT
3292   //send init finish msg to core coordinator
3293   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
3294
3295   while(true) {
3296     if(MARKPHASE == gcphase) {
3297       break;
3298     }
3299   }
3300 #ifdef RAWPATH // TODO GC_DEBUG
3301   printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(),
3302              udn_tile_coord_y());
3303 #endif
3304   mark(true, stackptr);
3305 #ifdef RAWPATH // TODO GC_DEBUG
3306   printf("(%x,%x) Finish mark phase, start compact phase\n",
3307              udn_tile_coord_x(), udn_tile_coord_y());
3308 #endif
3309   compact();
3310 #ifdef RAWPATH // TODO GC_DEBUG
3311   printf("(%x,%x) Finish compact phase\n", udn_tile_coord_x(),
3312              udn_tile_coord_y());
3313 #endif
3314
3315   while(true) {
3316         if(MAPPHASE == gcphase) {
3317           break;
3318         }
3319   }
3320 #ifdef RAWPATH // TODO GC_DEBUG
3321   printf("(%x,%x) Start map phase\n", udn_tile_coord_x(),
3322              udn_tile_coord_y());
3323 #endif
3324   transmappinginfo();
3325 #ifdef RAWPATH // TODO GC_DEBUG
3326   printf("(%x,%x) Finish map phase\n", udn_tile_coord_x(),
3327              udn_tile_coord_y());
3328 #endif
3329
3330   while(true) {
3331     if(FLUSHPHASE == gcphase) {
3332       break;
3333     }
3334   }
3335 #ifdef RAWPATH // TODO GC_DEBUG
3336   printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(),
3337              udn_tile_coord_y());
3338 #endif
3339 #ifdef GC_PROFILE
3340   // send the num of obj/liveobj/forwardobj to the startupcore
3341   if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
3342         send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj,
3343                 gc_num_liveobj, gc_num_forwardobj, false);
3344   }
3345   gc_num_obj = 0;
3346 #endif // GC_PROFLIE
3347   flush(stackptr);
3348 #ifdef RAWPATH // TODO GC_DEBUG
3349   printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
3350              udn_tile_coord_y());
3351 #endif
3352
3353 #ifdef GC_CACHE_ADAPT
3354   while(true) {
3355     if(PREFINISHPHASE == gcphase) {
3356       break;
3357     }
3358   }
3359 #ifdef RAWPATH // TODO GC_DEBUG
3360   printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(),
3361              udn_tile_coord_y());
3362 #endif
3363   // cache adapt phase
3364   cacheAdapt_mutator();
3365   cacheAdapt_gc(false);
3366   //send init finish msg to core coordinator
3367   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
3368 #ifdef RAWPATH // TODO GC_DEBUG
3369   printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
3370              udn_tile_coord_y());
3371 #endif
3372 #endif // GC_CACHE_ADAPT
3373
3374   while(true) {
3375     if(FINISHPHASE == gcphase) {
3376       break;
3377     }
3378   }
3379 #ifdef RAWPATH // TODO GC_DEBUG
3380   printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
3381 #endif
3382 } // void gc_collect(struct garbagelist * stackptr)
3383
3384 inline void gc_nocollect(struct garbagelist * stackptr) {
3385   // inform the master that this core is at a gc safe point and is ready to
3386   // do gc
3387   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs,
3388           self_numreceiveobjs, false);
3389
3390   while(true) {
3391     if(INITPHASE == gcphase) {
3392       break;
3393     }
3394   }
3395 #ifdef RAWPATH // TODO GC_DEBUG
3396   printf("(%x,%x) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
3397 #endif
3398   initGC();
3399 #ifdef GC_CACHE_ADAPT
3400   // prepare for cache adaption:
3401   cacheAdapt_gc(true);
3402 #endif // GC_CACHE_ADAPT
3403   //send init finish msg to core coordinator
3404   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
3405
3406   while(true) {
3407     if(MARKPHASE == gcphase) {
3408       break;
3409     }
3410   }
3411 #ifdef RAWPATH // TODO GC_DEBUG
3412   printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(),
3413              udn_tile_coord_y());
3414 #endif
3415   mark(true, stackptr);
3416 #ifdef RAWPATH // TODO GC_DEBUG
3417   printf("(%x,%x) Finish mark phase, wait for flush\n",
3418              udn_tile_coord_x(), udn_tile_coord_y());
3419 #endif
3420
3421   // non-gc core collector routine
3422   while(true) {
3423     if(FLUSHPHASE == gcphase) {
3424       break;
3425     }
3426   }
3427 #ifdef RAWPATH // TODO GC_DEBUG
3428   printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(),
3429              udn_tile_coord_y());
3430 #endif
3431 #ifdef GC_PROFILE
3432   if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
3433         send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj,
3434                 gc_num_liveobj, gc_num_forwardobj, false);
3435   }
3436   gc_num_obj = 0;
3437 #endif // GC_PROFLIE
3438   flush(stackptr);
3439 #ifdef RAWPATH // TODO GC_DEBUG
3440   printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
3441              udn_tile_coord_y());
3442 #endif
3443
3444 #ifdef GC_CACHE_ADAPT
3445   while(true) {
3446     if(PREFINISHPHASE == gcphase) {
3447       break;
3448     }
3449   }
3450 #ifdef RAWPATH // TODO GC_DEBUG
3451   printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(),
3452              udn_tile_coord_y());
3453 #endif
3454   // cache adapt phase
3455   cacheAdapt_mutator();
3456   cacheAdapt_gc(false);
3457   //send init finish msg to core coordinator
3458   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
3459 #ifdef RAWPATH // TODO GC_DEBUG
3460   printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
3461              udn_tile_coord_y());
3462 #endif
3463 #endif // GC_CACHE_ADAPT
3464
3465   while(true) {
3466     if(FINISHPHASE == gcphase) {
3467       break;
3468     }
3469   }
3470 #ifdef RAWPATH // TODO GC_DEBUG
3471   printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
3472 #endif
3473 } // void gc_collect(struct garbagelist * stackptr)
3474
3475 inline void gc_master(struct garbagelist * stackptr) {
3476
3477   gcphase = INITPHASE;
3478   int i = 0;
3479   waitconfirm = false;
3480   numconfirm = 0;
3481   initGC();
3482
3483   // Note: all cores need to init gc including non-gc cores
3484   for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; i++) {
3485         // send GC init messages to all cores
3486         send_msg_1(i, GCSTARTINIT, false);
3487   }
3488   bool isfirst = true;
3489   bool allStall = false;
3490
3491 #ifdef GC_CACHE_ADAPT
3492   // prepare for cache adaption:
3493   cacheAdapt_gc(true);
3494 #endif // GC_CACHE_ADAPT
3495
3496 #ifdef RAWPATH // TODO GC_DEBUG
3497   printf("(%x,%x) Check core status \n", udn_tile_coord_x(),
3498                  udn_tile_coord_y());
3499 #endif
3500
3501   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3502   while(true) {
3503         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3504         if(gc_checkAllCoreStatus_I()) {
3505           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3506           break;
3507         }
3508         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3509   }
3510 #ifdef GC_PROFILE
3511   gc_profileItem();
3512 #endif
3513 #ifdef GC_CACHE_ADAPT_POLICY_OUTPUT
3514   gc_output_cache_sampling();
3515 #endif // GC_CACHE_ADAPT
3516 #ifdef RAWPATH // TODO GC_DEBUG
3517   printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(),
3518                  udn_tile_coord_y());
3519 #endif
3520   // all cores have finished compacting
3521   // restore the gcstatus of all cores
3522   // Note: all cores have to do mark including non-gc cores
3523   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3524   for(i = 1; i < NUMCORESACTIVE; ++i) {
3525         gccorestatus[i] = 1;
3526         // send GC start messages to all cores
3527         send_msg_1(i, GCSTART, false);
3528   }
3529
3530   gcphase = MARKPHASE;
3531   // mark phase
3532   while(MARKPHASE == gcphase) {
3533         mark(isfirst, stackptr);
3534         if(isfirst) {
3535           isfirst = false;
3536         }
3537
3538         // check gcstatus
3539         checkMarkStatue();
3540   }   // while(MARKPHASE == gcphase)
3541   // send msgs to all cores requiring large objs info
3542   // Note: only need to ask gc cores, non-gc cores do not host any objs
3543   numconfirm = NUMCORES4GC - 1;
3544   for(i = 1; i < NUMCORES4GC; ++i) {
3545         send_msg_1(i, GCLOBJREQUEST, false);
3546   }
3547   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
3548   while(true) {
3549         if(numconfirm==0) {
3550           break;
3551         }
3552   }   // wait for responses
3553   // check the heaptop
3554   if(gcheaptop < gcmarkedptrbound) {
3555         gcheaptop = gcmarkedptrbound;
3556   }
3557 #ifdef GC_PROFILE
3558   gc_profileItem();
3559 #endif
3560 #ifdef RAWPATH // TODO GC_DEBUG
3561   printf("(%x,%x) prepare to cache large objs \n", udn_tile_coord_x(),
3562                  udn_tile_coord_y());
3563 #endif
3564   // cache all large objs
3565   if(!cacheLObjs()) {
3566         // no enough space to cache large objs
3567         BAMBOO_EXIT(0xb107);
3568   }
3569   // predict number of blocks to fill for each core
3570   int tmpheaptop = 0;
3571   int numpbc = loadbalance(&tmpheaptop);
3572   // TODO
3573   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
3574 #ifdef RAWPATH // TODO GC_DEBUG
3575   printf("(%x,%x) mark phase finished \n", udn_tile_coord_x(),
3576                  udn_tile_coord_y());
3577 #endif
3578   //int tmptopptr = 0;
3579   //BASEPTR(gctopcore, 0, &tmptopptr);
3580   // TODO
3581   //tmptopptr = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
3582   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
3583 #ifdef DEBUG
3584   BAMBOO_DEBUGPRINT(0xabab);
3585   BAMBOO_DEBUGPRINT_REG(tmptopptr);
3586 #endif
3587   for(i = 0; i < NUMCORES4GC; ++i) {
3588         int tmpcoreptr = 0;
3589         BASEPTR(i, numpbc, &tmpcoreptr);
3590         //send start compact messages to all cores
3591         //TODO bug here, do not know if the direction is positive or negtive?
3592         if (tmpcoreptr < tmpheaptop) {
3593           gcstopblock[i] = numpbc + 1;
3594           if(i != STARTUPCORE) {
3595                 send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
3596           } else {
3597                 gcblock2fill = numpbc+1;
3598           }   // if(i != STARTUPCORE)
3599         } else {
3600           gcstopblock[i] = numpbc;
3601           if(i != STARTUPCORE) {
3602                 send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
3603           } else {
3604                 gcblock2fill = numpbc;
3605           }  // if(i != STARTUPCORE)
3606         }
3607 #ifdef DEBUG
3608         BAMBOO_DEBUGPRINT(0xf000+i);
3609         BAMBOO_DEBUGPRINT_REG(tmpcoreptr);
3610         BAMBOO_DEBUGPRINT_REG(gcstopblock[i]);
3611 #endif
3612         // init some data strutures for compact phase
3613         gcloads[i] = 0;
3614         gcfilledblocks[i] = 0;
3615         gcrequiredmems[i] = 0;
3616   }
3617
3618   BAMBOO_CACHE_MF();
3619
3620 #ifdef GC_PROFILE
3621   gc_profileItem();
3622 #endif
3623
3624   // compact phase
3625   bool finalcompact = false;
3626   // initialize pointers for comapcting
3627   struct moveHelper * orig =
3628         (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
3629   struct moveHelper * to =
3630         (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
3631   initOrig_Dst(orig, to);
3632   int filledblocks = 0;
3633   INTPTR heaptopptr = 0;
3634   bool finishcompact = false;
3635   bool iscontinue = true;
3636   bool localcompact = true;
3637   while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
3638         if((!finishcompact) && iscontinue) {
3639 #ifdef DEBUG
3640           BAMBOO_DEBUGPRINT(0xe001);
3641           BAMBOO_DEBUGPRINT_REG(numpbc);
3642           BAMBOO_DEBUGPRINT_REG(gcblock2fill);
3643 #endif
3644           finishcompact = compacthelper(orig, to, &filledblocks,
3645                                                                         &heaptopptr, &localcompact);
3646 #ifdef DEBUG
3647           BAMBOO_DEBUGPRINT(0xe002);
3648           BAMBOO_DEBUGPRINT_REG(finishcompact);
3649           BAMBOO_DEBUGPRINT_REG(gctomove);
3650           BAMBOO_DEBUGPRINT_REG(gcrequiredmems[0]);
3651           BAMBOO_DEBUGPRINT_REG(gcfilledblocks[0]);
3652           BAMBOO_DEBUGPRINT_REG(gcstopblock[0]);
3653 #endif
3654         }
3655
3656         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3657         if(gc_checkCoreStatus_I()) {
3658           // all cores have finished compacting
3659           // restore the gcstatus of all cores
3660           for(i = 0; i < NUMCORES4GC; ++i) {
3661                 gccorestatus[i] = 1;
3662           }
3663           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3664           break;
3665         } else {
3666           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3667           // check if there are spare mem for pending move requires
3668           if(COMPACTPHASE == gcphase) {
3669 #ifdef DEBUG
3670                 BAMBOO_DEBUGPRINT(0xe003);
3671 #endif
3672                 resolvePendingMoveRequest();
3673 #ifdef DEBUG
3674                 BAMBOO_DEBUGPRINT_REG(gctomove);
3675 #endif
3676           } else {
3677 #ifdef DEBUG
3678                 BAMBOO_DEBUGPRINT(0xe004);
3679 #endif
3680                 compact2Heaptop();
3681           }
3682         }   // if(gc_checkCoreStatus_I()) else ...
3683
3684         if(gctomove) {
3685 #ifdef DEBUG
3686           BAMBOO_DEBUGPRINT(0xe005);
3687           BAMBOO_DEBUGPRINT_REG(gcmovestartaddr);
3688           BAMBOO_DEBUGPRINT_REG(gcblock2fill);
3689           BAMBOO_DEBUGPRINT_REG(gctomove);
3690 #endif
3691           to->ptr = gcmovestartaddr;
3692           to->numblocks = gcblock2fill - 1;
3693           to->bound = (to->numblocks==0) ?
3694                                   BAMBOO_SMEM_SIZE_L :
3695                                   BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
3696           BASEPTR(gcdstcore, to->numblocks, &(to->base));
3697           to->offset = to->ptr - to->base;
3698           to->top = (to->numblocks==0) ?
3699                                 (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
3700           to->base = to->ptr;
3701           to->offset = BAMBOO_CACHE_LINE_SIZE;
3702           to->ptr += to->offset;                         // for header
3703           to->top += to->offset;
3704           if(gcdstcore == BAMBOO_NUM_OF_CORE) {
3705                 localcompact = true;
3706           } else {
3707                 localcompact = false;
3708           }
3709           gctomove = false;
3710           iscontinue = true;
3711         } else if(!finishcompact) {
3712           // still pending
3713           iscontinue = false;
3714         }  // if(gctomove)
3715   }  // while(COMPACTPHASE == gcphase)
3716 #ifdef GC_PROFILE
3717   gc_profileItem();
3718 #endif
3719 #ifdef RAWPATH // TODO GC_DEBUG
3720   printf("(%x,%x) prepare to move large objs \n", udn_tile_coord_x(),
3721                  udn_tile_coord_y());
3722 #endif
3723   // move largeObjs
3724   moveLObjs();
3725 #ifdef RAWPATH // TODO GC_DEBUG
3726   printf("(%x,%x) compact phase finished \n", udn_tile_coord_x(),
3727                  udn_tile_coord_y());
3728 #endif
3729   RUNFREE(orig);
3730   RUNFREE(to);
3731   orig = to = NULL;
3732
3733   gcphase = MAPPHASE;
3734   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3735   // Note: all cores should flush their runtime data including non-gc
3736   //       cores
3737   for(i = 1; i < NUMCORES4GC; ++i) {
3738         // send start flush messages to all cores
3739         gccorestatus[i] = 1;
3740         send_msg_1(i, GCSTARTMAPINFO, false);
3741   }
3742 #ifdef GC_PROFILE
3743   gc_profileItem();
3744 #endif
3745 #ifdef RAWPATH // TODO GC_DEBUG
3746   printf("(%x,%x) Start map phase \n", udn_tile_coord_x(),
3747                  udn_tile_coord_y());
3748 #endif
3749   // mapinto phase
3750   transmappinginfo();
3751 #ifdef RAWPATH // TODO GC_DEBUG
3752   printf("(%x,%x) Finish map phase \n", udn_tile_coord_x(),
3753                  udn_tile_coord_y());
3754 #endif
3755   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3756   while(MAPPHASE == gcphase) {
3757         // check the status of all cores
3758         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3759         if(gc_checkCoreStatus_I()) {
3760           // all cores have finished sending mapping info
3761           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3762           break;
3763         }
3764         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3765   }  // while(MAPPHASE == gcphase)
3766
3767   gcphase = FLUSHPHASE;
3768   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3769   // Note: all cores should flush their runtime data including non-gc
3770   //       cores
3771   for(i = 1; i < NUMCORESACTIVE; ++i) {
3772         // send start flush messages to all cores
3773         gccorestatus[i] = 1;
3774         send_msg_1(i, GCSTARTFLUSH, false);
3775   }
3776 #ifdef GC_PROFILE
3777   gc_profileItem();
3778 #endif
3779 #ifdef RAWPATH // TODO GC_DEBUG
3780   printf("(%x,%x) Start flush phase \n", udn_tile_coord_x(),
3781                  udn_tile_coord_y());
3782 #endif
3783   // flush phase
3784   flush(stackptr);
3785
3786 #ifdef GC_CACHE_ADAPT
3787   // now the master core need to decide the new cache strategy
3788   cacheAdapt_master();
3789 #endif // GC_CACHE_ADAPT
3790
3791   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3792   while(FLUSHPHASE == gcphase) {
3793         // check the status of all cores
3794         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3795         if(gc_checkAllCoreStatus_I()) {
3796           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3797           break;
3798         }
3799         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3800   }  // while(FLUSHPHASE == gcphase)
3801 #ifdef RAWPATH // TODO GC_DEBUG
3802   printf("(%x,%x) Finish flush phase \n", udn_tile_coord_x(),
3803                  udn_tile_coord_y());
3804 #endif
3805
3806 #ifdef GC_CACHE_ADAPT
3807 #ifdef GC_PROFILE
3808   gc_profileItem();
3809 #endif
3810   gcphase = PREFINISHPHASE;
3811   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3812   // Note: all cores should flush their runtime data including non-gc
3813   //       cores
3814   for(i = 1; i < NUMCORESACTIVE; ++i) {
3815         // send start flush messages to all cores
3816         gccorestatus[i] = 1;
3817         send_msg_1(i, GCSTARTPREF, false);
3818   }
3819 #ifdef RAWPATH // TODO GC_DEBUG
3820   printf("(%x,%x) Start prefinish phase \n", udn_tile_coord_x(),
3821                  udn_tile_coord_y());
3822 #endif
3823   // cache adapt phase
3824   cacheAdapt_mutator();
3825 #ifdef GC_CACHE_ADAPT_OUTPUT
3826   bamboo_output_cache_policy();
3827 #endif
3828   cacheAdapt_gc(false);
3829   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3830   while(PREFINISHPHASE == gcphase) {
3831         // check the status of all cores
3832         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3833         if(gc_checkAllCoreStatus_I()) {
3834           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3835           break;
3836         }
3837         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3838   }  // while(PREFINISHPHASE == gcphase)
3839 #endif // GC_CACHE_ADAPT
3840
3841   gcphase = FINISHPHASE;
3842
3843   // invalidate all shared mem pointers
3844   // put it here as it takes time to inform all the other cores to
3845   // finish gc and it might cause problem when some core resumes
3846   // mutator earlier than the other cores
3847   bamboo_cur_msp = NULL;
3848   bamboo_smem_size = 0;
3849   bamboo_smem_zero_top = NULL;
3850   gcflag = false;
3851   gcprocessing = false;
3852
3853 #ifdef GC_PROFILE
3854   gc_profileEnd();
3855 #endif
3856   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3857   for(i = 1; i < NUMCORESACTIVE; ++i) {
3858         // send gc finish messages to all cores
3859         send_msg_1(i, GCFINISH, false);
3860         gccorestatus[i] = 1;
3861   }
3862 #ifdef RAWPATH // TODO GC_DEBUG
3863   printf("(%x,%x) gc finished \n", udn_tile_coord_x(),
3864                  udn_tile_coord_y());
3865 #endif
3866 } // void gc_master(struct garbagelist * stackptr)
3867
3868 inline bool gc(struct garbagelist * stackptr) {
3869   // check if do gc
3870   if(!gcflag) {
3871     gcprocessing = false;
3872     return false;
3873   }
3874
3875 #ifdef GC_CACHE_ADAPT
3876 #ifdef GC_CACHE_SAMPLING
3877     // disable the timer interrupt
3878     bamboo_mask_timer_intr();
3879 #endif
3880 #endif
3881   // core coordinator routine
3882   if(0 == BAMBOO_NUM_OF_CORE) {
3883 #ifdef GC_DEBUG
3884     printf("(%x,%X) Check if can do gc or not\n", udn_tile_coord_x(),
3885                    udn_tile_coord_y());
3886 #endif
3887         bool isallstall = true;
3888         gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3889         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3890         int ti = 0;
3891         for(ti = 0; ti < NUMCORESACTIVE; ++ti) {
3892           if(gccorestatus[ti] != 0) {
3893                 isallstall = false;
3894                 break;
3895           }
3896         }
3897         if(!isallstall) {
3898           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3899           // some of the cores are still executing the mutator and did not reach
3900           // some gc safe point, therefore it is not ready to do gc
3901           // in case that there are some pregc information msg lost, send a confirm
3902           // msg to the 'busy' core
3903           send_msg_1(ti, GCSTARTPRE, false);
3904           gcflag = true;
3905           return false;
3906         } else {
3907 #ifdef GC_PROFILE
3908     gc_profileStart();
3909 #endif
3910 pregccheck:
3911           gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
3912           gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
3913           int sumsendobj = 0;
3914 #ifdef DEBUG
3915           BAMBOO_DEBUGPRINT(0xec04);
3916 #endif
3917           for(int i = 0; i < NUMCORESACTIVE; ++i) {
3918                 sumsendobj += gcnumsendobjs[0][i];
3919 #ifdef DEBUG
3920                 BAMBOO_DEBUGPRINT(0xf000 + gcnumsendobjs[0][i]);
3921 #endif
3922           }  // for(i = 1; i < NUMCORESACTIVE; ++i)
3923 #ifdef DEBUG
3924           BAMBOO_DEBUGPRINT(0xec05);
3925           BAMBOO_DEBUGPRINT_REG(sumsendobj);
3926 #endif
3927           for(int i = 0; i < NUMCORESACTIVE; ++i) {
3928                 sumsendobj -= gcnumreceiveobjs[0][i];
3929 #ifdef DEBUG
3930                 BAMBOO_DEBUGPRINT(0xf000 + gcnumreceiveobjs[i]);
3931 #endif
3932           }  // for(i = 1; i < NUMCORESACTIVE; ++i)
3933 #ifdef DEBUG
3934           BAMBOO_DEBUGPRINT(0xec06);
3935           BAMBOO_DEBUGPRINT_REG(sumsendobj);
3936 #endif
3937           if(0 != sumsendobj) {
3938                 // there were still some msgs on the fly, wait until there
3939                 // are some update pregc information coming and check it again
3940                 gcprecheck = false;
3941                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3942                 while(true) {
3943                   if(gcprecheck) {
3944                         break;
3945                   }
3946                 }
3947                 goto pregccheck;
3948           } else {
3949                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3950           }
3951         }
3952 #ifdef RAWPATH // TODO GC_DEBUG
3953     printf("(%x,%x) start gc! \n", udn_tile_coord_x(), udn_tile_coord_y());
3954 #endif
3955         // Zero out the remaining bamboo_cur_msp
3956         // Only zero out the first 4 bytes of the remaining memory
3957         // Move the operation here because for the GC_CACHE_ADAPT version,
3958         // we need to make sure during the gcinit phase the shared heap is not
3959         // touched. Otherwise, there would be problem when adapt the cache
3960         // strategy.
3961         if((bamboo_cur_msp != 0)
3962                 && (bamboo_smem_zero_top == bamboo_cur_msp)
3963                 && (bamboo_smem_size > 0)) {
3964           *((int *)bamboo_cur_msp) = 0;
3965         }
3966 #ifdef GC_FLUSH_DTLB
3967         if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
3968           BAMBOO_CLEAN_DTLB();
3969           gc_num_flush_dtlb++;
3970         }
3971 #endif
3972 #ifdef GC_CACHE_ADAPT
3973 #ifdef GC_CACHE_SAMPLING
3974     // get the sampling data
3975     bamboo_output_dtlb_sampling();
3976 #endif // GC_CACHE_SAMPLING
3977 #endif // GC_CACHE_ADAPT
3978         gcprocessing = true;
3979         gc_master(stackptr);
3980   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
3981         // Zero out the remaining bamboo_cur_msp
3982         // Only zero out the first 4 bytes of the remaining memory
3983         // Move the operation here because for the GC_CACHE_ADAPT version,
3984         // we need to make sure during the gcinit phase the shared heap is not
3985         // touched. Otherwise, there would be problem when adapt the cache
3986         // strategy.
3987         if((bamboo_cur_msp != 0)
3988                 && (bamboo_smem_zero_top == bamboo_cur_msp)
3989                 && (bamboo_smem_size > 0)) {
3990           *((int *)bamboo_cur_msp) = 0;
3991         }
3992 #ifdef GC_FLUSH_DTLB
3993         if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
3994           BAMBOO_CLEAN_DTLB();
3995           gc_num_flush_dtlb++;
3996         }
3997 #endif
3998 #ifdef GC_CACHE_ADAPT
3999 #ifdef GC_CACHE_SAMPLING
4000         if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
4001           // get the sampling data
4002           bamboo_output_dtlb_sampling();
4003         }
4004 #endif // GC_CACHE_SAMPLING
4005 #endif // GC_CACHE_ADAPT
4006     gcprocessing = true;
4007     gc_collect(stackptr);
4008
4009     // invalidate all shared mem pointers
4010     bamboo_cur_msp = NULL;
4011     bamboo_smem_size = 0;
4012         bamboo_smem_zero_top = NULL;
4013     gcflag = false;
4014     gcprocessing = false;
4015   } else {
4016         // Zero out the remaining bamboo_cur_msp
4017         // Only zero out the first 4 bytes of the remaining memory
4018         // Move the operation here because for the GC_CACHE_ADAPT version,
4019         // we need to make sure during the gcinit phase the shared heap is not
4020         // touched. Otherwise, there would be problem when adapt the cache
4021         // strategy.
4022         if((bamboo_cur_msp != 0)
4023                 && (bamboo_smem_zero_top == bamboo_cur_msp)
4024                 && (bamboo_smem_size > 0)) {
4025           *((int *)bamboo_cur_msp) = 0;
4026         }
4027 #ifdef GC_FLUSH_DTLB
4028         if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
4029           BAMBOO_CLEAN_DTLB();
4030           gc_num_flush_dtlb++;
4031         }
4032 #endif
4033 #ifdef GC_CACHE_ADAPT
4034 #ifdef GC_CACHE_SAMPLING
4035         if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
4036           // get the sampling data
4037           bamboo_output_dtlb_sampling();
4038         }
4039 #endif // GC_CACHE_SAMPLING
4040 #endif // GC_CACHE_ADAPT
4041     // not a gc core, should wait for gcfinish msg
4042     gcprocessing = true;
4043     gc_nocollect(stackptr);
4044
4045     // invalidate all shared mem pointers
4046     bamboo_cur_msp = NULL;
4047     bamboo_smem_size = 0;
4048     bamboo_smem_zero_top = NULL;
4049         gcflag = false;
4050     gcprocessing = false;
4051   }
4052 #ifdef GC_CACHE_ADAPT
4053 #ifdef GC_CACHE_SAMPLING
4054   // reset the sampling arrays
4055   bamboo_dtlb_sampling_reset();
4056 #endif // GC_CACHE_SAMPLING
4057   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
4058         // zero out the gccachesamplingtbl
4059         BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
4060         BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,
4061                 size_cachesamplingtbl_local_r);
4062         if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
4063           BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
4064         }
4065   }
4066 #ifdef GC_CACHE_SAMPLING
4067   // enable the timer interrupt
4068   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING);
4069   bamboo_unmask_timer_intr();
4070 #endif // GC_CACHE_SAMPLING
4071 #endif // GC_CACHE_ADAPT
4072   return true;
4073 } // void gc(struct garbagelist * stackptr)
4074
4075 #ifdef GC_PROFILE
4076 inline void gc_profileStart(void) {
4077   if(!gc_infoOverflow) {
4078     GCInfo* gcInfo = RUNMALLOC(sizeof(struct gc_info));
4079     gc_infoArray[gc_infoIndex] = gcInfo;
4080     gcInfo->index = 1;
4081     gcInfo->time[0] = BAMBOO_GET_EXE_TIME();
4082   }
4083 }
4084
4085 inline void gc_profileItem(void) {
4086   if(!gc_infoOverflow) {
4087     GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
4088     gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME();
4089   }
4090 }
4091
4092 inline void gc_profileEnd(void) {
4093   if(!gc_infoOverflow) {
4094     GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
4095     gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME();
4096         gcInfo->time[gcInfo->index++] = gc_num_livespace;
4097         gcInfo->time[gcInfo->index++] = gc_num_freespace;
4098         gcInfo->time[gcInfo->index++] = gc_num_lobj;
4099         gcInfo->time[gcInfo->index++] = gc_num_lobjspace;
4100         gcInfo->time[gcInfo->index++] = gc_num_obj;
4101         gcInfo->time[gcInfo->index++] = gc_num_liveobj;
4102         gcInfo->time[gcInfo->index++] = gc_num_forwardobj;
4103     gc_infoIndex++;
4104     if(gc_infoIndex == GCINFOLENGTH) {
4105       gc_infoOverflow = true;
4106       //taskInfoIndex = 0;
4107     }
4108   }
4109 }
4110
4111 // output the profiling data
4112 void gc_outputProfileData() {
4113   int i = 0;
4114   int j = 0;
4115   unsigned long long totalgc = 0;
4116
4117 #ifndef BAMBOO_MEMPROF
4118   BAMBOO_DEBUGPRINT(0xdddd);
4119 #endif
4120   // output task related info
4121   for(i= 0; i < gc_infoIndex; i++) {
4122     GCInfo * gcInfo = gc_infoArray[i];
4123 #ifdef BAMBOO_MEMPROF
4124     unsigned long long tmp=gcInfo->time[gcInfo->index-8]-gcInfo->time[0]; //0;
4125 #else
4126         unsigned long long tmp = 0;
4127     BAMBOO_DEBUGPRINT(0xddda);
4128     for(j = 0; j < gcInfo->index - 7; j++) {
4129       BAMBOO_DEBUGPRINT(gcInfo->time[j]);
4130       BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp);
4131       BAMBOO_DEBUGPRINT(0xdddb);
4132       tmp = gcInfo->time[j];
4133     }
4134     tmp = (tmp-gcInfo->time[0]);
4135     BAMBOO_DEBUGPRINT_REG(tmp);
4136         BAMBOO_DEBUGPRINT(0xdddc);
4137         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 7]);
4138         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 6]);
4139         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 5]);
4140         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 4]);
4141         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 3]);
4142         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 2]);
4143         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 1]);
4144     BAMBOO_DEBUGPRINT(0xddde);
4145 #endif
4146     totalgc += tmp;
4147   }
4148 #ifndef BAMBOO_MEMPROF
4149   BAMBOO_DEBUGPRINT(0xdddf);
4150 #endif
4151   BAMBOO_DEBUGPRINT_REG(totalgc);
4152
4153   if(gc_infoOverflow) {
4154     BAMBOO_DEBUGPRINT(0xefee);
4155   }
4156
4157 #ifndef BAMBOO_MEMPROF
4158   BAMBOO_DEBUGPRINT(0xeeee);
4159 #endif
4160 }
4161 #endif  // #ifdef GC_PROFILE
4162
4163 #endif