b5897919e609a498f463158cf7b7ab8ac4bd2ff5
[IRC.git] / Robust / src / Runtime / bamboo / multicoregarbage.c
1 #ifdef MULTICORE_GC
2 #include "runtime.h"
3 #include "multicoregarbage.h"
4 #include "multicoreruntime.h"
5 #include "runtime_arch.h"
6 #include "SimpleHash.h"
7 #include "GenericHashtable.h"
8 #include "ObjectHash.h"
9 #include "GCSharedHash.h"
10
11 extern int corenum;
12 extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
13 extern int numqueues[][NUMCLASSES];
14
15 extern struct genhashtable * activetasks;
16 extern struct parameterwrapper ** objectqueues[][NUMCLASSES];
17 extern struct taskparamdescriptor *currtpd;
18
19 extern struct LockValue runtime_locks[MAXTASKPARAMS];
20 extern int runtime_locklen;
21
22 #ifdef SMEMM
23 extern unsigned int gcmem_mixed_threshold;
24 extern unsigned int gcmem_mixed_usedmem;
25 #endif
26
27 struct pointerblock {
28   void * ptrs[NUMPTRS];
29   struct pointerblock *next;
30 };
31
32 struct pointerblock *gchead=NULL;
33 int gcheadindex=0;
34 struct pointerblock *gctail=NULL;
35 int gctailindex=0;
36 struct pointerblock *gctail2=NULL;
37 int gctailindex2=0;
38 struct pointerblock *gcspare=NULL;
39
40 #define NUMLOBJPTRS 20
41
42 struct lobjpointerblock {
43   void * lobjs[NUMLOBJPTRS];
44   //void * dsts[NUMLOBJPTRS];
45   int lengths[NUMLOBJPTRS];
46   //void * origs[NUMLOBJPTRS];
47   int hosts[NUMLOBJPTRS];
48   struct lobjpointerblock *next;
49   struct lobjpointerblock *prev;
50 };
51
52 struct lobjpointerblock *gclobjhead=NULL;
53 int gclobjheadindex=0;
54 struct lobjpointerblock *gclobjtail=NULL;
55 int gclobjtailindex=0;
56 struct lobjpointerblock *gclobjtail2=NULL;
57 int gclobjtailindex2=0;
58 struct lobjpointerblock *gclobjspare=NULL;
59
60 #ifdef GC_CACHE_ADAPT
61 typedef struct gc_cache_revise_info {
62   int orig_page_start_va;
63   int orig_page_end_va;
64   int orig_page_index;
65   int to_page_start_va;
66   int to_page_end_va;
67   int to_page_index;
68   int revised_sampling[NUMCORESACTIVE];
69 } gc_cache_revise_info_t;
70 gc_cache_revise_info_t gc_cache_revise_infomation;
71 #endif// GC_CACHE_ADAPT
72
73 #ifdef GC_DEBUG
74 // dump whole mem in blocks
75 inline void dumpSMem() {
76   int block = 0;
77   int sblock = 0;
78   int j = 0;
79   int i = 0;
80   int coren = 0;
81   int x = 0;
82   int y = 0;
83   printf("(%x,%x) Dump shared mem: \n", udn_tile_coord_x(), 
84              udn_tile_coord_y());
85   // reserved blocks for sblocktbl
86   printf("(%x,%x) ++++ reserved sblocks ++++ \n", udn_tile_coord_x(), 
87              udn_tile_coord_y());
88   for(i=BAMBOO_BASE_VA; i<gcbaseva; i+= 4*16) {
89     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
90                    udn_tile_coord_x(), udn_tile_coord_y(),
91            *((int *)(i)), *((int *)(i + 4)),
92            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
93            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
94            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
95            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
96            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
97            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
98            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
99   }
100   sblock = gcreservedsb;
101   bool advanceblock = false;
102   // remaining memory
103   for(i=gcbaseva; i<gcbaseva+BAMBOO_SHARED_MEM_SIZE; i+=4*16) {
104     advanceblock = false;
105     // computing sblock # and block #, core coordinate (x,y) also
106     if(j%((BAMBOO_SMEM_SIZE)/(4*16)) == 0) {
107       // finished a sblock
108       if(j < ((BAMBOO_LARGE_SMEM_BOUND)/(4*16))) {
109                 if((j > 0) && (j%((BAMBOO_SMEM_SIZE_L)/(4*16)) == 0)) {
110                   // finished a block
111                   block++;
112                   advanceblock = true;
113                 }
114       } else {
115                 // finished a block
116                 block++;
117                 advanceblock = true;
118       }
119       // compute core #
120       if(advanceblock) {
121                 coren = gc_block2core[block%(NUMCORES4GC*2)];
122       }
123       // compute core coordinate
124       BAMBOO_COORDS(coren, &x, &y);
125       printf("(%x,%x) ==== %d, %d : core (%d,%d), saddr %x====\n",
126                      udn_tile_coord_x(), udn_tile_coord_y(),
127              block, sblock++, x, y,
128              (sblock-1)*(BAMBOO_SMEM_SIZE)+gcbaseva);
129     }
130     j++;
131     printf("(%x,%x) 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x 0x%08x \n",
132                    udn_tile_coord_x(), udn_tile_coord_y(),
133            *((int *)(i)), *((int *)(i + 4)),
134            *((int *)(i + 4*2)), *((int *)(i + 4*3)),
135            *((int *)(i + 4*4)), *((int *)(i + 4*5)),
136            *((int *)(i + 4*6)), *((int *)(i + 4*7)),
137            *((int *)(i + 4*8)), *((int *)(i + 4*9)),
138            *((int *)(i + 4*10)), *((int *)(i + 4*11)),
139            *((int *)(i + 4*12)), *((int *)(i + 4*13)),
140            *((int *)(i + 4*14)), *((int *)(i + 4*15)));
141   }
142   printf("(%x,%x) \n", udn_tile_coord_x(), udn_tile_coord_y());
143 }
144 #endif
145
146 // should be invoked with interruption closed
147 inline void gc_enqueue_I(void *ptr) {
148 #ifdef DEBUG
149   BAMBOO_DEBUGPRINT(0xe601);
150   BAMBOO_DEBUGPRINT_REG(ptr);
151 #endif
152   if (gcheadindex==NUMPTRS) {
153     struct pointerblock * tmp;
154     if (gcspare!=NULL) {
155       tmp=gcspare;
156       gcspare=NULL;
157     } else {
158       tmp=RUNMALLOC_I(sizeof(struct pointerblock));
159     }             // if (gcspare!=NULL)
160     gchead->next=tmp;
161     gchead=tmp;
162     gcheadindex=0;
163   } // if (gcheadindex==NUMPTRS)
164   gchead->ptrs[gcheadindex++]=ptr;
165 #ifdef DEBUG
166   BAMBOO_DEBUGPRINT(0xe602);
167 #endif
168 } // void gc_enqueue_I(void *ptr)
169
170 // dequeue and destroy the queue
171 inline void * gc_dequeue_I() {
172   if (gctailindex==NUMPTRS) {
173     struct pointerblock *tmp=gctail;
174     gctail=gctail->next;
175     gctailindex=0;
176     if (gcspare!=NULL) {
177       RUNFREE(tmp);
178     } else {
179       gcspare=tmp;
180     }             // if (gcspare!=NULL)
181   } // if (gctailindex==NUMPTRS)
182   return gctail->ptrs[gctailindex++];
183 } // void * gc_dequeue()
184
185 // dequeue and do not destroy the queue
186 inline void * gc_dequeue2_I() {
187   if (gctailindex2==NUMPTRS) {
188     struct pointerblock *tmp=gctail2;
189     gctail2=gctail2->next;
190     gctailindex2=0;
191   } // if (gctailindex2==NUMPTRS)
192   return gctail2->ptrs[gctailindex2++];
193 } // void * gc_dequeue2()
194
195 inline int gc_moreItems_I() {
196   if ((gchead==gctail)&&(gctailindex==gcheadindex))
197     return 0;
198   return 1;
199 } // int gc_moreItems()
200
201 inline int gc_moreItems2_I() {
202   if ((gchead==gctail2)&&(gctailindex2==gcheadindex))
203     return 0;
204   return 1;
205 } // int gc_moreItems2()
206
207 // should be invoked with interruption closed
208 // enqueue a large obj: start addr & length
209 inline void gc_lobjenqueue_I(void *ptr,
210                              int length,
211                              int host) {
212 #ifdef DEBUG
213   BAMBOO_DEBUGPRINT(0xe901);
214 #endif
215   if (gclobjheadindex==NUMLOBJPTRS) {
216     struct lobjpointerblock * tmp;
217     if (gclobjspare!=NULL) {
218       tmp=gclobjspare;
219       gclobjspare=NULL;
220     } else {
221       tmp=RUNMALLOC_I(sizeof(struct lobjpointerblock));
222     }             // if (gclobjspare!=NULL)
223     gclobjhead->next=tmp;
224     tmp->prev = gclobjhead;
225     gclobjhead=tmp;
226     gclobjheadindex=0;
227   } // if (gclobjheadindex==NUMLOBJPTRS)
228   gclobjhead->lobjs[gclobjheadindex]=ptr;
229   gclobjhead->lengths[gclobjheadindex]=length;
230   gclobjhead->hosts[gclobjheadindex++]=host;
231 #ifdef DEBUG
232   BAMBOO_DEBUGPRINT_REG(gclobjhead->lobjs[gclobjheadindex-1]);
233   BAMBOO_DEBUGPRINT_REG(gclobjhead->lengths[gclobjheadindex-1]);
234   BAMBOO_DEBUGPRINT_REG(gclobjhead->hosts[gclobjheadindex-1]);
235 #endif
236 } // void gc_lobjenqueue_I(void *ptr...)
237
238 // dequeue and destroy the queue
239 inline void * gc_lobjdequeue_I(int * length,
240                                int * host) {
241   if (gclobjtailindex==NUMLOBJPTRS) {
242     struct lobjpointerblock *tmp=gclobjtail;
243     gclobjtail=gclobjtail->next;
244     gclobjtailindex=0;
245     gclobjtail->prev = NULL;
246     if (gclobjspare!=NULL) {
247       RUNFREE(tmp);
248     } else {
249       gclobjspare=tmp;
250       tmp->next = NULL;
251       tmp->prev = NULL;
252     }  // if (gclobjspare!=NULL)
253   } // if (gclobjtailindex==NUMLOBJPTRS)
254   if(length != NULL) {
255     *length = gclobjtail->lengths[gclobjtailindex];
256   }
257   if(host != NULL) {
258     *host = (int)(gclobjtail->hosts[gclobjtailindex]);
259   }
260   return gclobjtail->lobjs[gclobjtailindex++];
261 } // void * gc_lobjdequeue()
262
263 inline int gc_lobjmoreItems_I() {
264   if ((gclobjhead==gclobjtail)&&(gclobjtailindex==gclobjheadindex))
265     return 0;
266   return 1;
267 } // int gc_lobjmoreItems()
268
269 // dequeue and don't destroy the queue
270 inline void gc_lobjdequeue2_I() {
271   if (gclobjtailindex2==NUMLOBJPTRS) {
272     gclobjtail2=gclobjtail2->next;
273     gclobjtailindex2=1;
274   } else {
275     gclobjtailindex2++;
276   }      // if (gclobjtailindex2==NUMLOBJPTRS)
277 } // void * gc_lobjdequeue2()
278
279 inline int gc_lobjmoreItems2_I() {
280   if ((gclobjhead==gclobjtail2)&&(gclobjtailindex2==gclobjheadindex))
281     return 0;
282   return 1;
283 } // int gc_lobjmoreItems2()
284
285 // 'reversly' dequeue and don't destroy the queue
286 inline void gc_lobjdequeue3_I() {
287   if (gclobjtailindex2==0) {
288     gclobjtail2=gclobjtail2->prev;
289     gclobjtailindex2=NUMLOBJPTRS-1;
290   } else {
291     gclobjtailindex2--;
292   }      // if (gclobjtailindex2==NUMLOBJPTRS)
293 } // void * gc_lobjdequeue3()
294
295 inline int gc_lobjmoreItems3_I() {
296   if ((gclobjtail==gclobjtail2)&&(gclobjtailindex2==gclobjtailindex))
297     return 0;
298   return 1;
299 } // int gc_lobjmoreItems3()
300
301 inline void gc_lobjqueueinit4_I() {
302   gclobjtail2 = gclobjtail;
303   gclobjtailindex2 = gclobjtailindex;
304 } // void gc_lobjqueueinit2()
305
306 inline void * gc_lobjdequeue4_I(int * length,
307                                 int * host) {
308   if (gclobjtailindex2==NUMLOBJPTRS) {
309     gclobjtail2=gclobjtail2->next;
310     gclobjtailindex2=0;
311   } // if (gclobjtailindex==NUMLOBJPTRS)
312   if(length != NULL) {
313     *length = gclobjtail2->lengths[gclobjtailindex2];
314   }
315   if(host != NULL) {
316     *host = (int)(gclobjtail2->hosts[gclobjtailindex2]);
317   }
318   return gclobjtail2->lobjs[gclobjtailindex2++];
319 } // void * gc_lobjdequeue()
320
321 inline int gc_lobjmoreItems4_I() {
322   if ((gclobjhead==gclobjtail2)&&(gclobjtailindex2==gclobjheadindex))
323     return 0;
324   return 1;
325 } // int gc_lobjmoreItems(
326
327 INTPTR gccurr_heapbound = 0;
328
329 inline void gettype_size(void * ptr,
330                          int * ttype,
331                          int * tsize) {
332   int type = ((int *)ptr)[0];
333   int size = 0;
334   if(type < NUMCLASSES) {
335     // a normal object
336     size = classsize[type];
337   } else {
338     // an array
339     struct ArrayObject *ao=(struct ArrayObject *)ptr;
340     int elementsize=classsize[type];
341     int length=ao->___length___;
342     size=sizeof(struct ArrayObject)+length*elementsize;
343   }       // if(type < NUMCLASSES)
344   *ttype = type;
345   *tsize = size;
346 }
347
348 inline bool isLarge(void * ptr,
349                     int * ttype,
350                     int * tsize) {
351 #ifdef DEBUG
352   BAMBOO_DEBUGPRINT(0xe701);
353   BAMBOO_DEBUGPRINT_REG(ptr);
354 #endif
355   // check if a pointer is referring to a large object
356   gettype_size(ptr, ttype, tsize);
357 #ifdef DEBUG
358   BAMBOO_DEBUGPRINT(*tsize);
359 #endif
360   int bound = (BAMBOO_SMEM_SIZE);
361   if(((int)ptr-gcbaseva) < (BAMBOO_LARGE_SMEM_BOUND)) {
362     bound = (BAMBOO_SMEM_SIZE_L);
363   }
364   if((((int)ptr-gcbaseva)%(bound))==0) {
365     // ptr is a start of a block
366 #ifdef DEBUG
367     BAMBOO_DEBUGPRINT(0xe702);
368     BAMBOO_DEBUGPRINT(1);
369 #endif
370     return true;
371   }
372   if((bound-(((int)ptr-gcbaseva)%bound)) < (*tsize)) {
373     // it acrosses the boundary of current block
374 #ifdef DEBUG
375     BAMBOO_DEBUGPRINT(0xe703);
376     BAMBOO_DEBUGPRINT(1);
377 #endif
378     return true;
379   }
380 #ifdef DEBUG
381   BAMBOO_DEBUGPRINT(0);
382 #endif
383   return false;
384 } // bool isLarge(void * ptr, int * ttype, int * tsize)
385
386 inline int hostcore(void * ptr) {
387   // check the host core of ptr
388   int host = 0;
389   RESIDECORE(ptr, &host);
390 #ifdef DEBUG
391   BAMBOO_DEBUGPRINT(0xedd0);
392   BAMBOO_DEBUGPRINT_REG(ptr);
393   BAMBOO_DEBUGPRINT_REG(host);
394 #endif
395   return host;
396 } // int hostcore(void * ptr)
397
398 inline bool isLocal(void * ptr) {
399   // check if a pointer is in shared heap on this core
400   return hostcore(ptr) == BAMBOO_NUM_OF_CORE;
401 } // bool isLocal(void * ptr)
402
403 inline bool gc_checkCoreStatus_I() {
404   bool allStall = true;
405   for(int i = 0; i < NUMCORES4GC; ++i) {
406     if(gccorestatus[i] != 0) {
407       allStall = false;
408       break;
409     }             // if(gccorestatus[i] != 0)
410   }       // for(i = 0; i < NUMCORES4GC; ++i)
411   return allStall;
412 }
413
414 inline bool gc_checkAllCoreStatus_I() {
415   bool allStall = true;
416   for(int i = 0; i < NUMCORESACTIVE; ++i) {
417     if(gccorestatus[i] != 0) {
418       allStall = false;
419       break;
420     }             // if(gccorestatus[i] != 0)
421   }       // for(i = 0; i < NUMCORESACTIVE; ++i)
422   return allStall;
423 }
424
425 inline void checkMarkStatue() {
426 #ifdef DEBUG
427   BAMBOO_DEBUGPRINT(0xee01);
428 #endif
429   int i;
430   if((!waitconfirm) ||
431      (waitconfirm && (numconfirm == 0))) {
432 #ifdef DEBUG
433     BAMBOO_DEBUGPRINT(0xee02);
434 #endif
435         int entry_index = 0;
436         if(waitconfirm) {
437           // phase 2
438           entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
439         } else {
440           // phase 1
441           entry_index = gcnumsrobjs_index;
442         }
443     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
444     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
445     gcnumsendobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
446     gcnumreceiveobjs[entry_index][BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
447     // check the status of all cores
448     bool allStall = gc_checkAllCoreStatus_I();
449 #ifdef DEBUG
450     BAMBOO_DEBUGPRINT(0xee03);
451 #endif
452     if(allStall) {
453 #ifdef DEBUG
454       BAMBOO_DEBUGPRINT(0xee04);
455 #endif
456       // ask for confirm
457       if(!waitconfirm) {
458 #ifdef DEBUG
459                 BAMBOO_DEBUGPRINT(0xee05);
460 #endif
461                 // the first time found all cores stall
462                 // send out status confirm msg to all other cores
463                 // reset the corestatus array too
464                 gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
465                 waitconfirm = true;
466                 numconfirm = NUMCORESACTIVE - 1;
467                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
468                 for(i = 1; i < NUMCORESACTIVE; ++i) {
469                   gccorestatus[i] = 1;
470                   // send mark phase finish confirm request msg to core i
471                   send_msg_1(i, GCMARKCONFIRM, false);
472                 }  // for(i = 1; i < NUMCORESACTIVE; ++i)
473       } else {
474                 // Phase 2
475                 // check if the sum of send objs and receive obj are the same
476                 // yes->check if the info is the latest; no->go on executing
477                 int sumsendobj = 0;
478                 for(i = 0; i < NUMCORESACTIVE; ++i) {
479                   sumsendobj += gcnumsendobjs[gcnumsrobjs_index][i];
480                 }  // for(i = 0; i < NUMCORESACTIVE; ++i)
481 #ifdef DEBUG
482                 BAMBOO_DEBUGPRINT(0xee06);
483                 BAMBOO_DEBUGPRINT_REG(sumsendobj);
484 #endif
485                 for(i = 0; i < NUMCORESACTIVE; ++i) {
486                   sumsendobj -= gcnumreceiveobjs[gcnumsrobjs_index][i];
487                 }  // for(i = 0; i < NUMCORESACTIVE; ++i)
488 #ifdef DEBUG
489                 BAMBOO_DEBUGPRINT(0xee07);
490                 BAMBOO_DEBUGPRINT_REG(sumsendobj);
491 #endif
492                 if(0 == sumsendobj) {
493                   // Check if there are changes of the numsendobjs or numreceiveobjs on
494                   // each core
495                   bool ischanged = false;
496                   for(i = 0; i < NUMCORESACTIVE; ++i) {
497                         if((gcnumsendobjs[0][i] != gcnumsendobjs[1][i]) || 
498                                 (gcnumreceiveobjs[0][i] != gcnumreceiveobjs[1][i]) ) {
499                           ischanged = true;
500                           break;
501                         }
502                   }  // for(i = 0; i < NUMCORESACTIVE; ++i)
503 #ifdef DEBUG
504                   BAMBOO_DEBUGPRINT(0xee08);
505                   BAMBOO_DEBUGPRINT_REG(ischanged);
506 #endif
507                   if(!ischanged) {
508 #ifdef DEBUG
509                         BAMBOO_DEBUGPRINT(0xee09);
510 #endif
511                         // all the core status info are the latest
512                         // stop mark phase
513                         gcphase = COMPACTPHASE;
514                         // restore the gcstatus for all cores
515                         for(i = 0; i < NUMCORESACTIVE; ++i) {
516                           gccorestatus[i] = 1;
517                         }  // for(i = 0; i < NUMCORESACTIVE; ++i)
518                   } else {
519                         waitconfirm = false;
520                         gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
521                   } // if(!ischanged)
522                 } else {
523                   // There were changes between phase 1 and phase 2, can not decide 
524                   // whether the mark phase has been finished
525                   waitconfirm = false;
526                   // As it fails in phase 2, flip the entries
527                   gcnumsrobjs_index = (gcnumsrobjs_index == 0) ? 1 : 0;
528                 } // if(0 == sumsendobj) else ...
529                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
530       } // if(!gcwaitconfirm) else()
531     } else {
532       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
533     } // if(allStall)
534   }  // if((!waitconfirm)...
535 #ifdef DEBUG
536   BAMBOO_DEBUGPRINT(0xee0a);
537 #endif
538 } // void checkMarkStatue()
539 /*
540 inline bool preGC() {
541   // preparation for gc
542   // make sure to clear all incoming msgs espacially transfer obj msgs
543 #ifdef DEBUG
544   BAMBOO_DEBUGPRINT(0xec01);
545 #endif
546   int i;
547   if((!waitconfirm) ||
548      (waitconfirm && (numconfirm == 0))) {
549     // send out status confirm msgs to all cores to check if there are
550     // transfer obj msgs on-the-fly
551     waitconfirm = true;
552     numconfirm = NUMCORESACTIVE - 1;
553     for(i = 1; i < NUMCORESACTIVE; ++i) {
554       corestatus[i] = 1;
555       // send status confirm msg to core i
556       send_msg_1(i, STATUSCONFIRM, false);
557     }   // for(i = 1; i < NUMCORESACTIVE; ++i)
558
559 #ifdef DEBUG
560     BAMBOO_DEBUGPRINT(0xec02);
561 #endif
562     while(true) {
563       if(numconfirm == 0) {
564                 break;
565       }
566     }   // wait for confirmations
567     waitconfirm = false;
568     numconfirm = 0;
569 #ifdef DEBUG
570     BAMBOO_DEBUGPRINT(0xec03);
571 #endif
572     numsendobjs[BAMBOO_NUM_OF_CORE] = self_numsendobjs;
573     numreceiveobjs[BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
574     int sumsendobj = 0;
575 #ifdef DEBUG
576     BAMBOO_DEBUGPRINT(0xec04);
577 #endif
578     for(i = 0; i < NUMCORESACTIVE; ++i) {
579       sumsendobj += numsendobjs[i];
580 #ifdef DEBUG
581       BAMBOO_DEBUGPRINT(0xf000 + numsendobjs[i]);
582 #endif
583     }             // for(i = 1; i < NUMCORESACTIVE; ++i)
584 #ifdef DEBUG
585     BAMBOO_DEBUGPRINT(0xec05);
586     BAMBOO_DEBUGPRINT_REG(sumsendobj);
587 #endif
588     for(i = 0; i < NUMCORESACTIVE; ++i) {
589       sumsendobj -= numreceiveobjs[i];
590 #ifdef DEBUG
591       BAMBOO_DEBUGPRINT(0xf000 + numreceiveobjs[i]);
592 #endif
593     }             // for(i = 1; i < NUMCORESACTIVE; ++i)
594 #ifdef DEBUG
595     BAMBOO_DEBUGPRINT(0xec06);
596     BAMBOO_DEBUGPRINT_REG(sumsendobj);
597 #endif
598     if(0 == sumsendobj) {
599       return true;
600     } else {
601       // still have some transfer obj msgs on-the-fly, can not start gc
602       return false;
603     }  // if(0 == sumsendobj)
604   } else {
605 #ifdef DEBUG
606     BAMBOO_DEBUGPRINT(0xec07);
607 #endif
608     // previously asked for status confirmation and do not have all the
609     // confirmations yet, can not start gc
610     return false;
611   }       // if((!waitconfirm) ||
612 } // bool preGC()*/
613
614 inline void initGC() {
615   int i;
616   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
617     for(i = 0; i < NUMCORES4GC; ++i) {
618       gccorestatus[i] = 1;
619       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
620       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
621       gcloads[i] = 0;
622       gcrequiredmems[i] = 0;
623       gcfilledblocks[i] = 0;
624       gcstopblock[i] = 0;
625     } // for(i = 0; i < NUMCORES4GC; ++i)
626     for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
627       gccorestatus[i] = 1;
628       gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
629       gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
630     }
631     gcheaptop = 0;
632     gctopcore = 0;
633     gctopblock = 0;
634   } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
635   gcself_numsendobjs = 0;
636   gcself_numreceiveobjs = 0;
637   gcmarkedptrbound = 0;
638   gcobj2map = 0;
639   gcmappedobj = 0;
640   //gcismapped = false;
641   gcnumlobjs = 0;
642   gcmovestartaddr = 0;
643   gctomove = false;
644   gcblock2fill = 0;
645   gcmovepending = 0;
646   gccurr_heaptop = 0;
647   gcdstcore = 0;
648
649   // initialize queue
650   if (gchead==NULL) {
651     gcheadindex=gctailindex=gctailindex2 = 0;
652     gchead=gctail=gctail2=RUNMALLOC(sizeof(struct pointerblock));
653   } else {
654     gctailindex = gctailindex2 = gcheadindex;
655     gctail = gctail2 = gchead;
656   }
657
658   // initialize the large obj queues
659   if (gclobjhead==NULL) {
660     gclobjheadindex=0;
661     gclobjtailindex=0;
662     gclobjtailindex2 = 0;
663     gclobjhead=gclobjtail=gclobjtail2=
664           RUNMALLOC(sizeof(struct lobjpointerblock));
665   } else {
666     gclobjtailindex = gclobjtailindex2 = gclobjheadindex = 0;
667     gclobjtail = gclobjtail2 = gclobjhead;
668   }
669   gclobjhead->next = gclobjhead->prev = NULL;
670
671 #ifdef LOCALHASHTBL_TEST
672   freeRuntimeHash(gcpointertbl);
673   gcpointertbl = allocateRuntimeHash(20);
674 #else
675   mgchashreset(gcpointertbl);
676 #endif
677   //gcpointertbl = allocateMGCHash(20);
678
679   freeMGCHash(gcforwardobjtbl);
680   gcforwardobjtbl = allocateMGCHash(20, 3);
681
682   // initialize the mapping info related structures
683   if((BAMBOO_NUM_OF_CORE < NUMCORES4GC) && (gcsharedptbl != NULL)) {
684         // Never free the shared hash table, just reset it
685         /*freeGCSharedHash(gcsharedptbl);
686         gcsharedptbl = allocateGCSharedHash(20);*/
687         mgcsharedhashReset(gcsharedptbl);
688   }
689   // Zero out the remaining bamboo_cur_msp 
690   // Only zero out the first 4 bytes of the remaining memory
691   /*if((bamboo_cur_msp != 0) 
692           && (bamboo_smem_zero_top == bamboo_cur_msp) 
693           && (bamboo_smem_size > 0)) {
694         *((int *)bamboo_cur_msp) = 0;
695   }*/
696 #ifdef GC_PROFILE
697   gc_num_livespace = 0;
698   gc_num_freespace = 0;
699   gc_num_lobj = 0;
700   gc_num_lobjspace = 0;
701   gc_num_liveobj = 0;
702   gc_num_forwardobj = 0;
703   gc_num_profiles = NUMCORESACTIVE - 1;
704 #endif
705 } // void initGC()
706
707 // compute load balance for all cores
708 inline int loadbalance(int * heaptop) {
709   // compute load balance
710   int i;
711
712   // get the total loads
713   int tloads = gcloads[STARTUPCORE];
714   for(i = 1; i < NUMCORES4GC; i++) {
715     tloads += gcloads[i];
716   }
717   *heaptop = gcbaseva + tloads;
718
719 #ifdef DEBUG
720   BAMBOO_DEBUGPRINT(0xdddd);
721   BAMBOO_DEBUGPRINT_REG(tloads);
722   BAMBOO_DEBUGPRINT_REG(*heaptop);
723 #endif
724   int b = 0;
725   BLOCKINDEX(*heaptop, &b);
726   int numbpc = b / NUMCORES4GC;       // num of blocks per core
727 #ifdef DEBUG
728   BAMBOO_DEBUGPRINT_REG(b);
729   BAMBOO_DEBUGPRINT_REG(numbpc);
730 #endif
731   gctopblock = b;
732   RESIDECORE(heaptop, &gctopcore);
733 #ifdef DEBUG
734   BAMBOO_DEBUGPRINT_REG(gctopcore);
735 #endif
736   return numbpc;
737 } // void loadbalance(int * heaptop)
738
739 inline bool cacheLObjs() {
740   // check the total mem size need for large objs
741   unsigned long long sumsize = 0;
742   int size = 0;
743 #ifdef DEBUG
744   BAMBOO_DEBUGPRINT(0xe801);
745 #endif
746   gclobjtail2 = gclobjtail;
747   gclobjtailindex2 = gclobjtailindex;
748   int tmp_lobj = 0;
749   int tmp_len = 0;
750   int tmp_host = 0;
751   // compute total mem size required and sort the lobjs in ascending order
752   while(gc_lobjmoreItems2_I()) {
753     gc_lobjdequeue2_I();
754     tmp_lobj = gclobjtail2->lobjs[gclobjtailindex2-1];
755     tmp_host = gclobjtail2->hosts[gclobjtailindex2-1];
756     tmp_len = gclobjtail2->lengths[gclobjtailindex2 - 1];
757     sumsize += tmp_len;
758 #ifdef GC_PROFILE
759         gc_num_lobj++;
760 #endif
761 #ifdef DEBUG
762     BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2-1]);
763     BAMBOO_DEBUGPRINT_REG(tmp_len);
764     BAMBOO_DEBUGPRINT_REG(sumsize);
765 #endif
766     int i = gclobjtailindex2-1;
767     struct lobjpointerblock * tmp_block = gclobjtail2;
768     // find the place to insert
769     while(true) {
770       if(i == 0) {
771                 if(tmp_block->prev == NULL) {
772                   break;
773                 }
774                 if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] > tmp_lobj) {
775                   tmp_block->lobjs[i] = tmp_block->prev->lobjs[NUMLOBJPTRS-1];
776                   tmp_block->lengths[i] = tmp_block->prev->lengths[NUMLOBJPTRS-1];
777                   tmp_block->hosts[i] = tmp_block->prev->hosts[NUMLOBJPTRS-1];
778                   tmp_block = tmp_block->prev;
779                   i = NUMLOBJPTRS-1;
780                 } else {
781                   break;
782                 }  // if(tmp_block->prev->lobjs[NUMLOBJPTRS-1] < tmp_lobj)
783           } else {
784                 if(tmp_block->lobjs[i-1] > tmp_lobj) {
785                   tmp_block->lobjs[i] = tmp_block->lobjs[i-1];
786                   tmp_block->lengths[i] = tmp_block->lengths[i-1];
787                   tmp_block->hosts[i] = tmp_block->hosts[i-1];
788                   i--;
789                 } else {
790                   break;
791                 }  // if(tmp_block->lobjs[i-1] < tmp_lobj)
792       }  // if(i ==0 ) else {}
793     }   // while(true)
794     // insert it
795     if(i != gclobjtailindex2 - 1) {
796       tmp_block->lobjs[i] = tmp_lobj;
797       tmp_block->lengths[i] = tmp_len;
798       tmp_block->hosts[i] = tmp_host;
799     }
800   }  // while(gc_lobjmoreItems2())
801
802 #ifdef GC_PROFILE
803   gc_num_lobjspace = sumsize;
804 #endif
805   // check if there are enough space to cache these large objs
806   INTPTR dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -sumsize;
807   if((unsigned long long)gcheaptop > (unsigned long long)dst) {
808     // do not have enough room to cache large objs
809 #ifdef DEBUG
810     BAMBOO_DEBUGPRINT(0xe802);
811     BAMBOO_DEBUGPRINT_REG(dst);
812     BAMBOO_DEBUGPRINT_REG(gcheaptop);
813         BAMBOO_DEBUGPRINT_REG(sumsize);
814 #endif
815     return false;
816   }
817 #ifdef DEBUG
818   BAMBOO_DEBUGPRINT(0xe803);
819   BAMBOO_DEBUGPRINT_REG(dst);
820   BAMBOO_DEBUGPRINT_REG(gcheaptop);
821 #endif
822
823   gcheaptop = dst; // Note: record the start of cached lobjs with gcheaptop
824   // cache the largeObjs to the top of the shared heap
825   //gclobjtail2 = gclobjtail;
826   //gclobjtailindex2 = gclobjtailindex;
827   dst = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
828   while(gc_lobjmoreItems3_I()) {
829     gc_lobjdequeue3_I();
830     size = gclobjtail2->lengths[gclobjtailindex2];
831     // set the mark field to , indicating that this obj has been moved
832     // and need to be flushed
833     ((int *)(gclobjtail2->lobjs[gclobjtailindex2]))[6] = COMPACTED;
834     dst -= size;
835     if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) {
836       memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
837     } else {
838       //BAMBOO_WRITE_HINT_CACHE(dst, size);
839       memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
840     }
841 #ifdef DEBUG
842     BAMBOO_DEBUGPRINT(0x804);
843     BAMBOO_DEBUGPRINT_REG(gclobjtail2->lobjs[gclobjtailindex2]);
844     BAMBOO_DEBUGPRINT(dst);
845     BAMBOO_DEBUGPRINT_REG(size);
846     BAMBOO_DEBUGPRINT_REG(*((int*)gclobjtail2->lobjs[gclobjtailindex2]));
847     BAMBOO_DEBUGPRINT_REG(*((int*)(dst)));
848 #endif
849   }
850   return true;
851 } // void cacheLObjs()
852
853 // update the bmmboo_smemtbl to record current shared mem usage
854 void updateSmemTbl(int coren,
855                    int localtop) {
856   int ltopcore = 0;
857   int bound = BAMBOO_SMEM_SIZE_L;
858   BLOCKINDEX(localtop, &ltopcore);
859   if(localtop >= (gcbaseva+(BAMBOO_LARGE_SMEM_BOUND))) {
860     bound = BAMBOO_SMEM_SIZE;
861   }
862   int load = (localtop-gcbaseva)%bound;
863   int i = 0;
864   int j = 0;
865   int toset = 0;
866   do {
867     toset = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
868     if(toset < ltopcore) {
869       bamboo_smemtbl[toset]=
870         (toset<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
871 #ifdef SMEMM
872           gcmem_mixed_usedmem += bamboo_smemtbl[toset];
873 #endif
874     } else if(toset == ltopcore) {
875       bamboo_smemtbl[toset] = load;
876 #ifdef SMEMM
877           gcmem_mixed_usedmem += bamboo_smemtbl[toset];
878 #endif
879       break;
880     } else {
881       break;
882     }
883     i++;
884     if(i == 2) {
885       i = 0;
886       j++;
887     }
888   } while(true);
889 } // void updateSmemTbl(int, int)
890
891 inline void moveLObjs() {
892 #ifdef DEBUG
893   BAMBOO_DEBUGPRINT(0xea01);
894 #endif
895 #ifdef SMEMM
896   // update the gcmem_mixed_usedmem
897   gcmem_mixed_usedmem = 0;
898 #endif
899   // zero out the smemtbl
900   BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
901   // find current heap top
902   // flush all gcloads to indicate the real heap top on one core
903   // previous it represents the next available ptr on a core
904   if((gcloads[0] > (gcbaseva+(BAMBOO_SMEM_SIZE_L)))
905      && ((gcloads[0]%(BAMBOO_SMEM_SIZE)) == 0)) {
906     // edge of a block, check if this is exactly the heaptop
907     BASEPTR(0, gcfilledblocks[0]-1, &(gcloads[0]));
908     gcloads[0]+=(gcfilledblocks[0]>1 ?
909                  (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
910   }
911   updateSmemTbl(0, gcloads[0]);
912 #ifdef DEBUG
913   BAMBOO_DEBUGPRINT(0xea02);
914   BAMBOO_DEBUGPRINT_REG(gcloads[0]);
915   BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]);
916 #endif
917   for(int i = 1; i < NUMCORES4GC; i++) {
918     int tmptop = 0;
919 #ifdef DEBUG
920     BAMBOO_DEBUGPRINT(0xf000+i);
921     BAMBOO_DEBUGPRINT_REG(gcloads[i]);
922     BAMBOO_DEBUGPRINT_REG(gcfilledblocks[i]);
923 #endif
924     if((gcfilledblocks[i] > 0)
925        && ((gcloads[i] % (BAMBOO_SMEM_SIZE)) == 0)) {
926       // edge of a block, check if this is exactly the heaptop
927       BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
928       gcloads[i] += 
929                 (gcfilledblocks[i]>1 ? (BAMBOO_SMEM_SIZE) : (BAMBOO_SMEM_SIZE_L));
930       tmptop = gcloads[i];
931     }
932     updateSmemTbl(i, gcloads[i]);
933 #ifdef DEBUG
934     BAMBOO_DEBUGPRINT_REG(gcloads[i]);
935 #endif
936   } // for(int i = 1; i < NUMCORES4GC; i++) {
937
938   // find current heap top
939   // TODO
940   // a bug here: when using local allocation, directly move large objects
941   // to the highest free chunk might not be memory efficient
942   int tmpheaptop = 0;
943   int size = 0;
944   int bound = 0;
945   int i = 0;
946   for(i = gcnumblock-1; i >= 0; i--) {
947     if(bamboo_smemtbl[i] > 0) {
948       break;
949     }
950   }
951   if(i == -1) {
952     tmpheaptop = gcbaseva;
953   } else {
954     tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC) ?
955                 (BAMBOO_SMEM_SIZE_L*i) :
956         (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
957   }
958
959   // move large objs from gcheaptop to tmpheaptop
960   // write the header first
961   unsigned int tomove = gcbaseva + (BAMBOO_SHARED_MEM_SIZE) -gcheaptop;
962 #ifdef SMEMM
963   gcmem_mixed_usedmem += tomove;
964 #endif
965 #ifdef DEBUG
966   BAMBOO_DEBUGPRINT(0xea03);
967   BAMBOO_DEBUGPRINT_REG(tomove);
968   BAMBOO_DEBUGPRINT_REG(tmpheaptop);
969   BAMBOO_DEBUGPRINT_REG(gcheaptop);
970 #endif
971   // flush the sbstartbl
972   BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0',
973           (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-gcreservedsb)*sizeof(INTPTR));
974   if(tomove == 0) {
975     gcheaptop = tmpheaptop;
976   } else {
977     // check how many blocks it acrosses
978     int remain = tmpheaptop-gcbaseva;
979     int sb = remain/(BAMBOO_SMEM_SIZE) + gcreservedsb;//number of the sblock
980     int b = 0;  // number of the block
981     BLOCKINDEX(tmpheaptop, &b);
982     // check the remaining space in this block
983     bound = (BAMBOO_SMEM_SIZE);
984     if(remain < (BAMBOO_LARGE_SMEM_BOUND)) {
985       bound = (BAMBOO_SMEM_SIZE_L);
986     }
987     remain = bound - remain%bound;
988
989 #ifdef DEBUG
990     BAMBOO_DEBUGPRINT(0xea04);
991 #endif
992     size = 0;
993     int isize = 0;
994     int host = 0;
995     int ptr = 0;
996     int base = tmpheaptop;
997     int cpysize = 0;
998     remain -= BAMBOO_CACHE_LINE_SIZE;
999     tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
1000     gc_lobjqueueinit4_I();
1001     while(gc_lobjmoreItems4_I()) {
1002       ptr = (int)(gc_lobjdequeue4_I(&size, &host));
1003       ALIGNSIZE(size, &isize);
1004       if(remain < isize) {
1005                 // this object acrosses blocks
1006                 if(cpysize > 0) {
1007                   // close current block, fill its header
1008                   BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
1009                   *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
1010                   bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE;//add the size of header
1011                   cpysize = 0;
1012                   base = tmpheaptop;
1013                   if(remain == 0) {
1014                         remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
1015                                          BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
1016                   }
1017                   remain -= BAMBOO_CACHE_LINE_SIZE;
1018                   tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
1019                   BLOCKINDEX(tmpheaptop, &b);
1020                   sb = (tmpheaptop-gcbaseva)/(BAMBOO_SMEM_SIZE) + gcreservedsb;
1021                 }  // if(cpysize > 0)
1022
1023                 // move the large obj
1024                 if((int)gcheaptop < (int)(tmpheaptop)+size) {
1025                   memmove(tmpheaptop, gcheaptop, size);
1026                 } else {
1027                   //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
1028                   memcpy(tmpheaptop, gcheaptop, size);
1029                 }
1030                 // fill the remaining space with -2 padding
1031                 BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
1032 #ifdef DEBUG
1033                 BAMBOO_DEBUGPRINT(0xea05);
1034                 BAMBOO_DEBUGPRINT_REG(gcheaptop);
1035                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1036                 BAMBOO_DEBUGPRINT_REG(size);
1037                 BAMBOO_DEBUGPRINT_REG(isize);
1038                 BAMBOO_DEBUGPRINT_REG(base);
1039 #endif
1040                 gcheaptop += size;
1041                 // cache the mapping info anyway
1042                 //if(ptr != tmpheaptop) {
1043                 BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1044 #ifdef LOCALHASHTBL_TEST
1045                 RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
1046 #else
1047                 mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
1048 #endif
1049                 //MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
1050                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1051                 //}
1052 #ifdef DEBUG
1053                 BAMBOO_DEBUGPRINT(0xcdca);
1054                 BAMBOO_DEBUGPRINT_REG(ptr);
1055                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1056 #endif
1057                 if(host != BAMBOO_NUM_OF_CORE) {
1058                   // send the original host core with the mapping info
1059                   send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
1060 #ifdef DEBUG
1061                   BAMBOO_DEBUGPRINT(0xcdcb);
1062                   BAMBOO_DEBUGPRINT_REG(ptr);
1063                   BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1064 #endif
1065                 } // if(host != BAMBOO_NUM_OF_CORE)
1066                 tmpheaptop += isize;
1067
1068                 // set the gcsbstarttbl and bamboo_smemtbl
1069                 int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
1070                 for(int k = 1; k < tmpsbs; k++) {
1071                   gcsbstarttbl[sb+k] = (INTPTR)(-1);
1072                 }
1073                 sb += tmpsbs;
1074                 bound = (b<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
1075                 BLOCKINDEX(tmpheaptop-1, &tmpsbs);
1076                 for(; b < tmpsbs; b++) {
1077                   bamboo_smemtbl[b] = bound;
1078                   if(b==NUMCORES4GC-1) {
1079                         bound = BAMBOO_SMEM_SIZE;
1080                   }
1081                 }
1082                 if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) {
1083                   gcsbstarttbl[sb] = (INTPTR)(-1);
1084                   remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ?
1085                                    BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
1086                   bamboo_smemtbl[b] = bound;
1087                 } else {
1088                   gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
1089                   remain = tmpheaptop-gcbaseva;
1090                   bamboo_smemtbl[b] = remain%bound;
1091                   remain = bound - bamboo_smemtbl[b];
1092                 } // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
1093
1094                 // close current block and fill the header
1095                 BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
1096                 *((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
1097                 cpysize = 0;
1098                 base = tmpheaptop;
1099                 if(remain == BAMBOO_CACHE_LINE_SIZE) {
1100                   // fill with 0 in case
1101                   BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
1102                 }
1103                 remain -= BAMBOO_CACHE_LINE_SIZE;
1104                 tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
1105       } else {
1106                 remain -= isize;
1107                 // move the large obj
1108                 if((int)gcheaptop < (int)(tmpheaptop)+size) {
1109                   memmove(tmpheaptop, gcheaptop, size);
1110                 } else {
1111                   //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
1112                   memcpy(tmpheaptop, gcheaptop, size);
1113                 }
1114                 // fill the remaining space with -2 padding
1115                 BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
1116 #ifdef DEBUG
1117                 BAMBOO_DEBUGPRINT(0xea06);
1118                 BAMBOO_DEBUGPRINT_REG(gcheaptop);
1119                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1120                 BAMBOO_DEBUGPRINT_REG(size);
1121                 BAMBOO_DEBUGPRINT_REG(isize);
1122 #endif
1123
1124                 gcheaptop += size;
1125                 cpysize += isize;
1126                 // cache the mapping info anyway
1127                 //if(ptr != tmpheaptop) {
1128                 BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1129 #ifdef LOCALHASHTBL_TEST
1130                 RuntimeHashadd_I(gcpointertbl, ptr, tmpheaptop);
1131 #else
1132                 mgchashInsert_I(gcpointertbl, ptr, tmpheaptop);
1133 #endif
1134                 //MGCHashadd_I(gcpointertbl, ptr, tmpheaptop);
1135                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1136                 //}
1137 #ifdef DEBUG
1138                 BAMBOO_DEBUGPRINT(0xcdcc);
1139                 BAMBOO_DEBUGPRINT_REG(ptr);
1140                 BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1141                 BAMBOO_DEBUGPRINT_REG(*((int*)tmpheaptop));
1142 #endif
1143                 if(host != BAMBOO_NUM_OF_CORE) {
1144                   // send the original host core with the mapping info
1145                   send_msg_3(host, GCLOBJMAPPING, ptr, tmpheaptop, false);
1146 #ifdef DEBUG
1147                   BAMBOO_DEBUGPRINT(0xcdcd);
1148                   BAMBOO_DEBUGPRINT_REG(ptr);
1149                   BAMBOO_DEBUGPRINT_REG(tmpheaptop);
1150 #endif
1151                 }                         // if(host != BAMBOO_NUM_OF_CORE)
1152                 tmpheaptop += isize;
1153
1154                 // update bamboo_smemtbl
1155                 bamboo_smemtbl[b] += isize;
1156           }  // if(remain < isize) else ...
1157     }  // while(gc_lobjmoreItems())
1158     if(cpysize > 0) {
1159       // close current block, fill the header
1160       BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
1161       *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
1162       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE;// add the size of the header
1163     } else {
1164       tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
1165     }
1166     gcheaptop = tmpheaptop;
1167
1168   } // if(tomove == 0)
1169
1170 #ifdef DEBUG
1171   BAMBOO_DEBUGPRINT(0xea07);
1172   BAMBOO_DEBUGPRINT_REG(gcheaptop);
1173 #endif
1174
1175   bamboo_free_block = 0;
1176   int tbound = 0;
1177   do {
1178     tbound = (bamboo_free_block<NUMCORES4GC) ?
1179              BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
1180     if(bamboo_smemtbl[bamboo_free_block] == tbound) {
1181       bamboo_free_block++;
1182     } else {
1183       // the first non-full partition
1184       break;
1185     }
1186   } while(true);
1187
1188 #ifdef GC_PROFILE
1189   // check how many live space there are
1190   gc_num_livespace = 0;
1191   for(int tmpi = 0; tmpi < gcnumblock; tmpi++) {
1192         gc_num_livespace += bamboo_smemtbl[tmpi];
1193   }
1194   gc_num_freespace = (BAMBOO_SHARED_MEM_SIZE) - gc_num_livespace;
1195 #endif
1196 #ifdef DEBUG
1197   BAMBOO_DEBUGPRINT(0xea08);
1198   BAMBOO_DEBUGPRINT_REG(gcheaptop);
1199 #endif
1200 } // void moveLObjs()
1201
1202 inline void markObj(void * objptr) {
1203   if(objptr == NULL) {
1204     return;
1205   }
1206   if(ISSHAREDOBJ(objptr)) {
1207     int host = hostcore(objptr);
1208     if(BAMBOO_NUM_OF_CORE == host) {
1209       // on this core
1210       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1211       if(((int *)objptr)[6] == INIT) {
1212                 // this is the first time that this object is discovered,
1213                 // set the flag as DISCOVERED
1214                 ((int *)objptr)[6] |= DISCOVERED;
1215                 BAMBOO_CACHE_FLUSH_LINE(objptr);
1216                 gc_enqueue_I(objptr);
1217           }
1218       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1219     } else {
1220 #ifdef DEBUG
1221       BAMBOO_DEBUGPRINT(0xbbbb);
1222       BAMBOO_DEBUGPRINT_REG(host);
1223       BAMBOO_DEBUGPRINT_REG(objptr);
1224 #endif
1225       // check if this obj has been forwarded
1226       if(!MGCHashcontains(gcforwardobjtbl, (int)objptr)) {
1227                 // send a msg to host informing that objptr is active
1228                 send_msg_2(host, GCMARKEDOBJ, objptr, /*BAMBOO_NUM_OF_CORE,*/ false);
1229 #ifdef GC_PROFILE
1230                 gc_num_forwardobj++;
1231 #endif // GC_PROFILE
1232                 gcself_numsendobjs++;
1233                 MGCHashadd(gcforwardobjtbl, (int)objptr);
1234       }
1235     }
1236   } else {
1237     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1238     gc_enqueue_I(objptr);
1239     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1240   }       // if(ISSHAREDOBJ(objptr))
1241 } // void markObj(void * objptr)
1242
1243 // enqueue root objs
1244 inline void tomark(struct garbagelist * stackptr) {
1245   if(MARKPHASE != gcphase) {
1246 #ifdef DEBUG
1247     BAMBOO_DEBUGPRINT_REG(gcphase);
1248 #endif
1249     BAMBOO_EXIT(0xb101);
1250   }
1251   gcbusystatus = true;
1252   gcnumlobjs = 0;
1253
1254   int i,j;
1255   // enqueue current stack
1256   while(stackptr!=NULL) {
1257 #ifdef DEBUG
1258     BAMBOO_DEBUGPRINT(0xe501);
1259     BAMBOO_DEBUGPRINT_REG(stackptr->size);
1260     BAMBOO_DEBUGPRINT_REG(stackptr->next);
1261     BAMBOO_DEBUGPRINT_REG(stackptr->array[0]);
1262 #endif
1263     for(i=0; i<stackptr->size; i++) {
1264       if(stackptr->array[i] != NULL) {
1265                 markObj(stackptr->array[i]);
1266       }
1267     }
1268     stackptr=stackptr->next;
1269   }
1270
1271 #ifdef DEBUG
1272   BAMBOO_DEBUGPRINT(0xe503);
1273 #endif
1274   // enqueue objectsets
1275   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
1276     for(i=0; i<NUMCLASSES; i++) {
1277       struct parameterwrapper ** queues =
1278         objectqueues[BAMBOO_NUM_OF_CORE][i];
1279       int length = numqueues[BAMBOO_NUM_OF_CORE][i];
1280       for(j = 0; j < length; ++j) {
1281                 struct parameterwrapper * parameter = queues[j];
1282                 struct ObjectHash * set=parameter->objectset;
1283                 struct ObjectNode * ptr=set->listhead;
1284                 while(ptr!=NULL) {
1285                   markObj((void *)ptr->key);
1286                   ptr=ptr->lnext;
1287                 }
1288       }
1289     }
1290   }
1291
1292   // euqueue current task descriptor
1293   if(currtpd != NULL) {
1294 #ifdef DEBUG
1295     BAMBOO_DEBUGPRINT(0xe504);
1296 #endif
1297     for(i=0; i<currtpd->numParameters; i++) {
1298       markObj(currtpd->parameterArray[i]);
1299     }
1300   }
1301
1302 #ifdef DEBUG
1303   BAMBOO_DEBUGPRINT(0xe505);
1304 #endif
1305   // euqueue active tasks
1306   if(activetasks != NULL) {
1307     struct genpointerlist * ptr=activetasks->list;
1308     while(ptr!=NULL) {
1309       struct taskparamdescriptor *tpd=ptr->src;
1310       int i;
1311       for(i=0; i<tpd->numParameters; i++) {
1312                 markObj(tpd->parameterArray[i]);
1313       }
1314       ptr=ptr->inext;
1315     }
1316   }
1317
1318 #ifdef DEBUG
1319   BAMBOO_DEBUGPRINT(0xe506);
1320 #endif
1321   // enqueue cached transferred obj
1322   struct QueueItem * tmpobjptr =  getHead(&objqueue);
1323   while(tmpobjptr != NULL) {
1324     struct transObjInfo * objInfo =
1325       (struct transObjInfo *)(tmpobjptr->objectptr);
1326     markObj(objInfo->objptr);
1327     tmpobjptr = getNextQueueItem(tmpobjptr);
1328   }
1329
1330 #ifdef DEBUG
1331   BAMBOO_DEBUGPRINT(0xe507);
1332 #endif
1333   // enqueue cached objs to be transferred
1334   struct QueueItem * item = getHead(totransobjqueue);
1335   while(item != NULL) {
1336     struct transObjInfo * totransobj =
1337       (struct transObjInfo *)(item->objectptr);
1338     markObj(totransobj->objptr);
1339     item = getNextQueueItem(item);
1340   }       // while(item != NULL)
1341
1342 #ifdef DEBUG
1343   BAMBOO_DEBUGPRINT(0xe508);
1344 #endif
1345   // enqueue lock related info
1346   for(i = 0; i < runtime_locklen; ++i) {
1347     markObj((void *)(runtime_locks[i].redirectlock));
1348     if(runtime_locks[i].value != NULL) {
1349       markObj((void *)(runtime_locks[i].value));
1350     }
1351   }
1352
1353 } // void tomark(struct garbagelist * stackptr)
1354
1355 inline void mark(bool isfirst,
1356                  struct garbagelist * stackptr) {
1357 #ifdef DEBUG
1358   if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed01);
1359 #endif
1360   if(isfirst) {
1361 #ifdef DEBUG
1362     if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed02);
1363 #endif
1364     // enqueue root objs
1365     tomark(stackptr);
1366     gccurr_heaptop = 0; // record the size of all active objs in this core
1367                         // aligned but does not consider block boundaries
1368     gcmarkedptrbound = 0;
1369   }
1370 #ifdef DEBUG
1371   if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed03);
1372 #endif
1373   int isize = 0;
1374   bool checkfield = true;
1375   bool sendStall = false;
1376   // mark phase
1377   while(MARKPHASE == gcphase) {
1378 #ifdef DEBUG
1379     if(BAMBOO_NUM_OF_CORE == 0) BAMBOO_DEBUGPRINT(0xed04);
1380 #endif
1381     while(true) {
1382       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1383       bool hasItems = gc_moreItems2_I();
1384       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1385 #ifdef DEBUG
1386       BAMBOO_DEBUGPRINT(0xed05);
1387 #endif
1388       if(!hasItems) {
1389                 break;
1390       }
1391       sendStall = false;
1392       gcbusystatus = true;
1393       checkfield = true;
1394       void * ptr = gc_dequeue2_I();
1395
1396 #ifdef DEBUG
1397       BAMBOO_DEBUGPRINT_REG(ptr);
1398 #endif
1399       int size = 0;
1400       int isize = 0;
1401       int type = 0;
1402       // check if it is a shared obj
1403       if(ISSHAREDOBJ(ptr)) {
1404                 // a shared obj, check if it is a local obj on this core
1405                 int host = hostcore(ptr);
1406                 bool islocal = (host == BAMBOO_NUM_OF_CORE);
1407                 if(islocal) {
1408                   bool isnotmarked = ((((int *)ptr)[6] & DISCOVERED) != 0);
1409                   if(isLarge(ptr, &type, &size) && isnotmarked) {
1410                         // ptr is a large object and not marked or enqueued
1411 #ifdef DEBUG
1412                         BAMBOO_DEBUGPRINT(0xecec);
1413                         BAMBOO_DEBUGPRINT_REG(ptr);
1414                         BAMBOO_DEBUGPRINT_REG(*((int*)ptr));
1415 #endif
1416                         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1417                         gc_lobjenqueue_I(ptr, size, BAMBOO_NUM_OF_CORE);
1418                         gcnumlobjs++;
1419                         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1420                         // mark this obj
1421                         ((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
1422                         BAMBOO_CACHE_FLUSH_LINE(ptr);
1423                   } else if(isnotmarked) {
1424                         // ptr is an unmarked active object on this core
1425                         ALIGNSIZE(size, &isize);
1426                         gccurr_heaptop += isize;
1427 #ifdef DEBUG
1428                         BAMBOO_DEBUGPRINT(0xaaaa);
1429                         BAMBOO_DEBUGPRINT_REG(ptr);
1430                         BAMBOO_DEBUGPRINT_REG(isize);
1431                         BAMBOO_DEBUGPRINT(((int *)(ptr))[0]);
1432 #endif
1433                         // mark this obj
1434                         ((int *)ptr)[6] = ((int *)ptr)[6] & (~DISCOVERED) | MARKED;
1435                         BAMBOO_CACHE_FLUSH_LINE(ptr);
1436                   
1437                         if(ptr + size > gcmarkedptrbound) {
1438                           gcmarkedptrbound = ptr + size;
1439                         } // if(ptr + size > gcmarkedptrbound)
1440                   } else {
1441                         // ptr is not an active obj or has been marked
1442                         checkfield = false;
1443                   } // if(isLarge(ptr, &type, &size)) else ...
1444                 }  /* can never reach here
1445                 else {
1446 #ifdef DEBUG
1447                   if(BAMBOO_NUM_OF_CORE == 0) {
1448                         BAMBOO_DEBUGPRINT(0xbbbb);
1449                         BAMBOO_DEBUGPRINT_REG(host);
1450                         BAMBOO_DEBUGPRINT_REG(ptr);
1451                   }
1452 #endif
1453                   // check if this obj has been forwarded
1454                   if(!MGCHashcontains(gcforwardobjtbl, (int)ptr)) {
1455                         // send a msg to host informing that ptr is active
1456                         send_msg_2(host, GCMARKEDOBJ, ptr, false);
1457                         gcself_numsendobjs++;
1458                         MGCHashadd(gcforwardobjtbl, (int)ptr);
1459                   }
1460                         checkfield = false;
1461                 }// if(isLocal(ptr)) else ...*/
1462           }   // if(ISSHAREDOBJ(ptr))
1463 #ifdef DEBUG
1464       BAMBOO_DEBUGPRINT(0xed06);
1465 #endif
1466
1467       if(checkfield) {
1468                 // scan all pointers in ptr
1469                 unsigned INTPTR * pointer;
1470                 pointer=pointerarray[type];
1471                 if (pointer==0) {
1472                   /* Array of primitives */
1473                   /* Do nothing */
1474                 } else if (((INTPTR)pointer)==1) {
1475                   /* Array of pointers */
1476                   struct ArrayObject *ao=(struct ArrayObject *) ptr;
1477                   int length=ao->___length___;
1478                   int j;
1479                   for(j=0; j<length; j++) {
1480                         void *objptr =
1481                           ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
1482                         markObj(objptr);
1483                   }
1484                 } else {
1485                   INTPTR size=pointer[0];
1486                   int i;
1487                   for(i=1; i<=size; i++) {
1488                         unsigned int offset=pointer[i];
1489                         void * objptr=*((void **)(((char *)ptr)+offset));
1490                         markObj(objptr);
1491                   }
1492                 }     // if (pointer==0) else if ... else ...
1493       }   // if(checkfield)
1494     }     // while(gc_moreItems2())
1495 #ifdef DEBUG
1496     BAMBOO_DEBUGPRINT(0xed07);
1497 #endif
1498     gcbusystatus = false;
1499     // send mark finish msg to core coordinator
1500     if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
1501 #ifdef DEBUG
1502       BAMBOO_DEBUGPRINT(0xed08);
1503 #endif
1504       gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
1505       gcnumsendobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=gcself_numsendobjs;
1506       gcnumreceiveobjs[gcnumsrobjs_index][BAMBOO_NUM_OF_CORE]=
1507                 gcself_numreceiveobjs;
1508       gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
1509     } else {
1510       if(!sendStall) {
1511 #ifdef DEBUG
1512                 BAMBOO_DEBUGPRINT(0xed09);
1513 #endif
1514                 send_msg_4(STARTUPCORE, GCFINISHMARK, BAMBOO_NUM_OF_CORE,
1515                                    gcself_numsendobjs, gcself_numreceiveobjs, false);
1516                 sendStall = true;
1517       }
1518     }             // if(STARTUPCORE == BAMBOO_NUM_OF_CORE) ...
1519 #ifdef DEBUG
1520     BAMBOO_DEBUGPRINT(0xed0a);
1521 #endif
1522
1523     if(BAMBOO_NUM_OF_CORE == STARTUPCORE) {
1524 #ifdef DEBUG
1525       BAMBOO_DEBUGPRINT(0xed0b);
1526 #endif
1527       return;
1528     }
1529   } // while(MARKPHASE == gcphase)
1530
1531   BAMBOO_CACHE_MF();
1532 } // mark()
1533
1534 inline void compact2Heaptophelper_I(int coren,
1535                                     int* p,
1536                                     int* numblocks,
1537                                     int* remain) {
1538   int b;
1539   int memneed = gcrequiredmems[coren] + BAMBOO_CACHE_LINE_SIZE;
1540   if(STARTUPCORE == coren) {
1541     gctomove = true;
1542     gcmovestartaddr = *p;
1543     gcdstcore = gctopcore;
1544     gcblock2fill = *numblocks + 1;
1545   } else {
1546     send_msg_4(coren, GCMOVESTART, gctopcore, *p, (*numblocks) + 1, false);
1547   }
1548 #ifdef DEBUG
1549   BAMBOO_DEBUGPRINT_REG(coren);
1550   BAMBOO_DEBUGPRINT_REG(gctopcore);
1551   BAMBOO_DEBUGPRINT_REG(*p);
1552   BAMBOO_DEBUGPRINT_REG(*numblocks+1);
1553 #endif
1554   if(memneed < *remain) {
1555 #ifdef DEBUG
1556     BAMBOO_DEBUGPRINT(0xd104);
1557 #endif
1558     *p = *p + memneed;
1559     gcrequiredmems[coren] = 0;
1560     gcloads[gctopcore] += memneed;
1561     *remain = *remain - memneed;
1562   } else {
1563 #ifdef DEBUG
1564     BAMBOO_DEBUGPRINT(0xd105);
1565 #endif
1566     // next available block
1567     *p = *p + *remain;
1568     gcfilledblocks[gctopcore] += 1;
1569     int newbase = 0;
1570     BASEPTR(gctopcore, gcfilledblocks[gctopcore], &newbase);
1571     gcloads[gctopcore] = newbase;
1572     gcrequiredmems[coren] -= *remain - BAMBOO_CACHE_LINE_SIZE;
1573     gcstopblock[gctopcore]++;
1574     gctopcore = NEXTTOPCORE(gctopblock);
1575     gctopblock++;
1576     *numblocks = gcstopblock[gctopcore];
1577     *p = gcloads[gctopcore];
1578     BLOCKINDEX(*p, &b);
1579     *remain=(b<NUMCORES4GC) ?
1580              ((BAMBOO_SMEM_SIZE_L)-((*p)%(BAMBOO_SMEM_SIZE_L)))
1581              : ((BAMBOO_SMEM_SIZE)-((*p)%(BAMBOO_SMEM_SIZE)));
1582 #ifdef DEBUG
1583     BAMBOO_DEBUGPRINT(0xd106);
1584     BAMBOO_DEBUGPRINT_REG(gctopcore);
1585     BAMBOO_DEBUGPRINT_REG(*p);
1586     BAMBOO_DEBUGPRINT_REG(b);
1587     BAMBOO_DEBUGPRINT_REG(*remain);
1588 #endif
1589   }       // if(memneed < remain)
1590   gcmovepending--;
1591 } // void compact2Heaptophelper_I(int, int*, int*, int*)
1592
1593 inline void compact2Heaptop() {
1594   // no cores with spare mem and some cores are blocked with pending move
1595   // find the current heap top and make them move to the heap top
1596   int p;
1597   int numblocks = gcfilledblocks[gctopcore];
1598   //BASEPTR(gctopcore, numblocks, &p);
1599   p = gcloads[gctopcore];
1600   int b;
1601   BLOCKINDEX(p, &b);
1602   int remain = (b<NUMCORES4GC) ?
1603                ((BAMBOO_SMEM_SIZE_L)-(p%(BAMBOO_SMEM_SIZE_L)))
1604                : ((BAMBOO_SMEM_SIZE)-(p%(BAMBOO_SMEM_SIZE)));
1605   // check if the top core finishes
1606   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1607   if(gccorestatus[gctopcore] != 0) {
1608 #ifdef DEBUG
1609     BAMBOO_DEBUGPRINT(0xd101);
1610     BAMBOO_DEBUGPRINT_REG(gctopcore);
1611 #endif
1612     // let the top core finishes its own work first
1613     compact2Heaptophelper_I(gctopcore, &p, &numblocks, &remain);
1614     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1615     return;
1616   }
1617   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1618
1619 #ifdef DEBUG
1620   BAMBOO_DEBUGPRINT(0xd102);
1621   BAMBOO_DEBUGPRINT_REG(gctopcore);
1622   BAMBOO_DEBUGPRINT_REG(p);
1623   BAMBOO_DEBUGPRINT_REG(b);
1624   BAMBOO_DEBUGPRINT_REG(remain);
1625 #endif
1626   for(int i = 0; i < NUMCORES4GC; i++) {
1627     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1628     if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {
1629 #ifdef DEBUG
1630       BAMBOO_DEBUGPRINT(0xd103);
1631 #endif
1632       compact2Heaptophelper_I(i, &p, &numblocks, &remain);
1633       if(gccorestatus[gctopcore] != 0) {
1634 #ifdef DEBUG
1635                 BAMBOO_DEBUGPRINT(0xd101);
1636                 BAMBOO_DEBUGPRINT_REG(gctopcore);
1637 #endif
1638                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1639                 // the top core is not free now
1640                 return;
1641       }
1642     }             // if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0))
1643     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1644   }       // for(i = 0; i < NUMCORES4GC; i++)
1645 #ifdef DEBUG
1646   BAMBOO_DEBUGPRINT(0xd106);
1647 #endif
1648 } // void compact2Heaptop()
1649
1650 inline void resolvePendingMoveRequest() {
1651 #ifdef DEBUG
1652   BAMBOO_DEBUGPRINT(0xeb01);
1653 #endif
1654 #ifdef DEBUG
1655   BAMBOO_DEBUGPRINT(0xeeee);
1656   for(int k = 0; k < NUMCORES4GC; k++) {
1657     BAMBOO_DEBUGPRINT(0xf000+k);
1658     BAMBOO_DEBUGPRINT_REG(gccorestatus[k]);
1659     BAMBOO_DEBUGPRINT_REG(gcloads[k]);
1660     BAMBOO_DEBUGPRINT_REG(gcfilledblocks[k]);
1661     BAMBOO_DEBUGPRINT_REG(gcstopblock[k]);
1662   }
1663   BAMBOO_DEBUGPRINT(0xffff);
1664 #endif
1665   int i;
1666   int j;
1667   bool nosparemem = true;
1668   bool haspending = false;
1669   bool hasrunning = false;
1670   bool noblock = false;
1671   int dstcore = 0;       // the core who need spare mem
1672   int sourcecore = 0;       // the core who has spare mem
1673   for(i = j = 0; (i < NUMCORES4GC) && (j < NUMCORES4GC); ) {
1674     if(nosparemem) {
1675       // check if there are cores with spare mem
1676       if(gccorestatus[i] == 0) {
1677                 // finished working, check if it still have spare mem
1678                 if(gcfilledblocks[i] < gcstopblock[i]) {
1679                   // still have spare mem
1680                   nosparemem = false;
1681                   sourcecore = i;
1682                 }  // if(gcfilledblocks[i] < gcstopblock[i]) else ...
1683       }
1684       i++;
1685     }             // if(nosparemem)
1686     if(!haspending) {
1687       if(gccorestatus[j] != 0) {
1688                 // not finished, check if it has pending move requests
1689                 if((gcfilledblocks[j]==gcstopblock[j])&&(gcrequiredmems[j]>0)) {
1690                   dstcore = j;
1691                   haspending = true;
1692                 } else {
1693                   hasrunning = true;
1694                 }  // if((gcfilledblocks[i] == gcstopblock[i])...) else ...
1695       }  // if(gccorestatus[i] == 0) else ...
1696       j++;
1697     }  // if(!haspending)
1698     if(!nosparemem && haspending) {
1699       // find match
1700       int tomove = 0;
1701       int startaddr = 0;
1702       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
1703       gcrequiredmems[dstcore] = assignSpareMem_I(sourcecore,
1704                                                  gcrequiredmems[dstcore],
1705                                                  &tomove,
1706                                                  &startaddr);
1707       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
1708 #ifdef DEBUG
1709       BAMBOO_DEBUGPRINT(0xeb02);
1710       BAMBOO_DEBUGPRINT_REG(sourcecore);
1711       BAMBOO_DEBUGPRINT_REG(dstcore);
1712       BAMBOO_DEBUGPRINT_REG(startaddr);
1713       BAMBOO_DEBUGPRINT_REG(tomove);
1714 #endif
1715       if(STARTUPCORE == dstcore) {
1716 #ifdef DEBUG
1717                 BAMBOO_DEBUGPRINT(0xeb03);
1718 #endif
1719                 gcdstcore = sourcecore;
1720                 gctomove = true;
1721                 gcmovestartaddr = startaddr;
1722                 gcblock2fill = tomove;
1723       } else {
1724 #ifdef DEBUG
1725                 BAMBOO_DEBUGPRINT(0xeb04);
1726 #endif
1727                 send_msg_4(dstcore, GCMOVESTART, sourcecore,
1728                                    startaddr, tomove, false);
1729       }
1730       gcmovepending--;
1731       nosparemem = true;
1732       haspending = false;
1733       noblock = true;
1734     }
1735   }       // for(i = 0; i < NUMCORES4GC; i++)
1736 #ifdef DEBUG
1737   BAMBOO_DEBUGPRINT(0xcccc);
1738   BAMBOO_DEBUGPRINT_REG(hasrunning);
1739   BAMBOO_DEBUGPRINT_REG(haspending);
1740   BAMBOO_DEBUGPRINT_REG(noblock);
1741 #endif
1742
1743   if(!hasrunning && !noblock) {
1744     gcphase = SUBTLECOMPACTPHASE;
1745     compact2Heaptop();
1746   }
1747
1748 } // void resovePendingMoveRequest()
1749
1750 struct moveHelper {
1751   int numblocks;       // block num for heap
1752   INTPTR base;       // base virtual address of current heap block
1753   INTPTR ptr;       // virtual address of current heap top
1754   int offset;       // offset in current heap block
1755   int blockbase;       // virtual address of current small block to check
1756   int blockbound;       // bound virtual address of current small blcok
1757   int sblockindex;       // index of the small blocks
1758   int top;       // real size of current heap block to check
1759   int bound;       // bound size of current heap block to check
1760 }; // struct moveHelper
1761
1762 // If out of boundary of valid shared memory, return false, else return true
1763 inline bool nextSBlock(struct moveHelper * orig) {
1764   orig->blockbase = orig->blockbound;
1765   bool sbchanged = false;
1766 #ifdef DEBUG
1767   BAMBOO_DEBUGPRINT(0xecc0);
1768   BAMBOO_DEBUGPRINT_REG(orig->blockbase);
1769   BAMBOO_DEBUGPRINT_REG(orig->blockbound);
1770   BAMBOO_DEBUGPRINT_REG(orig->bound);
1771   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1772 #endif
1773 outernextSBlock:
1774   // check if across a big block
1775   // TODO now do not zero out the whole memory, maybe the last two conditions
1776   // are useless now
1777   if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)
1778      || ((orig->ptr != NULL) && (*((int*)orig->ptr))==0)
1779      || ((*((int*)orig->blockbase))==0)) {
1780 innernextSBlock:
1781     // end of current heap block, jump to next one
1782     orig->numblocks++;
1783 #ifdef DEBUG
1784     BAMBOO_DEBUGPRINT(0xecc1);
1785     BAMBOO_DEBUGPRINT_REG(orig->numblocks);
1786 #endif
1787     BASEPTR(BAMBOO_NUM_OF_CORE, orig->numblocks, &(orig->base));
1788 #ifdef DEBUG
1789     BAMBOO_DEBUGPRINT(orig->base);
1790 #endif
1791     if(orig->base >= gcbaseva + BAMBOO_SHARED_MEM_SIZE) {
1792       // out of boundary
1793       orig->ptr = orig->base; // set current ptr to out of boundary too
1794       return false;
1795     }
1796     //orig->bound = orig->base + BAMBOO_SMEM_SIZE;
1797     orig->blockbase = orig->base;
1798     orig->sblockindex = (orig->blockbase-gcbaseva)/BAMBOO_SMEM_SIZE;
1799     sbchanged = true;
1800     int blocknum = 0;
1801     BLOCKINDEX(orig->base, &blocknum);
1802     if(bamboo_smemtbl[blocknum] == 0) {
1803       // goto next block
1804       goto innernextSBlock;
1805     }
1806         // check the bamboo_smemtbl to decide the real bound
1807         orig->bound = orig->base + bamboo_smemtbl[blocknum];
1808   } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) {
1809     orig->sblockindex += 1;
1810     sbchanged = true;
1811   }  // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
1812
1813   // check if this sblock should be skipped or have special start point
1814   if(gcsbstarttbl[orig->sblockindex] == -1) {
1815     // goto next sblock
1816 #ifdef DEBUG
1817     BAMBOO_DEBUGPRINT(0xecc2);
1818 #endif
1819     orig->sblockindex += 1;
1820     orig->blockbase += BAMBOO_SMEM_SIZE;
1821     goto outernextSBlock;
1822   } else if((gcsbstarttbl[orig->sblockindex] != 0)
1823             && (sbchanged)) {
1824     // the first time to access this SBlock
1825 #ifdef DEBUG
1826     BAMBOO_DEBUGPRINT(0xecc3);
1827 #endif
1828     // not start from the very beginning
1829     orig->blockbase = gcsbstarttbl[orig->sblockindex];
1830   }  // if(gcsbstarttbl[orig->sblockindex] == -1) else ...
1831
1832   // setup information for this sblock
1833   orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
1834   orig->offset = BAMBOO_CACHE_LINE_SIZE;
1835   orig->ptr = orig->blockbase + orig->offset;
1836 #ifdef DEBUG
1837   BAMBOO_DEBUGPRINT(0xecc4);
1838   BAMBOO_DEBUGPRINT_REG(orig->base);
1839   BAMBOO_DEBUGPRINT_REG(orig->bound);
1840   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1841   BAMBOO_DEBUGPRINT_REG(orig->blockbound);
1842   BAMBOO_DEBUGPRINT_REG(orig->blockbase);
1843   BAMBOO_DEBUGPRINT_REG(orig->offset);
1844 #endif
1845   if(orig->ptr >= orig->bound) {
1846     // met a lobj, move to next block
1847     goto innernextSBlock;
1848   }
1849
1850   return true;
1851 } // bool nextSBlock(struct moveHelper * orig)
1852
1853 // return false if there are no available data to compact
1854 inline bool initOrig_Dst(struct moveHelper * orig,
1855                          struct moveHelper * to) {
1856   // init the dst ptr
1857   to->numblocks = 0;
1858   to->top = to->offset = BAMBOO_CACHE_LINE_SIZE;
1859   to->bound = BAMBOO_SMEM_SIZE_L;
1860   BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
1861
1862 #ifdef DEBUG
1863   BAMBOO_DEBUGPRINT(0xef01);
1864   BAMBOO_DEBUGPRINT_REG(to->base);
1865 #endif
1866   to->ptr = to->base + to->offset;
1867 #ifdef GC_CACHE_ADAPT
1868   // initialize the gc_cache_revise_information
1869   gc_cache_revise_infomation.to_page_start_va = to->ptr;
1870   gc_cache_revise_infomation.to_page_end_va = (BAMBOO_PAGE_SIZE)*
1871         ((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
1872   gc_cache_revise_infomation.to_page_index = 
1873         (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
1874   gc_cache_revise_infomation.orig_page_start_va = -1; 
1875 #endif // GC_CACHE_ADAPT
1876
1877   // init the orig ptr
1878   orig->numblocks = 0;
1879   orig->base = to->base;
1880   int blocknum = 0;
1881   BLOCKINDEX(orig->base, &blocknum);
1882   // check the bamboo_smemtbl to decide the real bound
1883   orig->bound = orig->base + bamboo_smemtbl[blocknum];
1884   orig->blockbase = orig->base;
1885   orig->sblockindex = (orig->base - gcbaseva) / BAMBOO_SMEM_SIZE;
1886 #ifdef DEBUG
1887   BAMBOO_DEBUGPRINT(0xef02);
1888   BAMBOO_DEBUGPRINT_REG(orig->base);
1889   BAMBOO_DEBUGPRINT_REG(orig->sblockindex);
1890   BAMBOO_DEBUGPRINT_REG(gcsbstarttbl);
1891   BAMBOO_DEBUGPRINT_REG(gcsbstarttbl[orig->sblockindex]);
1892 #endif
1893
1894   if(gcsbstarttbl[orig->sblockindex] == -1) {
1895 #ifdef DEBUG
1896     BAMBOO_DEBUGPRINT(0xef03);
1897 #endif
1898     // goto next sblock
1899     orig->blockbound =
1900       gcbaseva+BAMBOO_SMEM_SIZE*(orig->sblockindex+1);
1901     return nextSBlock(orig);
1902   } else if(gcsbstarttbl[orig->sblockindex] != 0) {
1903 #ifdef DEBUG
1904     BAMBOO_DEBUGPRINT(0xef04);
1905 #endif
1906     orig->blockbase = gcsbstarttbl[orig->sblockindex];
1907   }
1908 #ifdef DEBUG
1909   BAMBOO_DEBUGPRINT(0xef05);
1910 #endif
1911   orig->blockbound = orig->blockbase + *((int*)(orig->blockbase));
1912   orig->offset = BAMBOO_CACHE_LINE_SIZE;
1913   orig->ptr = orig->blockbase + orig->offset;
1914 #ifdef DEBUG
1915   BAMBOO_DEBUGPRINT(0xef06);
1916   BAMBOO_DEBUGPRINT_REG(orig->base);
1917 #endif
1918
1919   return true;
1920 } // bool initOrig_Dst(struct moveHelper * orig, struct moveHelper * to)
1921
1922 inline void nextBlock(struct moveHelper * to) {
1923   to->top = to->bound + BAMBOO_CACHE_LINE_SIZE; // header!
1924   to->bound += BAMBOO_SMEM_SIZE;
1925   to->numblocks++;
1926   BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
1927   to->offset = BAMBOO_CACHE_LINE_SIZE;
1928   to->ptr = to->base + to->offset;
1929 } // void nextBlock(struct moveHelper * to)
1930
1931 // endaddr does not contain spaces for headers
1932 inline bool moveobj(struct moveHelper * orig,
1933                     struct moveHelper * to,
1934                     int stopblock) {
1935   if(stopblock == 0) {
1936     return true;
1937   }
1938
1939 #ifdef DEBUG
1940   BAMBOO_DEBUGPRINT(0xe201);
1941   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1942   BAMBOO_DEBUGPRINT_REG(to->ptr);
1943 #endif
1944
1945   int type = 0;
1946   int size = 0;
1947   int mark = 0;
1948   int isize = 0;
1949 innermoveobj:
1950   while((char)(*((int*)(orig->ptr))) == (char)(-2)) {
1951     orig->ptr = (int*)(orig->ptr) + 1;
1952   }
1953 #ifdef GC_CACHE_ADAPT
1954   if(orig->ptr >= gc_cache_revise_infomation.orig_page_end_va) {
1955         // end of an orig page
1956         // compute the impact of this page for the new page
1957         float tmp_factor = 
1958           ((float)(to->ptr-gc_cache_revise_infomation.to_page_start_va))/
1959           ((float)(BAMBOO_PAGE_SIZE));
1960         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
1961           ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
1962                  )[gc_cache_revise_infomation.to_page_index] += (int)(
1963                    ((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
1964                 gc_cache_revise_infomation.orig_page_index]*tmp_factor);
1965           // TODO
1966 /*        if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
1967                 tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
1968           }*/
1969         }
1970         // prepare for an new orig page
1971         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
1972         gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
1973           (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
1974         gc_cache_revise_infomation.orig_page_index = 
1975           (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
1976         gc_cache_revise_infomation.to_page_start_va = to->ptr;
1977   }
1978 #endif
1979   if((orig->ptr >= orig->bound) || (orig->ptr == orig->blockbound)) {
1980     if(!nextSBlock(orig)) {
1981       // finished, no more data
1982       return true;
1983     }
1984     goto innermoveobj;
1985   }
1986 #ifdef DEBUG
1987   BAMBOO_DEBUGPRINT(0xe202);
1988   BAMBOO_DEBUGPRINT_REG(orig->ptr);
1989   BAMBOO_DEBUGPRINT(((int *)(orig->ptr))[0]);
1990 #endif
1991   // check the obj's type, size and mark flag
1992   type = ((int *)(orig->ptr))[0];
1993   size = 0;
1994   if(type == 0) {
1995     // end of this block, go to next one
1996     if(!nextSBlock(orig)) {
1997       // finished, no more data
1998       return true;
1999     }
2000     goto innermoveobj;
2001   } else if(type < NUMCLASSES) {
2002     // a normal object
2003     size = classsize[type];
2004   } else {
2005     // an array
2006     struct ArrayObject *ao=(struct ArrayObject *)(orig->ptr);
2007     int elementsize=classsize[type];
2008     int length=ao->___length___;
2009     size=sizeof(struct ArrayObject)+length*elementsize;
2010   }
2011   mark = ((int *)(orig->ptr))[6];
2012   bool isremote = ((((int *)(orig->ptr))[6] & REMOTEM) != 0);
2013 #ifdef DEBUG
2014   BAMBOO_DEBUGPRINT(0xe203);
2015   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2016   BAMBOO_DEBUGPRINT_REG(size);
2017 #endif
2018   ALIGNSIZE(size, &isize);       // no matter is the obj marked or not
2019                                  // should be able to across it
2020   if((mark & MARKED) != 0) {
2021 #ifdef DEBUG
2022     BAMBOO_DEBUGPRINT(0xe204);
2023 #endif
2024 #ifdef GC_PROFILE
2025         gc_num_liveobj++;
2026 #endif
2027     // marked obj, copy it to current heap top
2028     // check to see if remaining space is enough
2029     if(to->top + isize > to->bound) {
2030       // fill 0 indicating the end of this block
2031       BAMBOO_MEMSET_WH(to->ptr,  '\0', to->bound - to->top);
2032       // fill the header of this block and then go to next block
2033       to->offset += to->bound - to->top;
2034       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
2035       (*((int*)(to->base))) = to->offset;
2036 #ifdef GC_CACHE_ADAPT
2037           int tmp_ptr = to->ptr;
2038 #endif // GC_CACHE_ADAPT
2039       nextBlock(to);
2040 #ifdef GC_CACHE_ADAPT
2041           if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
2042           // end of an to page, wrap up its information
2043           float tmp_factor = 
2044                 ((float)(tmp_ptr-gc_cache_revise_infomation.to_page_start_va))/
2045                 ((float)(BAMBOO_PAGE_SIZE));
2046           for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
2047                 ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
2048                  )[gc_cache_revise_infomation.to_page_index] += (int)(
2049                   ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
2050                   gc_cache_revise_infomation.orig_page_index]*tmp_factor);
2051                 // TODO
2052 /*        if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
2053                 tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
2054           }*/
2055           }
2056           // prepare for an new to page
2057           gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2058           gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
2059                 (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2060           gc_cache_revise_infomation.orig_page_index = 
2061                 (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2062           gc_cache_revise_infomation.to_page_start_va = to->ptr;
2063           gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
2064                 (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2065           gc_cache_revise_infomation.to_page_index = 
2066                 (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2067         }
2068 #endif // GC_CACHE_ADAPT
2069       if(stopblock == to->numblocks) {
2070                 // already fulfilled the block
2071                 return true;
2072       }   // if(stopblock == to->numblocks)
2073     }   // if(to->top + isize > to->bound)
2074     // set the mark field to 2, indicating that this obj has been moved
2075     // and need to be flushed
2076     ((int *)(orig->ptr))[6] = COMPACTED;
2077     if(to->ptr != orig->ptr) {
2078       if((int)(orig->ptr) < (int)(to->ptr)+size) {
2079                 memmove(to->ptr, orig->ptr, size);
2080       } else {
2081                 //BAMBOO_WRITE_HINT_CACHE(to->ptr, size);
2082                 memcpy(to->ptr, orig->ptr, size);
2083       }
2084       // fill the remaining space with -2
2085       BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size);
2086     }
2087     // store mapping info
2088     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2089 #ifdef LOCALHASHTBL_TEST
2090     RuntimeHashadd_I(gcpointertbl, orig->ptr, to->ptr);
2091 #else
2092         mgchashInsert_I(gcpointertbl, orig->ptr, to->ptr);
2093 #endif
2094         //MGCHashadd_I(gcpointertbl, orig->ptr, to->ptr);
2095         if(isremote) {
2096           // add to the sharedptbl
2097           if(gcsharedptbl != NULL) {
2098                 //GCSharedHashadd_I(gcsharedptbl, orig->ptr, to->ptr);
2099                 mgcsharedhashInsert_I(gcsharedptbl, orig->ptr, to->ptr);
2100           }
2101         }
2102     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2103     //}
2104 #ifdef DEBUG
2105     BAMBOO_DEBUGPRINT(0xcdce);
2106     BAMBOO_DEBUGPRINT_REG(orig->ptr);
2107     BAMBOO_DEBUGPRINT_REG(to->ptr);
2108         BAMBOO_DEBUGPRINT_REG(isize);
2109 #endif
2110     gccurr_heaptop -= isize;
2111     to->ptr += isize;
2112     to->offset += isize;
2113     to->top += isize;
2114 #ifdef GC_CACHE_ADAPT
2115         int tmp_ptr = to->ptr;
2116 #endif // GC_CACHE_ADAPT
2117     if(to->top == to->bound) {
2118       // fill the header of this block and then go to next block
2119       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
2120       (*((int*)(to->base))) = to->offset;
2121       nextBlock(to);
2122     }
2123 #ifdef GC_CACHE_ADAPT
2124         if((to->base+to->bound) >= gc_cache_revise_infomation.to_page_end_va) {
2125           // end of an to page, wrap up its information
2126           float tmp_factor = 
2127                 ((float)(tmp_ptr-gc_cache_revise_infomation.to_page_start_va))/
2128                 ((float)(BAMBOO_PAGE_SIZE));
2129           for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
2130                 ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
2131                  )[gc_cache_revise_infomation.to_page_index] += (int)(
2132                   ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
2133                   gc_cache_revise_infomation.orig_page_index]*tmp_factor);
2134       // TODO
2135 /*        if(((gc_cache_revise_infomation.orig_page_start_va-gcbaseva)/(BAMBOO_PAGE_SIZE))*(BAMBOO_PAGE_SIZE)+gcbaseva == 0xd180000) {
2136                 tprintf("0xd180000 -> %x %d, %d, %d\n",(int)(gcbaseva+(BAMBOO_PAGE_SIZE)*gc_cache_revise_infomation.to_page_index), (int)(((int*)((void *)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[gc_cache_revise_infomation.orig_page_index]*tmp_factor), (int)(tmp_factor*100000), (int)(to->ptr-gc_cache_revise_infomation.to_page_start_va));
2137           }*/
2138           }
2139           // prepare for an new to page
2140           gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2141           gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
2142                 (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2143           gc_cache_revise_infomation.orig_page_index = 
2144                 (orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2145           gc_cache_revise_infomation.to_page_start_va = to->ptr;
2146           gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
2147                 (BAMBOO_PAGE_SIZE)*((to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2148           gc_cache_revise_infomation.to_page_index = 
2149                 (to->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE);
2150         }
2151 #endif // GC_CACHE_ADAPT
2152   } // if(mark == 1)
2153 #ifdef DEBUG
2154   BAMBOO_DEBUGPRINT(0xe205);
2155 #endif
2156   // move to next obj
2157   orig->ptr += size;
2158
2159 #ifdef DEBUG
2160   BAMBOO_DEBUGPRINT_REG(isize);
2161   BAMBOO_DEBUGPRINT_REG(size);
2162   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2163   BAMBOO_DEBUGPRINT_REG(orig->bound);
2164 #endif
2165   if((orig->ptr > orig->bound) || (orig->ptr == orig->blockbound)) {
2166 #ifdef DEBUG
2167     BAMBOO_DEBUGPRINT(0xe206);
2168 #endif
2169     if(!nextSBlock(orig)) {
2170       // finished, no more data
2171       return true;
2172     }
2173   }
2174 #ifdef DEBUG
2175   BAMBOO_DEBUGPRINT(0xe207);
2176   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2177 #endif
2178   return false;
2179 } //bool moveobj(struct moveHelper* orig,struct moveHelper* to,int* endaddr)
2180
2181 // should be invoked with interrupt closed
2182 inline int assignSpareMem_I(int sourcecore,
2183                             int * requiredmem,
2184                             int * tomove,
2185                             int * startaddr) {
2186   int b = 0;
2187   BLOCKINDEX(gcloads[sourcecore], &b);
2188   int boundptr = (b<NUMCORES4GC) ? ((b+1)*BAMBOO_SMEM_SIZE_L)
2189                  : (BAMBOO_LARGE_SMEM_BOUND+(b-NUMCORES4GC+1)*BAMBOO_SMEM_SIZE);
2190   int remain = boundptr - gcloads[sourcecore];
2191   int memneed = requiredmem + BAMBOO_CACHE_LINE_SIZE;
2192   *startaddr = gcloads[sourcecore];
2193   *tomove = gcfilledblocks[sourcecore] + 1;
2194   if(memneed < remain) {
2195     gcloads[sourcecore] += memneed;
2196     return 0;
2197   } else {
2198     // next available block
2199     gcfilledblocks[sourcecore] += 1;
2200     int newbase = 0;
2201     BASEPTR(sourcecore, gcfilledblocks[sourcecore], &newbase);
2202     gcloads[sourcecore] = newbase;
2203     return requiredmem-remain;
2204   }
2205 } // int assignSpareMem_I(int ,int * , int * , int * )
2206
2207 // should be invoked with interrupt closed
2208 inline bool gcfindSpareMem_I(int * startaddr,
2209                              int * tomove,
2210                              int * dstcore,
2211                              int requiredmem,
2212                              int requiredcore) {
2213   for(int k = 0; k < NUMCORES4GC; k++) {
2214     if((gccorestatus[k] == 0) && (gcfilledblocks[k] < gcstopblock[k])) {
2215       // check if this stopped core has enough mem
2216       assignSpareMem_I(k, requiredmem, tomove, startaddr);
2217       *dstcore = k;
2218       return true;
2219     }
2220   }
2221   // if can not find spare mem right now, hold the request
2222   gcrequiredmems[requiredcore] = requiredmem;
2223   gcmovepending++;
2224   return false;
2225 } //bool gcfindSpareMem_I(int* startaddr,int* tomove,int mem,int core)
2226
2227 inline bool compacthelper(struct moveHelper * orig,
2228                           struct moveHelper * to,
2229                           int * filledblocks,
2230                           int * heaptopptr,
2231                           bool * localcompact) {
2232   // scan over all objs in this block, compact the marked objs
2233   // loop stop when finishing either scanning all active objs or
2234   // fulfilled the gcstopblock
2235 #ifdef DEBUG
2236   BAMBOO_DEBUGPRINT(0xe101);
2237   BAMBOO_DEBUGPRINT_REG(gcblock2fill);
2238   BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
2239 #endif
2240 innercompact:
2241   while(orig->ptr < gcmarkedptrbound) {
2242     bool stop = moveobj(orig, to, gcblock2fill);
2243     if(stop) {
2244       break;
2245     }
2246   }
2247 #ifdef GC_CACHE_ADAPT
2248         // end of an to page, wrap up its information
2249         float tmp_factor = 
2250           ((float)(to->ptr-gc_cache_revise_infomation.to_page_start_va))/
2251           ((float)(BAMBOO_PAGE_SIZE));
2252         for(int tt = 0; tt < NUMCORESACTIVE; tt++) {
2253           ((int*)((void*)gccachesamplingtbl_r+tt*size_cachesamplingtbl_local_r)
2254            )[gc_cache_revise_infomation.to_page_index] += (int)(
2255                 ((int*)((void*)gccachesamplingtbl+tt*size_cachesamplingtbl_local))[
2256                 gc_cache_revise_infomation.orig_page_index]*tmp_factor);
2257         }
2258 #endif // GC_CACHE_ADAPT
2259   // if no objs have been compact, do nothing,
2260   // otherwise, fill the header of this block
2261   if(to->offset > BAMBOO_CACHE_LINE_SIZE) {
2262     BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
2263     (*((int*)(to->base))) = to->offset;
2264   } else {
2265     to->offset = 0;
2266     to->ptr = to->base;
2267     to->top -= BAMBOO_CACHE_LINE_SIZE;
2268   }  // if(to->offset > BAMBOO_CACHE_LINE_SIZE) else ...
2269   if(*localcompact) {
2270     *heaptopptr = to->ptr;
2271     *filledblocks = to->numblocks;
2272   }
2273 #ifdef DEBUG
2274   BAMBOO_DEBUGPRINT(0xe102);
2275   BAMBOO_DEBUGPRINT_REG(orig->ptr);
2276   BAMBOO_DEBUGPRINT_REG(gcmarkedptrbound);
2277   BAMBOO_DEBUGPRINT_REG(*heaptopptr);
2278   BAMBOO_DEBUGPRINT_REG(*filledblocks);
2279   BAMBOO_DEBUGPRINT_REG(gccurr_heaptop);
2280 #endif
2281
2282   // send msgs to core coordinator indicating that the compact is finishing
2283   // send compact finish message to core coordinator
2284   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
2285     gcfilledblocks[BAMBOO_NUM_OF_CORE] = *filledblocks;
2286     gcloads[BAMBOO_NUM_OF_CORE] = *heaptopptr;
2287     if(orig->ptr < gcmarkedptrbound) {
2288 #ifdef DEBUG
2289       BAMBOO_DEBUGPRINT(0xe103);
2290 #endif
2291       // ask for more mem
2292       gctomove = false;
2293       BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2294       if(gcfindSpareMem_I(&gcmovestartaddr, &gcblock2fill, &gcdstcore,
2295                           gccurr_heaptop, BAMBOO_NUM_OF_CORE)) {
2296 #ifdef DEBUG
2297                 BAMBOO_DEBUGPRINT(0xe104);
2298 #endif
2299                 gctomove = true;
2300       } else {
2301                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2302 #ifdef DEBUG
2303                 BAMBOO_DEBUGPRINT(0xe105);
2304 #endif
2305                 return false;
2306       }
2307       BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2308     } else {
2309 #ifdef DEBUG
2310       BAMBOO_DEBUGPRINT(0xe106);
2311 #endif
2312       gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
2313       gctomove = false;
2314       return true;
2315     }
2316   } else {
2317     if(orig->ptr < gcmarkedptrbound) {
2318 #ifdef DEBUG
2319       BAMBOO_DEBUGPRINT(0xe107);
2320 #endif
2321       // ask for more mem
2322       gctomove = false;
2323       send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
2324                  *filledblocks, *heaptopptr, gccurr_heaptop, false);
2325     } else {
2326 #ifdef DEBUG
2327       BAMBOO_DEBUGPRINT(0xe108);
2328       BAMBOO_DEBUGPRINT_REG(*heaptopptr);
2329 #endif
2330       // finish compacting
2331       send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
2332                  *filledblocks, *heaptopptr, 0, false);
2333     }
2334   }       // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)
2335
2336   if(orig->ptr < gcmarkedptrbound) {
2337 #ifdef DEBUG
2338     BAMBOO_DEBUGPRINT(0xe109);
2339 #endif
2340     // still have unpacked obj
2341     while(true) {
2342       if(gctomove) {
2343                 break;
2344       }
2345     }
2346     ;
2347         gctomove = false;
2348 #ifdef DEBUG
2349     BAMBOO_DEBUGPRINT(0xe10a);
2350 #endif
2351
2352     to->ptr = gcmovestartaddr;
2353     to->numblocks = gcblock2fill - 1;
2354     to->bound = (to->numblocks==0) ?
2355                 BAMBOO_SMEM_SIZE_L :
2356                 BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
2357     BASEPTR(gcdstcore, to->numblocks, &(to->base));
2358     to->offset = to->ptr - to->base;
2359     to->top = (to->numblocks==0) ?
2360               (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
2361     to->base = to->ptr;
2362     to->offset = BAMBOO_CACHE_LINE_SIZE;
2363     to->ptr += to->offset;             // for header
2364     to->top += to->offset;
2365     if(gcdstcore == BAMBOO_NUM_OF_CORE) {
2366       *localcompact = true;
2367     } else {
2368       *localcompact = false;
2369     }
2370 #ifdef GC_CACHE_ADAPT
2371         // initialize the gc_cache_revise_information
2372         gc_cache_revise_infomation.to_page_start_va = to->ptr;
2373         gc_cache_revise_infomation.to_page_end_va = gcbaseva + 
2374           (BAMBOO_PAGE_SIZE)*((to->base-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2375         gc_cache_revise_infomation.to_page_index = 
2376           (to->base-gcbaseva)/(BAMBOO_PAGE_SIZE);
2377         gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2378         gc_cache_revise_infomation.orig_page_end_va = gcbaseva + 
2379           (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2380         gc_cache_revise_infomation.orig_page_index = 
2381           orig->blockbase/(BAMBOO_PAGE_SIZE);
2382 #endif // GC_CACHE_ADAPT
2383     goto innercompact;
2384   }
2385 #ifdef DEBUG
2386   BAMBOO_DEBUGPRINT(0xe10b);
2387 #endif
2388   return true;
2389 } // void compacthelper()
2390
2391 inline void compact() {
2392   if(COMPACTPHASE != gcphase) {
2393     BAMBOO_EXIT(0xb102);
2394   }
2395
2396   // initialize pointers for comapcting
2397   struct moveHelper * orig =
2398     (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
2399   struct moveHelper * to =
2400     (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
2401
2402   if(!initOrig_Dst(orig, to)) {
2403     // no available data to compact
2404     // send compact finish msg to STARTUP core
2405 #ifdef DEBUG
2406     BAMBOO_DEBUGPRINT(0xe001);
2407     BAMBOO_DEBUGPRINT_REG(to->base);
2408 #endif
2409     send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
2410                0, to->base, 0, false);
2411     RUNFREE(orig);
2412     RUNFREE(to);
2413     return;
2414   }
2415 #ifdef GC_CACHE_ADAPT
2416   gc_cache_revise_infomation.orig_page_start_va = orig->ptr;
2417   gc_cache_revise_infomation.orig_page_end_va = gcbaseva +  
2418         (BAMBOO_PAGE_SIZE)*((orig->ptr-gcbaseva)/(BAMBOO_PAGE_SIZE)+1);
2419   gc_cache_revise_infomation.orig_page_index = 
2420         orig->blockbase/(BAMBOO_PAGE_SIZE);
2421 #endif
2422
2423   int filledblocks = 0;
2424   INTPTR heaptopptr = 0;
2425   bool localcompact = true;
2426   compacthelper(orig, to, &filledblocks, &heaptopptr, &localcompact);
2427
2428   RUNFREE(orig);
2429   RUNFREE(to);
2430 } // compact()
2431
2432 // if return NULL, means
2433 //   1. objptr is NULL
2434 //   2. objptr is not a shared obj
2435 // in these cases, remain the original value is OK
2436 inline void * flushObj(void * objptr) {
2437 #ifdef DEBUG
2438   BAMBOO_DEBUGPRINT(0xe401);
2439 #endif
2440   if(objptr == NULL) {
2441     return NULL;
2442   }
2443   void * dstptr = NULL;
2444   if(ISSHAREDOBJ(objptr)) {
2445 #ifdef DEBUG
2446     BAMBOO_DEBUGPRINT(0xe402);
2447     BAMBOO_DEBUGPRINT_REG(objptr);
2448 #endif
2449     // a shared obj ptr, change to new address
2450     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2451 #ifdef GC_PROFILE
2452     //unsigned long long ttime = BAMBOO_GET_EXE_TIME();
2453 #endif
2454 #ifdef LOCALHASHTBL_TEST
2455     RuntimeHashget(gcpointertbl, objptr, &dstptr);
2456 #else
2457         dstptr = mgchashSearch(gcpointertbl, objptr);
2458 #endif
2459         //MGCHashget(gcpointertbl, objptr, &dstptr);
2460 #ifdef GC_PROFILE
2461     //flushstalltime += BAMBOO_GET_EXE_TIME()-ttime;
2462 #endif
2463     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2464 #ifdef DEBUG
2465     BAMBOO_DEBUGPRINT_REG(dstptr);
2466 #endif
2467
2468     if(NULL == dstptr) {
2469       // no mapping info
2470 #ifdef DEBUG
2471       BAMBOO_DEBUGPRINT(0xe403);
2472       BAMBOO_DEBUGPRINT_REG(objptr);
2473       BAMBOO_DEBUGPRINT_REG(hostcore(objptr));
2474 #endif
2475       if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) {
2476                 // error! the obj is right on this core, but cannot find it
2477                 //BAMBOO_DEBUGPRINT(0xecec);
2478                 BAMBOO_DEBUGPRINT_REG(objptr);
2479                 BAMBOO_EXIT(0xb103);
2480                 // assume that the obj has not been moved, use the original address
2481                 //dstptr = objptr;
2482       } else {
2483                 int hostc = hostcore(objptr);
2484 #ifdef GC_PROFILE
2485                 //unsigned long long ttimet = BAMBOO_GET_EXE_TIME();
2486 #endif
2487                 // check the corresponsing sharedptbl
2488                 BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2489                 //struct GCSharedHash * sptbl = gcrpointertbls[hostcore(objptr)];
2490                 mgcsharedhashtbl_t * sptbl = gcrpointertbls[hostc];
2491                 if(sptbl != NULL) {
2492                   //GCSharedHashget(sptbl, (int)objptr, &dstptr);
2493                   dstptr = mgcsharedhashSearch(sptbl, (int)objptr);
2494                   if(dstptr != NULL) {
2495 #ifdef LOCALHASHTBL_TEST
2496                         RuntimeHashadd_I(gcpointertbl, (int)objptr, (int)dstptr);
2497 #else
2498                         mgchashInsert_I(gcpointertbl, (int)objptr, (int)dstptr);
2499 #endif
2500                   }
2501                 }
2502                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2503 #ifdef GC_PROFILE
2504                 //flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
2505 #endif
2506
2507                 if(dstptr == NULL) {
2508                   // still can not get the mapping info,
2509                   // send msg to host core for the mapping info
2510                   gcobj2map = (int)objptr;
2511                   gcismapped = false;
2512                   gcmappedobj = NULL;
2513                   // the first time require the mapping, send msg to the hostcore
2514                   // for the mapping info
2515                   send_msg_3(hostc, GCMAPREQUEST, (int)objptr,
2516                           BAMBOO_NUM_OF_CORE, false);
2517                   while(true) {
2518                         if(gcismapped) {
2519                           break;
2520                         }
2521                   }
2522 #ifdef GC_PROFILE
2523                   //flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimet;
2524 #endif
2525                   BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2526 #ifdef LOCALHASHTBL_TEST
2527                   RuntimeHashget(gcpointertbl, objptr, &dstptr);
2528 #else
2529                   dstptr = mgchashSearch(gcpointertbl, objptr);
2530 #endif
2531                   //MGCHashget(gcpointertbl, objptr, &dstptr);
2532                   BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2533                 } // if(dstptr == NULL)
2534           }    // if(hostcore(objptr) == BAMBOO_NUM_OF_CORE) else ...
2535 #ifdef DEBUG
2536       BAMBOO_DEBUGPRINT_REG(dstptr);
2537 #endif
2538     }     // if(NULL == dstptr)
2539   }      // if(ISSHAREDOBJ(objptr))
2540          // if not a shared obj, return NULL to indicate no need to flush
2541 #ifdef DEBUG
2542   BAMBOO_DEBUGPRINT(0xe404);
2543 #endif
2544   return dstptr;
2545 } // void flushObj(void * objptr)
2546
2547 inline void flushRuntimeObj(struct garbagelist * stackptr) {
2548   int i,j;
2549   // flush current stack
2550   while(stackptr!=NULL) {
2551     for(i=0; i<stackptr->size; i++) {
2552       if(stackptr->array[i] != NULL) {
2553                 void * dst = flushObj(stackptr->array[i]);
2554                 if(dst != NULL) {
2555                   stackptr->array[i] = dst;
2556                 }
2557       }
2558     }
2559     stackptr=stackptr->next;
2560   }
2561
2562   // flush objectsets
2563   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
2564     for(i=0; i<NUMCLASSES; i++) {
2565       struct parameterwrapper ** queues =
2566         objectqueues[BAMBOO_NUM_OF_CORE][i];
2567       int length = numqueues[BAMBOO_NUM_OF_CORE][i];
2568       for(j = 0; j < length; ++j) {
2569                 struct parameterwrapper * parameter = queues[j];
2570                 struct ObjectHash * set=parameter->objectset;
2571                 struct ObjectNode * ptr=set->listhead;
2572                 while(ptr!=NULL) {
2573                   void * dst = flushObj((void *)ptr->key);
2574                   if(dst != NULL) {
2575                         ptr->key = dst;
2576                   }
2577                   ptr=ptr->lnext;
2578                 }
2579                 ObjectHashrehash(set);
2580       }
2581     }
2582   }
2583
2584   // flush current task descriptor
2585   if(currtpd != NULL) {
2586     for(i=0; i<currtpd->numParameters; i++) {
2587       void * dst = flushObj(currtpd->parameterArray[i]);
2588       if(dst != NULL) {
2589                 currtpd->parameterArray[i] = dst;
2590       }
2591     }
2592   }
2593
2594   // flush active tasks
2595   if(activetasks != NULL) {
2596     struct genpointerlist * ptr=activetasks->list;
2597     while(ptr!=NULL) {
2598       struct taskparamdescriptor *tpd=ptr->src;
2599       int i;
2600       for(i=0; i<tpd->numParameters; i++) {
2601                 void * dst = flushObj(tpd->parameterArray[i]);
2602                 if(dst != NULL) {
2603                   tpd->parameterArray[i] = dst;
2604                 }
2605       }
2606       ptr=ptr->inext;
2607     }
2608     genrehash(activetasks);
2609   }
2610
2611   // flush cached transferred obj
2612   struct QueueItem * tmpobjptr =  getHead(&objqueue);
2613   while(tmpobjptr != NULL) {
2614     struct transObjInfo * objInfo =
2615       (struct transObjInfo *)(tmpobjptr->objectptr);
2616     void * dst = flushObj(objInfo->objptr);
2617     if(dst != NULL) {
2618       objInfo->objptr = dst;
2619     }
2620     tmpobjptr = getNextQueueItem(tmpobjptr);
2621   }
2622
2623   // flush cached objs to be transferred
2624   struct QueueItem * item = getHead(totransobjqueue);
2625   while(item != NULL) {
2626     struct transObjInfo * totransobj =
2627       (struct transObjInfo *)(item->objectptr);
2628     void * dst = flushObj(totransobj->objptr);
2629     if(dst != NULL) {
2630       totransobj->objptr = dst;
2631     }
2632     item = getNextQueueItem(item);
2633   }       // while(item != NULL)
2634
2635   // enqueue lock related info
2636   for(i = 0; i < runtime_locklen; ++i) {
2637     void * dst = flushObj(runtime_locks[i].redirectlock);
2638     if(dst != NULL) {
2639       runtime_locks[i].redirectlock = (int)dst;
2640     }
2641     if(runtime_locks[i].value != NULL) {
2642       void * dst=flushObj(runtime_locks[i].value);
2643       if(dst != NULL) {
2644                 runtime_locks[i].value = (int)dst;
2645       }
2646     }
2647   }
2648
2649 } // void flushRuntimeObj(struct garbagelist * stackptr)
2650
2651 inline void transmappinginfo() {
2652   // broadcast the sharedptbl pointer
2653   for(int i = 0; i < NUMCORESACTIVE; i++) {
2654         if(i != BAMBOO_NUM_OF_CORE) {
2655           send_msg_3(i, GCMAPTBL, gcsharedptbl, BAMBOO_NUM_OF_CORE, false);
2656         }
2657   }
2658
2659   if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
2660         send_msg_2(STARTUPCORE, GCFINISHMAPINFO, BAMBOO_NUM_OF_CORE, false);
2661   }
2662 }
2663
2664 inline void flush(struct garbagelist * stackptr) {
2665
2666   flushRuntimeObj(stackptr);
2667
2668   while(true) {
2669     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2670     bool hasItems = gc_moreItems_I();
2671     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2672     if(!hasItems) {
2673       break;
2674     }
2675
2676 #ifdef DEBUG
2677     BAMBOO_DEBUGPRINT(0xe301);
2678 #endif
2679     BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
2680     void * ptr = gc_dequeue_I();
2681     BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
2682     if(ISSHAREDOBJ(ptr)) {
2683       // should be a local shared obj and should have mapping info
2684       ptr = flushObj(ptr);
2685 #ifdef DEBUG
2686       BAMBOO_DEBUGPRINT(0xe302);
2687       BAMBOO_DEBUGPRINT_REG(ptr);
2688       BAMBOO_DEBUGPRINT_REG(tptr);
2689       BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
2690 #endif
2691       if(ptr == NULL) {
2692                 BAMBOO_EXIT(0xb105);
2693       }
2694     } // if(ISSHAREDOBJ(ptr))
2695     if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED)) {
2696       int type = ((int *)(ptr))[0];
2697       // scan all pointers in ptr
2698       unsigned INTPTR * pointer;
2699       pointer=pointerarray[type];
2700 #ifdef DEBUG
2701       BAMBOO_DEBUGPRINT(0xe303);
2702       BAMBOO_DEBUGPRINT_REG(pointer);
2703 #endif
2704       if (pointer==0) {
2705                 /* Array of primitives */
2706                 /* Do nothing */
2707       } else if (((INTPTR)pointer)==1) {
2708 #ifdef DEBUG
2709                 BAMBOO_DEBUGPRINT(0xe304);
2710 #endif
2711                 /* Array of pointers */
2712                 struct ArrayObject *ao=(struct ArrayObject *) ptr;
2713                 int length=ao->___length___;
2714                 int j;
2715                 for(j=0; j<length; j++) {
2716 #ifdef DEBUG
2717                   BAMBOO_DEBUGPRINT(0xe305);
2718 #endif
2719                   void *objptr=
2720                         ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
2721 #ifdef DEBUG
2722                   BAMBOO_DEBUGPRINT_REG(objptr);
2723 #endif
2724                   if(objptr != NULL) {
2725                         void * dst = flushObj(objptr);
2726                         if(dst != NULL) {
2727                           ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
2728                         }
2729                   }
2730                 }
2731       } else {
2732 #ifdef DEBUG
2733                 BAMBOO_DEBUGPRINT(0xe306);
2734 #endif
2735                 INTPTR size=pointer[0];
2736                 int i;
2737                 for(i=1; i<=size; i++) {
2738 #ifdef DEBUG
2739                   BAMBOO_DEBUGPRINT(0xe307);
2740 #endif
2741                   unsigned int offset=pointer[i];
2742                   void * objptr=*((void **)(((char *)ptr)+offset));
2743 #ifdef DEBUG
2744                   BAMBOO_DEBUGPRINT_REG(objptr);
2745 #endif
2746                   if(objptr != NULL) {
2747                         void * dst = flushObj(objptr);
2748                         if(dst != NULL) {
2749                           *((void **)(((char *)ptr)+offset)) = dst;
2750                         }
2751                   }
2752                 } // for(i=1; i<=size; i++)
2753       }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
2754          // restore the mark field, indicating that this obj has been flushed
2755       if(ISSHAREDOBJ(ptr)) {
2756                 ((int *)(ptr))[6] = INIT;
2757       }
2758     }  // if((!ISSHAREDOBJ(ptr)) || (((int *)(ptr))[6] == COMPACTED))
2759   }   // while(gc_moreItems())
2760 #ifdef DEBUG
2761   BAMBOO_DEBUGPRINT(0xe308);
2762 #endif
2763
2764   // TODO bug here: the startup core contains all lobjs' info, thus all the
2765   // lobjs are flushed in sequence.
2766   // flush lobjs
2767   while(gc_lobjmoreItems_I()) {
2768 #ifdef DEBUG
2769     BAMBOO_DEBUGPRINT(0xe309);
2770 #endif
2771     void * ptr = gc_lobjdequeue_I(NULL, NULL);
2772     ptr = flushObj(ptr);
2773 #ifdef DEBUG
2774     BAMBOO_DEBUGPRINT(0xe30a);
2775     BAMBOO_DEBUGPRINT_REG(ptr);
2776     BAMBOO_DEBUGPRINT_REG(tptr);
2777     BAMBOO_DEBUGPRINT_REG(((int *)(tptr))[0]);
2778 #endif
2779     if(ptr == NULL) {
2780       BAMBOO_EXIT(0xb106);
2781     }
2782     if(((int *)(ptr))[6] == COMPACTED) {
2783       int type = ((int *)(ptr))[0];
2784       // scan all pointers in ptr
2785       unsigned INTPTR * pointer;
2786       pointer=pointerarray[type];
2787 #ifdef DEBUG
2788       BAMBOO_DEBUGPRINT(0xe30b);
2789       BAMBOO_DEBUGPRINT_REG(pointer);
2790 #endif
2791       if (pointer==0) {
2792                 /* Array of primitives */
2793                 /* Do nothing */
2794       } else if (((INTPTR)pointer)==1) {
2795 #ifdef DEBUG
2796                 BAMBOO_DEBUGPRINT(0xe30c);
2797 #endif
2798                 /* Array of pointers */
2799                 struct ArrayObject *ao=(struct ArrayObject *) ptr;
2800                 int length=ao->___length___;
2801                 int j;
2802                 for(j=0; j<length; j++) {
2803 #ifdef DEBUG
2804                   BAMBOO_DEBUGPRINT(0xe30d);
2805 #endif
2806                   void *objptr=
2807                         ((void **)(((char *)&ao->___length___)+sizeof(int)))[j];
2808 #ifdef DEBUG
2809                   BAMBOO_DEBUGPRINT_REG(objptr);
2810 #endif
2811                   if(objptr != NULL) {
2812                         void * dst = flushObj(objptr);
2813                         if(dst != NULL) {
2814                           ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = dst;
2815                         }
2816                   }
2817                 }
2818       } else {
2819 #ifdef DEBUG
2820                 BAMBOO_DEBUGPRINT(0xe30e);
2821 #endif
2822                 INTPTR size=pointer[0];
2823                 int i;
2824                 for(i=1; i<=size; i++) {
2825 #ifdef DEBUG
2826                   BAMBOO_DEBUGPRINT(0xe30f);
2827 #endif
2828                   unsigned int offset=pointer[i];
2829                   void * objptr=*((void **)(((char *)ptr)+offset));
2830
2831 #ifdef DEBUG
2832                   BAMBOO_DEBUGPRINT_REG(objptr);
2833 #endif
2834                   if(objptr != NULL) {
2835                         void * dst = flushObj(objptr);
2836                         if(dst != NULL) {
2837                           *((void **)(((char *)ptr)+offset)) = dst;
2838                         }
2839                   }
2840                 }  // for(i=1; i<=size; i++)
2841       }  // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
2842          // restore the mark field, indicating that this obj has been flushed
2843       ((int *)(ptr))[6] = INIT;
2844     }     // if(((int *)(ptr))[6] == COMPACTED)
2845   }     // while(gc_lobjmoreItems())
2846 #ifdef DEBUG
2847   BAMBOO_DEBUGPRINT(0xe310);
2848 #endif
2849
2850   // send flush finish message to core coordinator
2851   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
2852     gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
2853   } else {
2854     send_msg_2(STARTUPCORE, GCFINISHFLUSH, BAMBOO_NUM_OF_CORE, false);
2855   }
2856 #ifdef DEBUG
2857   BAMBOO_DEBUGPRINT(0xe311);
2858 #endif
2859 } // flush()
2860
2861 #ifdef GC_CACHE_ADAPT
2862 // prepare for cache adaption:
2863 //   -- flush the shared heap
2864 //   -- clean dtlb entries
2865 //   -- change cache strategy
2866 void cacheAdapt_gc(bool isgccachestage) {
2867   // flush the shared heap
2868   BAMBOO_CACHE_FLUSH_L2();
2869
2870   // clean the dtlb entries
2871   BAMBOO_CLEAN_DTLB();
2872
2873   // change the cache strategy
2874   gccachestage = isgccachestage;
2875 } // cacheAdapt_gc(bool isgccachestage)
2876
2877 // the master core decides how to adapt cache strategy for the mutator 
2878 // according to collected statistic data
2879 extern int gc_num_sampling;
2880
2881 // make all pages hfh
2882 int cacheAdapt_policy_h4h(){
2883   unsigned int page_index = 0;
2884   VA page_sva = 0;
2885   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2886   int numchanged = 0;
2887   int * tmp_p = gccachepolicytbl+1;
2888   for(page_index = 0; page_index < page_num; page_index++) {
2889         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2890         bamboo_cache_policy_t policy = {0};
2891         policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
2892         *tmp_p = page_index;
2893         tmp_p++;
2894         *tmp_p = policy.word;
2895         tmp_p++;
2896         numchanged++;
2897   }
2898
2899   return numchanged;
2900 } // int cacheAdapt_policy_hfh()
2901
2902 // make all pages local as non-cache-adaptable gc local mode
2903 int cacheAdapt_policy_local(){
2904   unsigned int page_index = 0;
2905   VA page_sva = 0;
2906   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2907   int numchanged = 0;
2908   int * tmp_p = gccachepolicytbl+1;
2909   for(page_index = 0; page_index < page_num; page_index++) {
2910         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2911         bamboo_cache_policy_t policy = {0};
2912         int block = 0;
2913         BLOCKINDEX(page_sva, &block);
2914         int coren = gc_block2core[block%(NUMCORES4GC*2)];
2915         // locally cache the page in the hotest core
2916         // NOTE: (x,y) should be changed to (x+1, y+1)!!!
2917         policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
2918         policy.lotar_x = bamboo_cpu2coords[2*coren]+1;
2919         policy.lotar_y = bamboo_cpu2coords[2*coren+1]+1;
2920         *tmp_p = page_index;
2921         tmp_p++;
2922         *tmp_p = policy.word;
2923         tmp_p++;
2924         numchanged++;
2925   }
2926
2927   return numchanged;
2928 } // int cacheAdapt_policy_local()
2929
2930 int cacheAdapt_policy_hotest(){
2931   unsigned int page_index = 0;
2932   VA page_sva = 0;
2933   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2934   int numchanged = 0;
2935   int * tmp_p = gccachepolicytbl+1;
2936   for(page_index = 0; page_index < page_num; page_index++) {
2937         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2938         bamboo_cache_policy_t policy = {0};
2939         int hotestcore = 0;
2940         int hotfreq = 0;
2941
2942         for(int i = 0; i < NUMCORESACTIVE; i++) {
2943           int * local_tbl = (int *)((void *)gccachesamplingtbl_r
2944                   +page_num*sizeof(float)*i);
2945           int freq = local_tbl[page_index];
2946           // TODO
2947           // check the freqency, decide if this page is hot for the core
2948           if(hotfreq < freq) {
2949                 hotfreq = freq;
2950                 hotestcore = i;
2951           }
2952         }
2953         // TODO
2954         // Decide the cache strategy for this page
2955         // If decide to adapt a new cache strategy, write into the shared block of
2956         // the gcsharedsamplingtbl. The mem recording information that has been 
2957         // written is enough to hold the information.
2958         // Format: page start va + cache strategy(hfh/(host core+[x,y]))
2959         if(hotfreq == 0) {
2960           // this page has not been accessed, do not change its cache policy
2961           continue;
2962         } else {
2963           // locally cache the page in the hotest core
2964           // NOTE: (x,y) should be changed to (x+1, y+1)!!!
2965           policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
2966           policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
2967           policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
2968           *tmp_p = page_index;
2969           tmp_p++;
2970           *tmp_p = policy.word;
2971           tmp_p++;
2972           numchanged++;
2973         }
2974   }
2975
2976   return numchanged;
2977 } // int cacheAdapt_policy_hotest()
2978
2979 #define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  50
2980 // cache the page on the core that accesses it the most if that core accesses 
2981 // it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total.  Otherwise,
2982 // h4h the page.
2983 int cacheAdapt_policy_dominate(){
2984   unsigned int page_index = 0;
2985   VA page_sva = 0;
2986   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
2987   int numchanged = 0;
2988   int * tmp_p = gccachepolicytbl+1;
2989   for(page_index = 0; page_index < page_num; page_index++) {
2990         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
2991         bamboo_cache_policy_t policy = {0};
2992         int hotestcore = 0;
2993         int totalfreq = 0;
2994         int hotfreq = 0;
2995         
2996         for(int i = 0; i < NUMCORESACTIVE; i++) {
2997           int * local_tbl = (int *)((void *)gccachesamplingtbl_r
2998                   +page_num*sizeof(float)*i);
2999           int freq = local_tbl[page_index];
3000           totalfreq += freq;
3001           // TODO
3002           // check the freqency, decide if this page is hot for the core
3003           if(hotfreq < freq) {
3004                 hotfreq = freq;
3005                 hotestcore = i;
3006           }
3007         }
3008         // Decide the cache strategy for this page
3009         // If decide to adapt a new cache strategy, write into the shared block of
3010         // the gcpolicytbl 
3011         // Format: page start va + cache policy
3012         if(hotfreq == 0) {
3013           // this page has not been accessed, do not change its cache policy
3014           continue;
3015         }
3016         totalfreq = (totalfreq*GC_CACHE_ADAPT_DOMINATE_THRESHOLD)/100;
3017         if(hotfreq < totalfreq) {
3018           // use hfh
3019           policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3020         } else {
3021           // locally cache the page in the hotest core
3022           // NOTE: (x,y) should be changed to (x+1, y+1)!!!
3023           policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
3024           policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
3025           policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
3026         }
3027         *tmp_p = page_index;
3028         tmp_p++;
3029         *tmp_p = policy.word;
3030         tmp_p++;
3031         numchanged++;
3032   }
3033
3034   return numchanged;
3035 } // int cacheAdapt_policy_dominate()
3036
3037 #define GC_CACHE_ADAPT_OVERLOAD_THRESHOLD 1000
3038
3039 void gc_quicksort(int *array, 
3040                       int left,
3041                                   int right,
3042                                   int offset) {
3043   int pivot = 0;;
3044   int leftIdx = left;
3045   int rightIdx = right;
3046   if((right-left+1) >= 1) {
3047         pivot = (left+right)/2;
3048         while((leftIdx <= pivot) && (rightIdx >= pivot)) {
3049           int pivotValue = array[pivot*3-offset];
3050           while((array[leftIdx*3-offset] < pivotValue) && (leftIdx <= pivot)) {
3051                 leftIdx++;
3052           }
3053           while((array[rightIdx*3-offset] > pivotValue) && (rightIdx >= pivot)) {
3054                 rightIdx--;
3055           }
3056           // swap [leftIdx] & [rightIdx]
3057           for(int k = 0; k < 3; k++) {
3058                 int tmp = array[3*rightIdx-k];
3059                 array[3*rightIdx-k] = array[3*leftIdx-k];
3060                 array[3*leftIdx-k] = tmp;
3061           }
3062           leftIdx++;
3063           rightIdx--;
3064           if((leftIdx-1) == pivot) {
3065                 pivot = rightIdx = rightIdx + 1;
3066           } else if((leftIdx+1) == pivot) {
3067                 pivot = leftIdx = leftIdx-1;
3068           }
3069         }
3070         gc_quicksort(array, left, pivot-1, offset);
3071         gc_quicksort(array, pivot+1, right, offset);
3072   }
3073   return;
3074 } // void gc_quicksort(...)
3075
3076 // Every page cached on the core that accesses it the most. 
3077 // Check to see if any core's pages total more accesses than threshold 
3078 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
3079 // most remote accesses and hash for home them until we get below 
3080 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD
3081 int cacheAdapt_policy_overload(){
3082   unsigned int page_index = 0;
3083   VA page_sva = 0;
3084   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
3085   int numchanged = 0;
3086   int * tmp_p = gccachepolicytbl+1;
3087   int workload[NUMCORESACTIVE];
3088   memset(workload, 0, NUMCORESACTIVE*sizeof(int));
3089   int core2heavypages[NUMCORESACTIVE][page_num*3+1];
3090   memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
3091   for(page_index = 0; page_index < page_num; page_index++) {
3092         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
3093         bamboo_cache_policy_t policy = {0};
3094         int hotestcore = 0;
3095         int totalfreq = 0;
3096         int hotfreq = 0;
3097         
3098         for(int i = 0; i < NUMCORESACTIVE; i++) {
3099           int * local_tbl = (int *)((void *)gccachesamplingtbl_r
3100                   +page_num*sizeof(float)*i);
3101           int freq = local_tbl[page_index];
3102           totalfreq += freq;
3103           // TODO
3104           // check the freqency, decide if this page is hot for the core
3105           if(hotfreq < freq) {
3106                 hotfreq = freq;
3107                 hotestcore = i;
3108           }
3109         }
3110         // TODO
3111         // Decide the cache strategy for this page
3112         // If decide to adapt a new cache strategy, write into the shared block of
3113         // the gcsharedsamplingtbl. The mem recording information that has been 
3114         // written is enough to hold the information.
3115         // Format: page start va + cache strategy(hfh/(host core+[x,y]))
3116         if(hotfreq == 0) {
3117           // this page has not been accessed, do not change its cache policy
3118           continue;
3119         }
3120         // locally cache the page in the hotest core
3121         // NOTE: (x,y) should be changed to (x+1, y+1)!!!
3122         policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
3123         policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
3124         policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
3125         *tmp_p = page_index;
3126         tmp_p++;
3127         *tmp_p = policy.word;
3128         tmp_p++;
3129         numchanged++;
3130         workload[hotestcore] += totalfreq;
3131         // insert into core2heavypages using quicksort
3132         int remoteaccess = totalfreq - hotfreq;
3133         int index = core2heavypages[hotestcore][0];
3134         core2heavypages[hotestcore][3*index+3] = remoteaccess;
3135         core2heavypages[hotestcore][3*index+2] = totalfreq;
3136         core2heavypages[hotestcore][3*index+1] = tmp_p-1;
3137         core2heavypages[hotestcore][0]++;
3138   }
3139
3140   // Check the workload of each core
3141   for(int i = 0; i < NUMCORESACTIVE; i++) {
3142         int j = 1;
3143         int index = core2heavypages[i][0];
3144         if(workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) {
3145           // sort according to the remoteaccess
3146           gc_quicksort(&core2heavypages[i][0], 1, index, 0);
3147           while((workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) && (j<index*3)) {
3148                 // hfh those pages with more remote accesses 
3149                 bamboo_cache_policy_t policy = {0};
3150                 policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3151                 *((int*)core2heavypages[i][j]) = policy.word;
3152                 workload[i] -= core2heavypages[i][j+1];
3153                 j += 3;
3154           }
3155         }
3156   }
3157
3158   return numchanged;
3159 } // int cacheAdapt_policy_overload()
3160
3161 #define GC_CACHE_ADAPT_ACCESS_THRESHOLD 70
3162 #define GC_CACHE_ADAPT_CROWD_THRESHOLD  10
3163 // Every page cached on the core that accesses it the most. 
3164 // Check to see if any core's pages total more accesses than threshold 
3165 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  If so, find the pages with the 
3166 // most remote accesses and hash for home them until we get below 
3167 // GC_CACHE_ADAPT_OVERLOAD_THRESHOLD.  
3168 // Sort pages based on activity.... 
3169 // If more then GC_CACHE_ADAPT_ACCESS_THRESHOLD% of the accesses for a
3170 // core's pages are from more than GC_CACHE_ADAPT_CROWD_THRESHOLD pages, 
3171 // then start hfh these pages(selecting the ones with the most remote 
3172 // accesses first or fewest local accesses) until we get below 
3173 // GC_CACHE_ADAPT_CROWD_THRESHOLD pages.
3174 int cacheAdapt_policy_crowd(){
3175   unsigned int page_index = 0;
3176   VA page_sva = 0;
3177   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) / (BAMBOO_PAGE_SIZE);
3178   int numchanged = 0;
3179   int * tmp_p = gccachepolicytbl+1;
3180   int workload[NUMCORESACTIVE];
3181   memset(workload, 0, NUMCORESACTIVE*sizeof(int));
3182   int core2heavypages[NUMCORESACTIVE][page_num*3+1];
3183   memset(core2heavypages, 0, sizeof(int)*(page_num*3+1)*NUMCORESACTIVE);
3184   for(page_index = 0; page_index < page_num; page_index++) {
3185         page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
3186         bamboo_cache_policy_t policy = {0};
3187         int hotestcore = 0;
3188         int totalfreq = 0;
3189         int hotfreq = 0;
3190         
3191         for(int i = 0; i < NUMCORESACTIVE; i++) {
3192           int * local_tbl = (int *)((void *)gccachesamplingtbl_r
3193                   +page_num*sizeof(float)*i);
3194           int freq = local_tbl[page_index];
3195           totalfreq += freq;
3196           // TODO
3197           // check the freqency, decide if this page is hot for the core
3198           if(hotfreq < freq) {
3199                 hotfreq = freq;
3200                 hotestcore = i;
3201           }
3202         }
3203         // TODO
3204         // Decide the cache strategy for this page
3205         // If decide to adapt a new cache strategy, write into the shared block of
3206         // the gcsharedsamplingtbl. The mem recording information that has been 
3207         // written is enough to hold the information.
3208         // Format: page start va + cache strategy(hfh/(host core+[x,y]))
3209         if(hotfreq == 0) {
3210           // this page has not been accessed, do not change its cache policy
3211           continue;
3212         }
3213         // locally cache the page in the hotest core
3214         // NOTE: (x,y) should be changed to (x+1, y+1)!!!
3215         policy.cache_mode = BAMBOO_CACHE_MODE_COORDS;
3216         policy.lotar_x = bamboo_cpu2coords[2*hotestcore]+1;
3217         policy.lotar_y = bamboo_cpu2coords[2*hotestcore+1]+1;
3218         *tmp_p = page_index;
3219         tmp_p++;
3220         *tmp_p = policy.word;
3221         tmp_p++;
3222         numchanged++;
3223         workload[hotestcore] += totalfreq;
3224         // insert into core2heavypages using quicksort
3225         int remoteaccess = totalfreq - hotfreq;
3226         int index = core2heavypages[hotestcore][0];
3227         core2heavypages[hotestcore][3*index+3] = remoteaccess;
3228         core2heavypages[hotestcore][3*index+2] = totalfreq;
3229         core2heavypages[hotestcore][3*index+1] = tmp_p-1;
3230         core2heavypages[hotestcore][0]++;
3231   }
3232
3233   // Check the workload of each core
3234   for(int i = 0; i < NUMCORESACTIVE; i++) {
3235         int j = 1;
3236         int index = core2heavypages[i][0];
3237         if(workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) {
3238           // sort according to the remote access
3239           gc_quicksort(&core2heavypages[i][0], 1, index, 0);
3240           while((workload[i] > GC_CACHE_ADAPT_OVERLOAD_THRESHOLD) && (j<index*3)) {
3241                 // hfh those pages with more remote accesses 
3242                 bamboo_cache_policy_t policy = {0};
3243                 policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3244                 *((int*)core2heavypages[i][j]) = policy.word;
3245                 workload[i] -= core2heavypages[i][j+1];
3246                 j += 3;
3247           }
3248         }
3249         
3250         // Check if the accesses are crowded on few pages
3251         // sort according to the total access
3252         gc_quicksort(&core2heavypages[i][0], j/3+1, index, 1);
3253         int threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
3254         int num_crowded = 0;
3255         int t_workload = 0;
3256         for(;(num_crowded<GC_CACHE_ADAPT_CROWD_THRESHOLD)
3257                 &&(num_crowded<(index-j/3)); num_crowded++) {
3258           t_workload += core2heavypages[i][j+num_crowded*3+1];
3259         }
3260         // num_crowded <= GC_CACHE_ADAPT_CROWD_THRESHOLD and if there are enough 
3261         // items, it is always == GC_CACHE_ADAPT_CROWD_THRESHOLD
3262         if(t_workload > threshold) {
3263 inner_crowd:
3264           // need to hfh these pages
3265           // sort the pages according to remote access
3266           gc_quicksort(&core2heavypages[i][0], j/3+1, j/3+num_crowded, 0);
3267           while((num_crowded--) && (j < index*3)) {
3268                 // h4h those pages with more remote accesses 
3269                 bamboo_cache_policy_t policy = {0};
3270                 policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
3271                 *((int*)core2heavypages[i][j]) = policy.word;
3272                 workload[i] -= core2heavypages[i][j+1];
3273                 t_workload -= core2heavypages[i][j+1];
3274                 if((j/3+GC_CACHE_ADAPT_CROWD_THRESHOLD) < index) {
3275                   t_workload += 
3276                         core2heavypages[i][j+GC_CACHE_ADAPT_CROWD_THRESHOLD*3+1];
3277                 }
3278                 j += 3;
3279                 threshold = GC_CACHE_ADAPT_ACCESS_THRESHOLD*workload[i]/100;
3280                 if(t_workload <= threshold) {
3281                   break;
3282                 }
3283           }
3284           if((j < index*3) && (t_workload > threshold)) {
3285                 num_crowded = ((index-j/3) > GC_CACHE_ADAPT_CROWD_THRESHOLD) ?
3286                   (GC_CACHE_ADAPT_CROWD_THRESHOLD) : (index-j/3);
3287                 goto inner_crowd;
3288           }
3289         }
3290   }
3291
3292   return numchanged;
3293 } // int cacheAdapt_policy_overload()
3294
3295 void cacheAdapt_master() {
3296   // check the statistic data
3297   // for each page, decide the new cache strategy
3298   //int numchanged = cacheAdapt_policy_h4h();
3299   //int numchanged = cacheAdapt_policy_local();
3300   //int numchanged = cacheAdapt_policy_hotest();
3301   //int numchanged = cacheAdapt_policy_dominate();
3302   int numchanged = cacheAdapt_policy_overload();
3303   //int numchanged = cacheAdapt_policy_crowd();
3304   *gccachepolicytbl = numchanged;
3305   // TODO
3306   //if(numchanged > 0) tprintf("=================\n");
3307 }
3308
3309 // adapt the cache strategy for the mutator
3310 void cacheAdapt_mutator() {
3311   int numchanged = *gccachepolicytbl;
3312   // check the changes and adapt them
3313   int * tmp_p = gccachepolicytbl+1;
3314   while(numchanged--) {
3315         // read out the policy
3316         int page_index = *tmp_p;
3317         bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p+1));
3318         // TODO
3319         /*if(BAMBOO_NUM_OF_CORE == 0) {
3320           tprintf("va: %x, policy: %d (%d,%d) \n", 
3321                   (int)(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva), policy.cache_mode,
3322                   policy.lotar_x, policy.lotar_y);
3323         }*/
3324         // adapt the policy
3325         bamboo_adapt_cache_policy(page_index*(BAMBOO_PAGE_SIZE)+gcbaseva, 
3326                 policy, BAMBOO_PAGE_SIZE);
3327
3328         tmp_p += 2;
3329   }
3330   //if(BAMBOO_NUM_OF_CORE == 0) tprintf("=================\n"); // TODO
3331 }
3332 #endif // GC_CACHE_ADAPT
3333
3334 inline void gc_collect(struct garbagelist * stackptr) {
3335   // inform the master that this core is at a gc safe point and is ready to 
3336   // do gc
3337   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs, 
3338           self_numreceiveobjs, false);
3339
3340   // core collector routine
3341   while(true) {
3342     if(INITPHASE == gcphase) {
3343       break;
3344     }
3345   }
3346 #ifdef RAWPATH // TODO GC_DEBUG
3347   printf("(%X,%X) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
3348 #endif
3349   initGC();
3350 #ifdef GC_CACHE_ADAPT
3351   // prepare for cache adaption:
3352   cacheAdapt_gc(true);
3353 #endif // GC_CACHE_ADAPT
3354   //send init finish msg to core coordinator
3355   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
3356
3357   while(true) {
3358     if(MARKPHASE == gcphase) {
3359       break;
3360     }
3361   }
3362 #ifdef RAWPATH // TODO GC_DEBUG
3363   printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(), 
3364              udn_tile_coord_y());
3365 #endif
3366   mark(true, stackptr);
3367 #ifdef RAWPATH // TODO GC_DEBUG
3368   printf("(%x,%x) Finish mark phase, start compact phase\n", 
3369              udn_tile_coord_x(), udn_tile_coord_y());
3370 #endif
3371   compact();
3372 #ifdef RAWPATH // TODO GC_DEBUG
3373   printf("(%x,%x) Finish compact phase\n", udn_tile_coord_x(),
3374              udn_tile_coord_y());
3375 #endif
3376
3377   while(true) {
3378         if(MAPPHASE == gcphase) {
3379           break;
3380         }
3381   }
3382 #ifdef RAWPATH // TODO GC_DEBUG
3383   printf("(%x,%x) Start map phase\n", udn_tile_coord_x(), 
3384              udn_tile_coord_y());
3385 #endif
3386   transmappinginfo();
3387 #ifdef RAWPATH // TODO GC_DEBUG
3388   printf("(%x,%x) Finish map phase\n", udn_tile_coord_x(),
3389              udn_tile_coord_y());
3390 #endif
3391
3392   while(true) {
3393     if(FLUSHPHASE == gcphase) {
3394       break;
3395     }
3396   }
3397 #ifdef RAWPATH // TODO GC_DEBUG
3398   printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(), 
3399              udn_tile_coord_y());
3400 #endif
3401 #ifdef GC_PROFILE
3402   // send the num of obj/liveobj/forwardobj to the startupcore
3403   if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
3404         send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj, 
3405                 gc_num_liveobj, gc_num_forwardobj, false);
3406   }
3407   gc_num_obj = 0;
3408 #endif // GC_PROFLIE
3409   flush(stackptr);
3410 #ifdef RAWPATH // TODO GC_DEBUG
3411   printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(),
3412              udn_tile_coord_y());
3413 #endif
3414
3415 #ifdef GC_CACHE_ADAPT
3416   while(true) {
3417     if(PREFINISHPHASE == gcphase) {
3418       break;
3419     }
3420   }
3421 #ifdef RAWPATH // TODO GC_DEBUG
3422   printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(), 
3423              udn_tile_coord_y());
3424 #endif
3425   // cache adapt phase
3426   cacheAdapt_mutator();
3427   cacheAdapt_gc(false);
3428   //send init finish msg to core coordinator
3429   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
3430 #ifdef RAWPATH // TODO GC_DEBUG
3431   printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
3432              udn_tile_coord_y());
3433 #endif
3434 #endif // GC_CACHE_ADAPT
3435
3436   while(true) {
3437     if(FINISHPHASE == gcphase) {
3438       break;
3439     }
3440   }
3441 #ifdef RAWPATH // TODO GC_DEBUG
3442   printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
3443 #endif
3444 } // void gc_collect(struct garbagelist * stackptr)
3445
3446 inline void gc_nocollect(struct garbagelist * stackptr) {
3447   // inform the master that this core is at a gc safe point and is ready to 
3448   // do gc
3449   send_msg_4(STARTUPCORE, GCFINISHPRE, BAMBOO_NUM_OF_CORE, self_numsendobjs, 
3450           self_numreceiveobjs, false);
3451   
3452   while(true) {
3453     if(INITPHASE == gcphase) {
3454       break;
3455     }
3456   }
3457 #ifdef RAWPATH // TODO GC_DEBUG
3458   printf("(%x,%x) Do initGC\n", udn_tile_coord_x(), udn_tile_coord_y());
3459 #endif
3460   initGC();
3461 #ifdef GC_CACHE_ADAPT
3462   // prepare for cache adaption:
3463   cacheAdapt_gc(true);
3464 #endif // GC_CACHE_ADAPT
3465   //send init finish msg to core coordinator
3466   send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
3467
3468   while(true) {
3469     if(MARKPHASE == gcphase) {
3470       break;
3471     }
3472   }
3473 #ifdef RAWPATH // TODO GC_DEBUG
3474   printf("(%x,%x) Start mark phase\n", udn_tile_coord_x(), 
3475              udn_tile_coord_y());
3476 #endif
3477   mark(true, stackptr);
3478 #ifdef RAWPATH // TODO GC_DEBUG
3479   printf("(%x,%x) Finish mark phase, wait for flush\n", 
3480              udn_tile_coord_x(), udn_tile_coord_y());
3481 #endif
3482
3483   // non-gc core collector routine
3484   while(true) {
3485     if(FLUSHPHASE == gcphase) {
3486       break;
3487     }
3488   }
3489 #ifdef RAWPATH // TODO GC_DEBUG
3490   printf("(%x,%x) Start flush phase\n", udn_tile_coord_x(), 
3491              udn_tile_coord_y());
3492 #endif
3493 #ifdef GC_PROFILE
3494   if(STARTUPCORE != BAMBOO_NUM_OF_CORE) {
3495         send_msg_4(STARTUPCORE, GCPROFILES, gc_num_obj, 
3496                 gc_num_liveobj, gc_num_forwardobj, false);
3497   }
3498   gc_num_obj = 0;
3499 #endif // GC_PROFLIE
3500   flush(stackptr);
3501 #ifdef RAWPATH // TODO GC_DEBUG
3502   printf("(%x,%x) Finish flush phase\n", udn_tile_coord_x(), 
3503              udn_tile_coord_y());
3504 #endif
3505
3506 #ifdef GC_CACHE_ADAPT
3507   while(true) {
3508     if(PREFINISHPHASE == gcphase) {
3509       break;
3510     }
3511   }
3512 #ifdef RAWPATH // TODO GC_DEBUG
3513   printf("(%x,%x) Start prefinish phase\n", udn_tile_coord_x(), 
3514              udn_tile_coord_y());
3515 #endif
3516   // cache adapt phase
3517   cacheAdapt_mutator();
3518   cacheAdapt_gc(false);
3519   //send init finish msg to core coordinator
3520   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE, false);
3521 #ifdef RAWPATH // TODO GC_DEBUG
3522   printf("(%x,%x) Finish prefinish phase\n", udn_tile_coord_x(),
3523              udn_tile_coord_y());
3524 #endif
3525 #endif // GC_CACHE_ADAPT
3526
3527   while(true) {
3528     if(FINISHPHASE == gcphase) {
3529       break;
3530     }
3531   }
3532 #ifdef RAWPATH // TODO GC_DEBUG
3533   printf("(%x,%x) Finish gc!\n", udn_tile_coord_x(), udn_tile_coord_y());
3534 #endif
3535 } // void gc_collect(struct garbagelist * stackptr)
3536
3537 inline void gc_master(struct garbagelist * stackptr) {
3538
3539   gcphase = INITPHASE;
3540   int i = 0;
3541   waitconfirm = false;
3542   numconfirm = 0;
3543   initGC();
3544
3545   // Note: all cores need to init gc including non-gc cores
3546   for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; i++) {
3547         // send GC init messages to all cores
3548         send_msg_1(i, GCSTARTINIT, false);
3549   }
3550   bool isfirst = true;
3551   bool allStall = false;
3552
3553 #ifdef GC_CACHE_ADAPT
3554   // prepare for cache adaption:
3555   cacheAdapt_gc(true);
3556 #endif // GC_CACHE_ADAPT
3557
3558 #ifdef RAWPATH // TODO GC_DEBUG
3559   printf("(%x,%x) Check core status \n", udn_tile_coord_x(), 
3560                  udn_tile_coord_y());
3561 #endif
3562
3563   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3564   while(true) {
3565         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3566         if(gc_checkAllCoreStatus_I()) {
3567           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3568           break;
3569         }
3570         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3571   }
3572 #ifdef GC_PROFILE
3573   gc_profileItem();
3574 #endif
3575 #ifdef RAWPATH // TODO GC_DEBUG
3576   printf("(%x,%x) Start mark phase \n", udn_tile_coord_x(), 
3577                  udn_tile_coord_y());
3578 #endif
3579   // all cores have finished compacting
3580   // restore the gcstatus of all cores
3581   // Note: all cores have to do mark including non-gc cores
3582   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3583   for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
3584         gccorestatus[i] = 1;
3585         // send GC start messages to all cores
3586         send_msg_1(i, GCSTART, false);
3587   }
3588
3589   gcphase = MARKPHASE;
3590   // mark phase
3591   while(MARKPHASE == gcphase) {
3592         mark(isfirst, stackptr);
3593         if(isfirst) {
3594           isfirst = false;
3595         }
3596
3597         // check gcstatus
3598         checkMarkStatue();
3599   }   // while(MARKPHASE == gcphase)
3600   // send msgs to all cores requiring large objs info
3601   // Note: only need to ask gc cores, non-gc cores do not host any objs
3602   numconfirm = NUMCORES4GC - 1;
3603   for(i = 1; i < NUMCORES4GC; ++i) {
3604         send_msg_1(i, GCLOBJREQUEST, false);
3605   }
3606   gcloads[BAMBOO_NUM_OF_CORE] = gccurr_heaptop;
3607   while(true) {
3608         if(numconfirm==0) {
3609           break;
3610         }
3611   }   // wait for responses
3612   // check the heaptop
3613   if(gcheaptop < gcmarkedptrbound) {
3614         gcheaptop = gcmarkedptrbound;
3615   }
3616 #ifdef GC_PROFILE
3617   gc_profileItem();
3618 #endif
3619 #ifdef RAWPATH // TODO GC_DEBUG
3620   printf("(%x,%x) prepare to cache large objs \n", udn_tile_coord_x(),
3621                  udn_tile_coord_y());
3622   //dumpSMem();
3623 #endif
3624   // cache all large objs
3625   if(!cacheLObjs()) {
3626         // no enough space to cache large objs
3627         BAMBOO_EXIT(0xb107);
3628   }
3629   // predict number of blocks to fill for each core
3630   int tmpheaptop = 0;
3631   int numpbc = loadbalance(&tmpheaptop);
3632   // TODO
3633   numpbc = (BAMBOO_SHARED_MEM_SIZE)/(BAMBOO_SMEM_SIZE);
3634 #ifdef RAWPATH // TODO GC_DEBUG
3635   printf("(%x,%x) mark phase finished \n", udn_tile_coord_x(), 
3636                  udn_tile_coord_y());
3637   //dumpSMem();
3638 #endif
3639   //int tmptopptr = 0;
3640   //BASEPTR(gctopcore, 0, &tmptopptr);
3641   // TODO
3642   //tmptopptr = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
3643   tmpheaptop = gcbaseva + (BAMBOO_SHARED_MEM_SIZE);
3644 #ifdef DEBUG
3645   BAMBOO_DEBUGPRINT(0xabab);
3646   BAMBOO_DEBUGPRINT_REG(tmptopptr);
3647 #endif
3648   for(i = 0; i < NUMCORES4GC; ++i) {
3649         int tmpcoreptr = 0;
3650         BASEPTR(i, numpbc, &tmpcoreptr);
3651         //send start compact messages to all cores
3652         //TODO bug here, do not know if the direction is positive or negtive?
3653         if (tmpcoreptr < tmpheaptop /*tmptopptr*/) {
3654           gcstopblock[i] = numpbc + 1;
3655           if(i != STARTUPCORE) {
3656                 send_msg_2(i, GCSTARTCOMPACT, numpbc+1, false);
3657           } else {
3658                 gcblock2fill = numpbc+1;
3659           }                         // if(i != STARTUPCORE)
3660         } else {
3661           gcstopblock[i] = numpbc;
3662           if(i != STARTUPCORE) {
3663                 send_msg_2(i, GCSTARTCOMPACT, numpbc, false);
3664           } else {
3665                 gcblock2fill = numpbc;
3666           }  // if(i != STARTUPCORE)
3667         }
3668 #ifdef DEBUG
3669         BAMBOO_DEBUGPRINT(0xf000+i);
3670         BAMBOO_DEBUGPRINT_REG(tmpcoreptr);
3671         BAMBOO_DEBUGPRINT_REG(gcstopblock[i]);
3672 #endif
3673         // init some data strutures for compact phase
3674         gcloads[i] = 0;
3675         gcfilledblocks[i] = 0;
3676         gcrequiredmems[i] = 0;
3677   }
3678
3679   BAMBOO_CACHE_MF();
3680
3681 #ifdef GC_PROFILE
3682   gc_profileItem();
3683 #endif
3684
3685   // compact phase
3686   bool finalcompact = false;
3687   // initialize pointers for comapcting
3688   struct moveHelper * orig =
3689         (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
3690   struct moveHelper * to =
3691         (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
3692   initOrig_Dst(orig, to);
3693   int filledblocks = 0;
3694   INTPTR heaptopptr = 0;
3695   bool finishcompact = false;
3696   bool iscontinue = true;
3697   bool localcompact = true;
3698   while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
3699         if((!finishcompact) && iscontinue) {
3700 #ifdef DEBUG
3701           BAMBOO_DEBUGPRINT(0xe001);
3702           BAMBOO_DEBUGPRINT_REG(numpbc);
3703           BAMBOO_DEBUGPRINT_REG(gcblock2fill);
3704 #endif
3705           finishcompact = compacthelper(orig, to, &filledblocks,
3706                                                                         &heaptopptr, &localcompact);
3707 #ifdef DEBUG
3708           BAMBOO_DEBUGPRINT(0xe002);
3709           BAMBOO_DEBUGPRINT_REG(finishcompact);
3710           BAMBOO_DEBUGPRINT_REG(gctomove);
3711           BAMBOO_DEBUGPRINT_REG(gcrequiredmems[0]);
3712           BAMBOO_DEBUGPRINT_REG(gcfilledblocks[0]);
3713           BAMBOO_DEBUGPRINT_REG(gcstopblock[0]);
3714 #endif
3715         }
3716
3717         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3718         if(gc_checkCoreStatus_I()) {
3719           // all cores have finished compacting
3720           // restore the gcstatus of all cores
3721           for(i = 0; i < NUMCORES4GC; ++i) {
3722                 gccorestatus[i] = 1;
3723           }
3724           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3725           break;
3726         } else {
3727           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3728           // check if there are spare mem for pending move requires
3729           if(COMPACTPHASE == gcphase) {
3730 #ifdef DEBUG
3731                 BAMBOO_DEBUGPRINT(0xe003);
3732 #endif
3733                 resolvePendingMoveRequest();
3734 #ifdef DEBUG
3735                 BAMBOO_DEBUGPRINT_REG(gctomove);
3736 #endif
3737           } else {
3738 #ifdef DEBUG
3739                 BAMBOO_DEBUGPRINT(0xe004);
3740 #endif
3741                 compact2Heaptop();
3742           }
3743         }   // if(gc_checkCoreStatus_I()) else ...
3744
3745         if(gctomove) {
3746 #ifdef DEBUG
3747           BAMBOO_DEBUGPRINT(0xe005);
3748           BAMBOO_DEBUGPRINT_REG(gcmovestartaddr);
3749           BAMBOO_DEBUGPRINT_REG(gcblock2fill);
3750           BAMBOO_DEBUGPRINT_REG(gctomove);
3751 #endif
3752           to->ptr = gcmovestartaddr;
3753           to->numblocks = gcblock2fill - 1;
3754           to->bound = (to->numblocks==0) ?
3755                                   BAMBOO_SMEM_SIZE_L :
3756                                   BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
3757           BASEPTR(gcdstcore, to->numblocks, &(to->base));
3758           to->offset = to->ptr - to->base;
3759           to->top = (to->numblocks==0) ?
3760                                 (to->offset) : (to->bound-BAMBOO_SMEM_SIZE+to->offset);
3761           to->base = to->ptr;
3762           to->offset = BAMBOO_CACHE_LINE_SIZE;
3763           to->ptr += to->offset;                         // for header
3764           to->top += to->offset;
3765           if(gcdstcore == BAMBOO_NUM_OF_CORE) {
3766                 localcompact = true;
3767           } else {
3768                 localcompact = false;
3769           }
3770           gctomove = false;
3771           iscontinue = true;
3772         } else if(!finishcompact) {
3773           // still pending
3774           iscontinue = false;
3775         }  // if(gctomove)
3776   }  // while(COMPACTPHASE == gcphase)
3777 #ifdef GC_PROFILE
3778   gc_profileItem();
3779 #endif
3780 #ifdef RAWPATH // TODO GC_DEBUG
3781   printf("(%x,%x) prepare to move large objs \n", udn_tile_coord_x(),
3782                  udn_tile_coord_y());
3783   //dumpSMem();
3784 #endif
3785   // move largeObjs
3786   moveLObjs();
3787 #ifdef RAWPATH // TODO GC_DEBUG
3788   printf("(%x,%x) compact phase finished \n", udn_tile_coord_x(), 
3789                  udn_tile_coord_y());
3790   //dumpSMem();
3791 #endif
3792   RUNFREE(orig);
3793   RUNFREE(to);
3794   orig = to = NULL;
3795
3796   gcphase = MAPPHASE;
3797   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3798   // Note: all cores should flush their runtime data including non-gc
3799   //       cores
3800   for(i = 1; i < NUMCORES4GC; ++i) {
3801         // send start flush messages to all cores
3802         gccorestatus[i] = 1;
3803         send_msg_1(i, GCSTARTMAPINFO, false);
3804   }
3805 #ifdef GC_PROFILE
3806   gc_profileItem();
3807 #endif
3808 #ifdef RAWPATH // TODO GC_DEBUG
3809   printf("(%x,%x) Start map phase \n", udn_tile_coord_x(), 
3810                  udn_tile_coord_y());
3811 #endif
3812   // mapinto phase
3813   transmappinginfo();
3814 #ifdef RAWPATH // TODO GC_DEBUG
3815   printf("(%x,%x) Finish map phase \n", udn_tile_coord_x(), 
3816                  udn_tile_coord_y());
3817 #endif
3818   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3819   while(MAPPHASE == gcphase) {
3820         // check the status of all cores
3821         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3822         if(gc_checkCoreStatus_I()) {
3823           // all cores have finished sending mapping info 
3824           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3825           break;
3826         }
3827         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3828   }  // while(MAPPHASE == gcphase)
3829
3830   gcphase = FLUSHPHASE;
3831   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3832   // Note: all cores should flush their runtime data including non-gc
3833   //       cores
3834   for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
3835         // send start flush messages to all cores
3836         gccorestatus[i] = 1;
3837         send_msg_1(i, GCSTARTFLUSH, false);
3838   }
3839 #ifdef GC_PROFILE
3840   gc_profileItem();
3841 #endif
3842 #ifdef RAWPATH // TODO GC_DEBUG
3843   printf("(%x,%x) Start flush phase \n", udn_tile_coord_x(), 
3844                  udn_tile_coord_y());
3845 #endif
3846   // flush phase
3847   flush(stackptr);
3848   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3849   while(FLUSHPHASE == gcphase) {
3850         // check the status of all cores
3851         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3852         if(gc_checkAllCoreStatus_I()) {
3853           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3854           break;
3855         }
3856         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3857   }  // while(FLUSHPHASE == gcphase)
3858 #ifdef RAWPATH // TODO GC_DEBUG
3859   printf("(%x,%x) Finish flush phase \n", udn_tile_coord_x(), 
3860                  udn_tile_coord_y());
3861 #endif
3862
3863 #ifdef GC_CACHE_ADAPT
3864   // now the master core need to decide the new cache strategy
3865   cacheAdapt_master();
3866
3867   gcphase = PREFINISHPHASE;
3868   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3869   // Note: all cores should flush their runtime data including non-gc
3870   //       cores
3871   for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
3872         // send start flush messages to all cores
3873         gccorestatus[i] = 1;
3874         send_msg_1(i, GCSTARTPREF, false);
3875   }
3876 #ifdef GC_PROFILE
3877   gc_profileItem();
3878 #endif
3879 #ifdef RAWPATH // TODO GC_DEBUG
3880   printf("(%x,%x) Start prefinish phase \n", udn_tile_coord_x(), 
3881                  udn_tile_coord_y());
3882 #endif
3883   // cache adapt phase
3884   cacheAdapt_mutator();
3885   cacheAdapt_gc(false);
3886   gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3887   while(PREFINISHPHASE == gcphase) {
3888         // check the status of all cores
3889         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3890         if(gc_checkAllCoreStatus_I()) {
3891           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3892           break;
3893         }
3894         BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3895   }  // while(PREFINISHPHASE == gcphase)
3896 #endif // GC_CACHE_ADAPT
3897
3898   gcphase = FINISHPHASE;
3899
3900   // invalidate all shared mem pointers
3901   // put it here as it takes time to inform all the other cores to
3902   // finish gc and it might cause problem when some core resumes
3903   // mutator earlier than the other cores
3904   bamboo_cur_msp = NULL;
3905   bamboo_smem_size = 0;
3906   bamboo_smem_zero_top = NULL;
3907   gcflag = false;
3908   gcprocessing = false;
3909
3910 #ifdef GC_PROFILE
3911   gc_profileEnd();
3912 #endif
3913   gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
3914   for(i = 1; i < NUMCORESACTIVE /*NUMCORES4GC*/; ++i) {
3915         // send gc finish messages to all cores
3916         send_msg_1(i, GCFINISH, false);
3917         gccorestatus[i] = 1;
3918   }
3919 #ifdef RAWPATH // TODO GC_DEBUG
3920   printf("(%x,%x) gc finished \n", udn_tile_coord_x(), 
3921                  udn_tile_coord_y());
3922   //dumpSMem();
3923 #endif
3924 } // void gc_master(struct garbagelist * stackptr)
3925
3926 inline bool gc(struct garbagelist * stackptr) {
3927   // check if do gc
3928   if(!gcflag) {
3929     gcprocessing = false;
3930     return false;
3931   }
3932
3933   // core coordinator routine
3934   if(0 == BAMBOO_NUM_OF_CORE) {
3935 #ifdef GC_DEBUG
3936     printf("(%x,%X) Check if can do gc or not\n", udn_tile_coord_x(),
3937                    udn_tile_coord_y());
3938 #endif
3939         bool isallstall = true;
3940         gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
3941         BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3942         int ti = 0;
3943         for(ti = 0; ti < NUMCORESACTIVE; ++ti) {
3944           if(gccorestatus[ti] != 0) {
3945                 isallstall = false;
3946                 break;
3947           }
3948         }
3949         if(!isallstall) {
3950           BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3951           // some of the cores are still executing the mutator and did not reach
3952           // some gc safe point, therefore it is not ready to do gc
3953           // in case that there are some pregc information msg lost, send a confirm
3954           // msg to the 'busy' core
3955           send_msg_1(ti, GCSTARTPRE, false);
3956           gcflag = true;
3957           return false;
3958         } else {
3959 #ifdef GC_PROFILE
3960     gc_profileStart();
3961 #endif
3962 pregccheck:
3963           //BAMBOO_ENTER_RUNTIME_MODE_FROM_CLIENT();
3964           gcnumsendobjs[0][BAMBOO_NUM_OF_CORE] = self_numsendobjs;
3965           gcnumreceiveobjs[0][BAMBOO_NUM_OF_CORE] = self_numreceiveobjs;
3966           int sumsendobj = 0;
3967 #ifdef DEBUG
3968           BAMBOO_DEBUGPRINT(0xec04);
3969 #endif
3970           for(int i = 0; i < NUMCORESACTIVE; ++i) {
3971                 sumsendobj += gcnumsendobjs[0][i];
3972 #ifdef DEBUG
3973                 BAMBOO_DEBUGPRINT(0xf000 + gcnumsendobjs[0][i]);
3974 #endif
3975           }  // for(i = 1; i < NUMCORESACTIVE; ++i)
3976 #ifdef DEBUG
3977           BAMBOO_DEBUGPRINT(0xec05);
3978           BAMBOO_DEBUGPRINT_REG(sumsendobj);
3979 #endif
3980           for(int i = 0; i < NUMCORESACTIVE; ++i) {
3981                 sumsendobj -= gcnumreceiveobjs[0][i];
3982 #ifdef DEBUG
3983                 BAMBOO_DEBUGPRINT(0xf000 + gcnumreceiveobjs[i]);
3984 #endif
3985           }  // for(i = 1; i < NUMCORESACTIVE; ++i)
3986 #ifdef DEBUG
3987           BAMBOO_DEBUGPRINT(0xec06);
3988           BAMBOO_DEBUGPRINT_REG(sumsendobj);
3989 #endif
3990           if(0 != sumsendobj) {
3991                 // there were still some msgs on the fly, wait until there 
3992                 // are some update pregc information coming and check it again
3993                 gcprecheck = false;
3994                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
3995                 while(true) {
3996                   if(gcprecheck) {
3997                         break;
3998                   }
3999                 }
4000                 goto pregccheck;
4001           } else {
4002                 BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
4003           }
4004         }
4005 #ifdef RAWPATH // TODO GC_DEBUG
4006     printf("(%x,%x) start gc! \n", udn_tile_coord_x(), udn_tile_coord_y());
4007     //dumpSMem();
4008 #endif
4009         // Zero out the remaining bamboo_cur_msp 
4010         // Only zero out the first 4 bytes of the remaining memory
4011         // Move the operation here because for the GC_CACHE_ADAPT version,
4012         // we need to make sure during the gcinit phase the shared heap is not 
4013         // touched. Otherwise, there would be problem when adapt the cache 
4014         // strategy.
4015         if((bamboo_cur_msp != 0) 
4016                 && (bamboo_smem_zero_top == bamboo_cur_msp) 
4017                 && (bamboo_smem_size > 0)) {
4018           *((int *)bamboo_cur_msp) = 0;
4019         }
4020 #ifdef GC_FLUSH_DTLB
4021         if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
4022           BAMBOO_CLEAN_DTLB();
4023           gc_num_flush_dtlb++;
4024         }
4025 #endif
4026 #ifdef GC_CACHE_ADAPT
4027     // disable the timer interrupt
4028     bamboo_mask_timer_intr();
4029     // get the sampling data 
4030     bamboo_output_dtlb_sampling();
4031 #endif // GC_CACHE_ADAPT
4032         gcprocessing = true;
4033         gc_master(stackptr);
4034   } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
4035         // Zero out the remaining bamboo_cur_msp 
4036         // Only zero out the first 4 bytes of the remaining memory
4037         // Move the operation here because for the GC_CACHE_ADAPT version,
4038         // we need to make sure during the gcinit phase the shared heap is not 
4039         // touched. Otherwise, there would be problem when adapt the cache 
4040         // strategy.
4041         if((bamboo_cur_msp != 0) 
4042                 && (bamboo_smem_zero_top == bamboo_cur_msp) 
4043                 && (bamboo_smem_size > 0)) {
4044           *((int *)bamboo_cur_msp) = 0;
4045         }
4046 #ifdef GC_FLUSH_DTLB
4047         if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
4048           BAMBOO_CLEAN_DTLB();
4049           gc_num_flush_dtlb++;
4050         }
4051 #endif
4052 #ifdef GC_CACHE_ADAPT
4053         // disable the timer interrupt
4054         bamboo_mask_timer_intr();
4055         if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
4056           // get the sampling data 
4057           bamboo_output_dtlb_sampling();
4058         }
4059 #endif // GC_CACHE_ADAPT
4060     gcprocessing = true;
4061     gc_collect(stackptr);
4062
4063     // invalidate all shared mem pointers
4064     bamboo_cur_msp = NULL;
4065     bamboo_smem_size = 0;
4066         bamboo_smem_zero_top = NULL;
4067     gcflag = false;
4068     gcprocessing = false;
4069   } else {
4070         // Zero out the remaining bamboo_cur_msp 
4071         // Only zero out the first 4 bytes of the remaining memory
4072         // Move the operation here because for the GC_CACHE_ADAPT version,
4073         // we need to make sure during the gcinit phase the shared heap is not 
4074         // touched. Otherwise, there would be problem when adapt the cache 
4075         // strategy.
4076         if((bamboo_cur_msp != 0) 
4077                 && (bamboo_smem_zero_top == bamboo_cur_msp) 
4078                 && (bamboo_smem_size > 0)) {
4079           *((int *)bamboo_cur_msp) = 0;
4080         }
4081 #ifdef GC_FLUSH_DTLB
4082         if(gc_num_flush_dtlb < GC_NUM_FLUSH_DTLB) {
4083           BAMBOO_CLEAN_DTLB();
4084           gc_num_flush_dtlb++;
4085         }
4086 #endif
4087 #ifdef GC_CACHE_ADAPT
4088         // disable the timer interrupt
4089         bamboo_mask_timer_intr();
4090         if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
4091           // get the sampling data 
4092           bamboo_output_dtlb_sampling();
4093         }
4094 #endif // GC_CACHE_ADAPT
4095     // not a gc core, should wait for gcfinish msg
4096     gcprocessing = true;
4097     gc_nocollect(stackptr);
4098
4099     // invalidate all shared mem pointers
4100     bamboo_cur_msp = NULL;
4101     bamboo_smem_size = 0;
4102     bamboo_smem_zero_top = NULL;
4103         gcflag = false;
4104     gcprocessing = false;
4105   }
4106 #ifdef GC_CACHE_ADAPT
4107   // reset the sampling arrays
4108   bamboo_dtlb_sampling_reset();
4109   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
4110         // zero out the gccachesamplingtbl
4111         BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
4112         BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,
4113                 size_cachesamplingtbl_local_r);
4114         if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
4115           BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
4116         }
4117   }
4118   // enable the timer interrupt
4119   bamboo_tile_timer_set_next_event(GC_TILE_TIMER_EVENT_SETTING); 
4120   bamboo_unmask_timer_intr();
4121 #endif // GC_CACHE_ADAPT
4122   return true;
4123 } // void gc(struct garbagelist * stackptr)
4124
4125 #ifdef GC_PROFILE
4126 inline void gc_profileStart(void) {
4127   if(!gc_infoOverflow) {
4128     GCInfo* gcInfo = RUNMALLOC(sizeof(struct gc_info));
4129     gc_infoArray[gc_infoIndex] = gcInfo;
4130     gcInfo->index = 1;
4131     gcInfo->time[0] = BAMBOO_GET_EXE_TIME();
4132   }
4133 }
4134
4135 inline void gc_profileItem(void) {
4136   if(!gc_infoOverflow) {
4137     GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
4138     gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME();
4139   }
4140 }
4141
4142 inline void gc_profileEnd(void) {
4143   if(!gc_infoOverflow) {
4144     GCInfo* gcInfo = gc_infoArray[gc_infoIndex];
4145     gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME();
4146         gcInfo->time[gcInfo->index++] = gc_num_livespace;
4147         gcInfo->time[gcInfo->index++] = gc_num_freespace;
4148         gcInfo->time[gcInfo->index++] = gc_num_lobj;
4149         gcInfo->time[gcInfo->index++] = gc_num_lobjspace;
4150         gcInfo->time[gcInfo->index++] = gc_num_obj;
4151         gcInfo->time[gcInfo->index++] = gc_num_liveobj;
4152         gcInfo->time[gcInfo->index++] = gc_num_forwardobj;
4153     gc_infoIndex++;
4154     if(gc_infoIndex == GCINFOLENGTH) {
4155       gc_infoOverflow = true;
4156       //taskInfoIndex = 0;
4157     }
4158   }
4159 }
4160
4161 // output the profiling data
4162 void gc_outputProfileData() {
4163 /*#ifdef USEIO
4164   int i,j;
4165   unsigned long long totalgc = 0;
4166
4167   //printf("Start Time, End Time, Duration\n");
4168   // output task related info
4169   for(i = 0; i < gc_infoIndex; i++) {
4170     GCInfo * gcInfo = gc_infoArray[i];
4171     unsigned long long tmp = 0;
4172     for(j = 0; j < gcInfo->index; j++) {
4173       printf("%lld(%lld), ", gcInfo->time[j], (gcInfo->time[j]-tmp));
4174       tmp = gcInfo->time[j];
4175     }
4176     tmp = (tmp-gcInfo->time[0]);
4177     printf(" ++ %lld \n", tmp);
4178     totalgc += tmp;
4179   }
4180
4181   if(gc_infoOverflow) {
4182     printf("Caution: gc info overflow!\n");
4183   }
4184
4185   printf("\n\n total gc time: %lld \n", totalgc);
4186 #else*/
4187   int i = 0;
4188   int j = 0;
4189   unsigned long long totalgc = 0;
4190
4191 #ifndef BAMBOO_MEMPROF
4192   BAMBOO_DEBUGPRINT(0xdddd);
4193 #endif
4194   // output task related info
4195   for(i= 0; i < gc_infoIndex; i++) {
4196     GCInfo * gcInfo = gc_infoArray[i];
4197 #ifdef BAMBOO_MEMPROF
4198     unsigned long long tmp=gcInfo->time[gcInfo->index-8]-gcInfo->time[0]; //0;
4199 #else
4200         unsigned long long tmp = 0;
4201     BAMBOO_DEBUGPRINT(0xddda);
4202     for(j = 0; j < gcInfo->index - 7; j++) {
4203       BAMBOO_DEBUGPRINT(gcInfo->time[j]);
4204       BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp);
4205       BAMBOO_DEBUGPRINT(0xdddb);
4206       tmp = gcInfo->time[j];
4207     }
4208     tmp = (tmp-gcInfo->time[0]);
4209     BAMBOO_DEBUGPRINT_REG(tmp);
4210         BAMBOO_DEBUGPRINT(0xdddc);
4211         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 7]);
4212         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 6]);
4213         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 5]);
4214         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 4]);
4215         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 3]);
4216         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 2]);
4217         BAMBOO_DEBUGPRINT(gcInfo->time[gcInfo->index - 1]);
4218     BAMBOO_DEBUGPRINT(0xddde);
4219 #endif
4220     totalgc += tmp;
4221   }
4222 #ifndef BAMBOO_MEMPROF
4223   BAMBOO_DEBUGPRINT(0xdddf);
4224 #endif
4225   BAMBOO_DEBUGPRINT_REG(totalgc);
4226
4227   if(gc_infoOverflow) {
4228     BAMBOO_DEBUGPRINT(0xefee);
4229   }
4230
4231 #ifndef BAMBOO_MEMPROF
4232   BAMBOO_DEBUGPRINT(0xeeee);
4233 #endif
4234 //#endif
4235 }
4236 #endif  // #ifdef GC_PROFILE
4237
4238 #endif