Fix bugs in the multicore GC and add profiling code for the GC
author jzhou <jzhou>
Wed, 10 Feb 2010 17:22:25 +0000 (17:22 +0000)
committer jzhou <jzhou>
Wed, 10 Feb 2010 17:22:25 +0000 (17:22 +0000)
Robust/src/Analysis/Scheduling/ScheduleAnalysis.java
Robust/src/Runtime/MGCHash.c
Robust/src/Runtime/mem.c
Robust/src/Runtime/multicoregarbage.c
Robust/src/Runtime/multicoregarbage.h
Robust/src/Runtime/multicoreruntime.h
Robust/src/Runtime/multicoretask.c
Robust/src/buildscript

index 4334cb2d2bd8ab04303657f7607e905ccdaa0028..0da216febe0f1bd20a27706a89273dfb23925e99 100644 (file)
@@ -419,7 +419,8 @@ public class ScheduleAnalysis {
                       (cdname.equals("KMeans")) || 
                       (cdname.equals("ZTransform")) ||
                       (cdname.equals("TestRunner")) || 
-                      (cdname.equals("LinkList"))) {
+                      (cdname.equals("LinkList")) ||
+                      (cdname.equals("BHRunner"))) {
                     newRate = this.coreNum;
                   } else if(cdname.equals("SentenceParser")) {
                     newRate = 4;
index 2155f673940c1dc923c5eccdda832da2b91b4494..97d8c890cfdb5ff6c3b77eebb5c430db8ef184c9 100755 (executable)
@@ -66,7 +66,7 @@ void mgchashreset() {
       tmpptr=next;
     }
   } else {*/
-         memset(mgc_table, '\0', sizeof(mgchashlistnode_t)*mgc_size);
+         BAMBOO_MEMSET_WH(mgc_table, '\0', sizeof(mgchashlistnode_t)*mgc_size);
   //}
   while(mgc_structs->next!=NULL) {
     mgcliststruct_t *next=mgc_structs->next;
@@ -328,7 +328,7 @@ struct MGCHash * allocateMGCHash(int size,
   thisvar->bucket = 
                (struct MGCNode *) RUNMALLOC(sizeof(struct MGCNode)*size);
        // zero out all the buckets
-       memset(thisvar->bucket, '\0', sizeof(struct MGCNode)*size);
+       BAMBOO_MEMSET_WH(thisvar->bucket, '\0', sizeof(struct MGCNode)*size);
   //Set data counts
   thisvar->num4conflicts = conflicts;
   return thisvar;
index 3ae21fbe15103ee45c28b0999561ba6425dcf96e..9cc3da929f423f99eeccff39d3d887f88474735d 100644 (file)
@@ -51,8 +51,8 @@ memalloc:
   BAMBOO_CLOSE_CRITICAL_SECTION_MEM();
        void * alignedp = 
                (void *)(BAMBOO_CACHE_LINE_SIZE+((int)p-1)&(~BAMBOO_CACHE_LINE_MASK));
-       memset(p, -2, (alignedp - p));
-  memset(alignedp + size, -2, p + isize - alignedp - size);
+       BAMBOO_MEMSET_WH(p, -2, (alignedp - p));
+  BAMBOO_MEMSET_WH(alignedp + size, -2, p + isize - alignedp - size);
        return alignedp;
 }
 #else
index ce6f6537a457012591038ea6c89f18c12c581f80..20a86eb5c80925b3f02f49341829bb7d2af17fa6 100644 (file)
@@ -99,12 +99,6 @@ inline void dumpSMem() {
                                coren = gc_block2core[block%(NUMCORES4GC*2)];
                        }
                        // compute core coordinate
-                       /*int tmpcore = coren;
-                       if((NUMCORES4GC==62) && (tmpcore > 5)) {
-                               tmpcore+=2;
-                       }
-                       x = tmpcore/bamboo_width;
-                       y = tmpcore%bamboo_width;*/
                        x = bamboo_cpu2coords[coren*2]; 
                        y = bamboo_cpu2coords[coren*2+1];
                        tprintf("==== %d, %d : core (%d,%d), saddr %x====\n", 
@@ -394,6 +388,17 @@ inline bool gc_checkCoreStatus() {
        return allStall;
 }
 
+inline bool gc_checkAllCoreStatus() { // returns true iff gccorestatus[i] == 0 for every i in [0, NUMCORESACTIVE)
+       bool allStall = true; // stays true unless a non-zero status entry is found; NOTE(review): presumably 0 means "stalled" -- confirm
+       for(int i = 0; i < NUMCORESACTIVE; ++i) {
+               if(gccorestatus[i] != 0) {
+                       allStall = false;
+                       break; // a single non-zero entry decides the result, so the remaining entries are not inspected
+               } // if(gccorestatus[i] != 0)
+       } // for(int i = 0; i < NUMCORESACTIVE; ++i)
+       return allStall;
+}
+
 inline void checkMarkStatue() {
 #ifdef DEBUG
        BAMBOO_DEBUGPRINT(0xee01);
@@ -409,7 +414,7 @@ inline void checkMarkStatue() {
                gcnumsendobjs[BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
                gcnumreceiveobjs[BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
                // check the status of all cores
-               bool allStall = gc_checkCoreStatus();
+               bool allStall = gc_checkAllCoreStatus();
 #ifdef DEBUG
                BAMBOO_DEBUGPRINT(0xee03);
 #endif
@@ -427,26 +432,26 @@ inline void checkMarkStatue() {
                                // reset the corestatus array too
                                gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
                                waitconfirm = true;
-                               numconfirm = NUMCORES4GC - 1;
-                               for(i = 1; i < NUMCORES4GC; ++i) {      
+                               numconfirm = NUMCORESACTIVE - 1;
+                               for(i = 1; i < NUMCORESACTIVE; ++i) {   
                                        gccorestatus[i] = 1;
                                        // send mark phase finish confirm request msg to core i
                                        send_msg_1(i, GCMARKCONFIRM, false);
-                               } // for(i = 1; i < NUMCORES4GC; ++i) 
+                               } // for(i = 1; i < NUMCORESACTIVE; ++i) 
                        } else {
                                // check if the sum of send objs and receive obj are the same
                                // yes->check if the info is the latest; no->go on executing
                                int sumsendobj = 0;
-                               for(i = 0; i < NUMCORES4GC; ++i) {
+                               for(i = 0; i < NUMCORESACTIVE; ++i) {
                                        sumsendobj += gcnumsendobjs[i];
-                               } // for(i = 0; i < NUMCORES4GC; ++i) 
+                               } // for(i = 0; i < NUMCORESACTIVE; ++i) 
 #ifdef DEBUG
                                BAMBOO_DEBUGPRINT(0xee06);
                                BAMBOO_DEBUGPRINT_REG(sumsendobj);
 #endif
-                               for(i = 0; i < NUMCORES4GC; ++i) {
+                               for(i = 0; i < NUMCORESACTIVE; ++i) {
                                        sumsendobj -= gcnumreceiveobjs[i];
-                               } // for(i = 0; i < NUMCORES4GC; ++i) 
+                               } // for(i = 0; i < NUMCORESACTIVE; ++i) 
 #ifdef DEBUG
                                BAMBOO_DEBUGPRINT(0xee07);
                                BAMBOO_DEBUGPRINT_REG(sumsendobj);
@@ -459,9 +464,9 @@ inline void checkMarkStatue() {
                                        // stop mark phase
                                        gcphase = COMPACTPHASE;
                                        // restore the gcstatus for all cores
-                                       for(i = 0; i < NUMCORES4GC; ++i) {
+                                       for(i = 0; i < NUMCORESACTIVE; ++i) {
                                                gccorestatus[i] = 1;
-                                       } // for(i = 0; i < NUMCORES4GC; ++i)
+                                       } // for(i = 0; i < NUMCORESACTIVE; ++i)
                                } // if(0 == sumsendobj)
                        } // if(!gcwaitconfirm) else()
                } // if(allStall)
@@ -558,6 +563,11 @@ inline void initGC() {
                        gcfilledblocks[i] = 0;
                        gcstopblock[i] = 0;
                } // for(i = 0; i < NUMCORES4GC; ++i)
+               for(i = NUMCORES4GC; i < NUMCORESACTIVE; ++i) {
+                       gccorestatus[i] = 1;
+                       gcnumsendobjs[i] = 0; 
+                       gcnumreceiveobjs[i] = 0;
+               }
                gcheaptop = 0;
                gctopcore = 0;
                gctopblock = 0;
@@ -606,8 +616,6 @@ inline void initGC() {
        
        freeMGCHash(gcforwardobjtbl);
        gcforwardobjtbl = allocateMGCHash(20, 3);
-
-       memset(gcsmemtbl, '\0', sizeof(int)*gcnumblock);
 } // void initGC()
 
 // compute load balance for all cores
@@ -733,6 +741,7 @@ inline bool cacheLObjs() {
                if((int)dst < (int)(gclobjtail2->lobjs[gclobjtailindex2])+size) {
                        memmove(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
                } else {
+                       //BAMBOO_WRITE_HINT_CACHE(dst, size);
                  memcpy(dst, gclobjtail2->lobjs[gclobjtailindex2], size);
                }
 #ifdef DEBUG
@@ -750,7 +759,7 @@ inline bool cacheLObjs() {
 // NOTE: the free mem chunks should be maintained in an ordered linklist
 // the listtop param always specify current list tail
 
-// update the gcsmemtbl to record current shared mem usage
+// update the bamboo_smemtbl to record current shared mem usage
 void updateSmemTbl(int coren,
                               int localtop) {
        int ltopcore = 0;
@@ -766,10 +775,10 @@ void updateSmemTbl(int coren,
        do{
                toset = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
                if(toset < ltopcore) {
-                       gcsmemtbl[toset]=
+                       bamboo_smemtbl[toset]=
                                (toset<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
                } else if(toset == ltopcore) {
-                       gcsmemtbl[toset] = load;
+                       bamboo_smemtbl[toset] = load;
                        break;
                } else {
                        break;
@@ -782,49 +791,12 @@ void updateSmemTbl(int coren,
        }while(true);
 } // void updateSmemTbl(int, int)
 
-inline struct freeMemItem * addFreeMemItem(int ptr,
-                                                      int size,
-                                                                                                                                                                        struct freeMemItem * listtail,
-                                                                                                                                                                        bool* sethead) {
-       struct freeMemItem * tochange = listtail;
-       if(*sethead) {
-               if(tochange->next == NULL) {
-                       if(bamboo_free_mem_list->backuplist != NULL) {
-                               tochange->next = bamboo_free_mem_list->backuplist;
-                               bamboo_free_mem_list->backuplist = NULL;
-                       } else {
-                               tochange->next = 
-                                       (struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem));
-                       }
-               } // if(tochange->next == NULL)
-               tochange = tochange->next;
-       } else {
-               *sethead = true;
-       } // if(sethead)
-       tochange->ptr = ptr;
-       tochange->size = size;
-       BLOCKINDEX(ptr, &(tochange->startblock));
-       BLOCKINDEX(ptr+size-1, &(tochange->endblock));
-       // zero out all these spare memory
-       // note that, leave the mem starting from heaptop, as it caches large objs
-       // zero out these cache later when moving large obj
-       {
-               INTPTR tmp = tochange->ptr;
-               unsigned long long int size = tochange->size;
-               while(size > 0) {
-                       int tsize = size>1024*1024*1024?1024*1024*1024:size;
-                       memset(tmp, '\0', tsize);
-                       size -= tsize;
-                       tmp += tsize;
-               }
-       }
-       return tochange;
-} // struct freeMemItem * addFreeMemItem(int,int,struct freeMemItem*,bool*, int)
-
 inline void moveLObjs() {
 #ifdef DEBUG
        BAMBOO_DEBUGPRINT(0xea01);
 #endif
+       // zero out the smemtbl
+       BAMBOO_MEMSET_WH(bamboo_smemtbl, 0, sizeof(int)*gcnumblock);
        // find current heap top
        // flush all gcloads to indicate the real heap top on one core
        // previous it represents the next available ptr on a core
@@ -839,7 +811,7 @@ inline void moveLObjs() {
 #ifdef DEBUG
   BAMBOO_DEBUGPRINT(0xea02);
        BAMBOO_DEBUGPRINT_REG(gcloads[0]);
-       BAMBOO_DEBUGPRINT_REG(gcsmemtbl[0]);
+       BAMBOO_DEBUGPRINT_REG(bamboo_smemtbl[0]);
 #endif
        for(int i = 1; i < NUMCORES4GC; i++) {
                int tmptop = 0;
@@ -871,14 +843,14 @@ inline void moveLObjs() {
        int bound = 0;
        int i = 0;
        for(i = gcnumblock-1; i >= 0; i--) {
-               if(gcsmemtbl[i] > 0) {
+               if(bamboo_smemtbl[i] > 0) {
                        break;
                }
        }
        if(i == -1) {
                tmpheaptop = gcbaseva;
        } else {
-               tmpheaptop = gcbaseva+gcsmemtbl[i]+((i<NUMCORES4GC)?
+               tmpheaptop = gcbaseva+bamboo_smemtbl[i]+((i<NUMCORES4GC)?
                                (BAMBOO_SMEM_SIZE_L*i):
                                (BAMBOO_SMEM_SIZE*(i-NUMCORES4GC)+BAMBOO_LARGE_SMEM_BOUND));
        }
@@ -893,7 +865,7 @@ inline void moveLObjs() {
        BAMBOO_DEBUGPRINT_REG(gcheaptop);
 #endif
        // flush the sbstartbl
-       memset(&(gcsbstarttbl[gcreservedsb]), '\0', 
+       BAMBOO_MEMSET_WH(&(gcsbstarttbl[gcreservedsb]), '\0', 
                (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE-gcreservedsb)*sizeof(INTPTR));
        if(tomove == 0) {
                gcheaptop = tmpheaptop;
@@ -929,9 +901,9 @@ inline void moveLObjs() {
                                // this object acrosses blocks
                                if(cpysize > 0) {
                                        // close current block, fill its header
-                                       memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+                                       BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
                                        *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
-                                       gcsmemtbl[b]+=BAMBOO_CACHE_LINE_SIZE; // add the size of the header
+                                       bamboo_smemtbl[b]+=BAMBOO_CACHE_LINE_SIZE; // add the size of the header
                                        cpysize = 0;
                                        base = tmpheaptop;
                                        if(remain == 0) {
@@ -948,12 +920,13 @@ inline void moveLObjs() {
                                if((int)gcheaptop < (int)(tmpheaptop)+size) {
                                  memmove(tmpheaptop, gcheaptop, size);
                                } else {
+                                       //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
                                        memcpy(tmpheaptop, gcheaptop, size);
                                }
                                // fill the remaining space with -2 padding
-                               memset(tmpheaptop+size, -2, isize-size);
+                               BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
                                // zero out original mem caching the lobj
-                               memset(gcheaptop, '\0', size);
+                               BAMBOO_MEMSET_WH(gcheaptop, '\0', size);
 #ifdef DEBUG
                                BAMBOO_DEBUGPRINT(0xea05);
                                BAMBOO_DEBUGPRINT_REG(gcheaptop);
@@ -987,7 +960,7 @@ inline void moveLObjs() {
                                } // if(host == BAMBOO_NUM_OF_CORE) else ...
                                tmpheaptop += isize;
 
-                               // set the gcsbstarttbl and gcsmemtbl
+                               // set the gcsbstarttbl and bamboo_smemtbl
                                int tmpsbs = 1+(isize-remain-1)/BAMBOO_SMEM_SIZE;
                                for(int k = 1; k < tmpsbs; k++) {
                                        gcsbstarttbl[sb+k] = (INTPTR)(-1);
@@ -996,7 +969,7 @@ inline void moveLObjs() {
                                bound = (b<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
                                BLOCKINDEX(tmpheaptop-1, &tmpsbs);
                                for(; b < tmpsbs; b++) {
-                                       gcsmemtbl[b] = bound;
+                                       bamboo_smemtbl[b] = bound;
                                        if(b==NUMCORES4GC-1) {
                                                bound = BAMBOO_SMEM_SIZE;
                                        }
@@ -1005,22 +978,22 @@ inline void moveLObjs() {
                                        gcsbstarttbl[sb] = (INTPTR)(-1);
                                        remain = ((tmpheaptop-gcbaseva)<(BAMBOO_LARGE_SMEM_BOUND)) ? 
                                                                         BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
-                                       gcsmemtbl[b] = bound;
+                                       bamboo_smemtbl[b] = bound;
                                } else {
                                        gcsbstarttbl[sb] = (INTPTR)(tmpheaptop);
                                        remain = tmpheaptop-gcbaseva;
-                                       gcsmemtbl[b] = remain%bound;
-                                       remain = bound - gcsmemtbl[b];
+                                       bamboo_smemtbl[b] = remain%bound;
+                                       remain = bound - bamboo_smemtbl[b];
                                } // if(((isize-remain)%(BAMBOO_SMEM_SIZE)) == 0) else ...
 
                                // close current block and fill the header
-                               memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+                               BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
                                *((int*)base) = isize + BAMBOO_CACHE_LINE_SIZE;
                                cpysize = 0;
                                base = tmpheaptop;
                                if(remain == BAMBOO_CACHE_LINE_SIZE) {
                                        // fill with 0 in case
-                                       memset(tmpheaptop, '\0', remain);
+                                       BAMBOO_MEMSET_WH(tmpheaptop, '\0', remain);
                                }
                                remain -= BAMBOO_CACHE_LINE_SIZE;
                                tmpheaptop += BAMBOO_CACHE_LINE_SIZE;
@@ -1030,12 +1003,13 @@ inline void moveLObjs() {
                                if((int)gcheaptop < (int)(tmpheaptop)+size) {
                                memmove(tmpheaptop, gcheaptop, size);
                                } else {
+                                       //BAMBOO_WRITE_HINT_CACHE(tmpheaptop, size);
                                        memcpy(tmpheaptop, gcheaptop, size);
                                }
                                // fill the remaining space with -2 padding
-                               memset(tmpheaptop+size, -2, isize-size);
+                               BAMBOO_MEMSET_WH(tmpheaptop+size, -2, isize-size);
                                // zero out original mem caching the lobj
-                               memset(gcheaptop, '\0', size);
+                               BAMBOO_MEMSET_WH(gcheaptop, '\0', size);
 #ifdef DEBUG
                                BAMBOO_DEBUGPRINT(0xea06);
                                BAMBOO_DEBUGPRINT_REG(gcheaptop);
@@ -1071,15 +1045,15 @@ inline void moveLObjs() {
                                } // if(host == BAMBOO_NUM_OF_CORE) else ...
                                tmpheaptop += isize;
 
-                               // update gcsmemtbl
-                               gcsmemtbl[b] += isize;
+                               // update bamboo_smemtbl
+                               bamboo_smemtbl[b] += isize;
                        } // if(remain < isize) else ...
                } // while(gc_lobjmoreItems())
                if(cpysize > 0) {
                        // close current block, fill the header
-                       memset(base, '\0', BAMBOO_CACHE_LINE_SIZE);
+                       BAMBOO_MEMSET_WH(base, '\0', BAMBOO_CACHE_LINE_SIZE);
                        *((int*)base) = cpysize + BAMBOO_CACHE_LINE_SIZE;
-                       gcsmemtbl[b] += BAMBOO_CACHE_LINE_SIZE; // add the size of the header
+                       bamboo_smemtbl[b] += BAMBOO_CACHE_LINE_SIZE; // add the size of the header
                } else {
                        tmpheaptop -= BAMBOO_CACHE_LINE_SIZE;
                }
@@ -1091,82 +1065,19 @@ inline void moveLObjs() {
        BAMBOO_DEBUGPRINT(0xea07);
        BAMBOO_DEBUGPRINT_REG(gcheaptop);
 #endif
-
-       // update the free mem list
-       // create new free mem list according to gcsmemtbl
-       bool sethead = false;
-       if(bamboo_free_mem_list->head == NULL) {
-               bamboo_free_mem_list->head = bamboo_free_mem_list->backuplist;
-               bamboo_free_mem_list->backuplist = NULL;
-       }
-       struct freeMemItem * tochange = bamboo_free_mem_list->head;
-       if(tochange == NULL) {
-               bamboo_free_mem_list->head = tochange = 
-                       (struct freeMemItem *)RUNMALLOC(sizeof(struct freeMemItem));
-               tochange->next = NULL;
-       }
-       int startptr = 0;
-       size = 0;
-       bound = BAMBOO_SMEM_SIZE_L;
-       for(i = 0; i < gcnumblock-bamboo_reserved_smem; i++) {
-               if(gcsmemtbl[i] < bound) {
-                       if(gcsmemtbl[i] == 0) {
-                               // blank one
-                               if(startptr == 0) {
-                                       // a start of a new free mem chunk
-                                       startptr = gcbaseva+((i<NUMCORES4GC)?(i*BAMBOO_SMEM_SIZE_L)
-                                                       :(BAMBOO_LARGE_SMEM_BOUND+(i-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
-                               } // if(startptr == 0) 
-                               size += bound;
-                       } else {
-                               if(startptr != 0) {
-                                       // the end of previous free mem chunk
-                                       tochange = addFreeMemItem(startptr,size,tochange,&sethead);
-                                       //startptr = 0;
-                                       //size = 0;
-                               }
-                               // start of a new free mem chunk
-                               startptr = gcbaseva+((i<NUMCORES4GC)?(i*BAMBOO_SMEM_SIZE_L)
-                                     :((BAMBOO_LARGE_SMEM_BOUND+(i-NUMCORES4GC)*BAMBOO_SMEM_SIZE)))
-                                                        +gcsmemtbl[i];
-                               size = bound-gcsmemtbl[i];
-                       } // if(gcsmemtbl[i] == 0) else
-               } else {
-                       if(startptr != 0) {
-                               // the end of previous free mem chunk
-                               tochange = addFreeMemItem(startptr,size,tochange,&sethead);
-                               startptr = 0;
-                               size = 0;
-                       } // if(startptr != 0) {
-               } // if(gcsmemtbl[i] < bound) else
-               if(i == NUMCORES4GC-1) {
-                       bound = BAMBOO_SMEM_SIZE;
-               }
-       } // for(i = 0; i < gcnumblock; i++) {
-       if(startptr != 0) {
-               tochange = addFreeMemItem(startptr, size, tochange, &sethead);
-               startptr = 0;
-               size = 0;
-       }
-       // remove the remaing list to the back up list, only remain one node, 
-       // free the others
-       if(tochange->next != NULL) {
-               struct freeMemItem * blist = NULL;
-               if(bamboo_free_mem_list->backuplist != NULL) {
-                       blist = tochange->next;
+       
+       bamboo_free_block = 0;
+  int tbound = 0;
+  do {
+               tbound = (bamboo_free_block<NUMCORES4GC)?
+                       BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+               if(bamboo_smemtbl[bamboo_free_block] == tbound) {
+                       bamboo_free_block++;
                } else {
-                       bamboo_free_mem_list->backuplist = tochange->next;
-                       blist = bamboo_free_mem_list->backuplist->next;
-                       bamboo_free_mem_list->backuplist->next = NULL;
+                       // the first non-full partition
+                       break;
                }
-               tochange->next = NULL;
-               while(blist != NULL) {
-                       struct freeMemItem * tmp = blist;
-                       blist = blist->next;
-                       RUNFREE(tmp);
-               } // if(blist != NULL)
-       }
-
+       } while(true);
 #ifdef DEBUG
        BAMBOO_DEBUGPRINT(0xea08);
        BAMBOO_DEBUGPRINT_REG(gcheaptop);
@@ -1232,9 +1143,6 @@ inline void tomark(struct garbagelist * stackptr) {
 #endif
                for(i=0; i<stackptr->size; i++) {
                        if(stackptr->array[i] != NULL) {
-                               //BAMBOO_START_CRITICAL_SECTION();
-                               //gc_enqueue_I(stackptr->array[i]);
-                               //BAMBOO_CLOSE_CRITICAL_SECTION();
                          markObj(stackptr->array[i]);
                        }
                }
@@ -1255,9 +1163,6 @@ inline void tomark(struct garbagelist * stackptr) {
                                struct ObjectHash * set=parameter->objectset;
                                struct ObjectNode * ptr=set->listhead;
                                while(ptr!=NULL) {
-                                       //BAMBOO_START_CRITICAL_SECTION();
-                                       //gc_enqueue_I((void *)ptr->key);
-                                       //BAMBOO_CLOSE_CRITICAL_SECTION();
                                        markObj((void *)ptr->key);
                                        ptr=ptr->lnext;
                                }
@@ -1271,9 +1176,6 @@ inline void tomark(struct garbagelist * stackptr) {
                BAMBOO_DEBUGPRINT(0xe504);
 #endif
                for(i=0; i<currtpd->numParameters; i++) {
-                       //BAMBOO_START_CRITICAL_SECTION();
-                       //gc_enqueue_I(currtpd->parameterArray[i]);
-                       //BAMBOO_CLOSE_CRITICAL_SECTION();
                        markObj(currtpd->parameterArray[i]);
                }
        }
@@ -1288,9 +1190,6 @@ inline void tomark(struct garbagelist * stackptr) {
                        struct taskparamdescriptor *tpd=ptr->src;
                        int i;
                        for(i=0; i<tpd->numParameters; i++) {
-                               //BAMBOO_START_CRITICAL_SECTION();
-                               //gc_enqueue_I(tpd->parameterArray[i]);
-                               //BAMBOO_CLOSE_CRITICAL_SECTION();
                                markObj(tpd->parameterArray[i]);
                        }
                        ptr=ptr->inext;
@@ -1305,9 +1204,6 @@ inline void tomark(struct garbagelist * stackptr) {
        while(tmpobjptr != NULL) {
                struct transObjInfo * objInfo = 
                        (struct transObjInfo *)(tmpobjptr->objectptr); 
-               //BAMBOO_START_CRITICAL_SECTION();
-               //gc_enqueue_I(objInfo->objptr);
-               //BAMBOO_CLOSE_CRITICAL_SECTION();
                markObj(objInfo->objptr);
                tmpobjptr = getNextQueueItem(tmpobjptr);
        }
@@ -1320,9 +1216,6 @@ inline void tomark(struct garbagelist * stackptr) {
        while(item != NULL) {
                struct transObjInfo * totransobj = 
                        (struct transObjInfo *)(item->objectptr);
-               //BAMBOO_START_CRITICAL_SECTION();
-               //gc_enqueue_I(totransobj->objptr);
-               //BAMBOO_CLOSE_CRITICAL_SECTION();
                markObj(totransobj->objptr);
                item = getNextQueueItem(item);
        } // while(item != NULL)
@@ -1332,10 +1225,8 @@ inline void tomark(struct garbagelist * stackptr) {
 #endif
        // enqueue lock related info
        for(i = 0; i < runtime_locklen; ++i) {
-        //gc_enqueue_I((void *)(runtime_locks[i].redirectlock));
         markObj((void *)(runtime_locks[i].redirectlock));
         if(runtime_locks[i].value != NULL) {
-                //gc_enqueue_I((void *)(runtime_locks[i].value));
                 markObj((void *)(runtime_locks[i].value));
         }
        }
@@ -1596,10 +1487,6 @@ inline void compact2Heaptop() {
        BAMBOO_DEBUGPRINT_REG(b);
        BAMBOO_DEBUGPRINT_REG(remain);
 #endif
-       /*if((gctopcore == STARTUPCORE) && (b == 0)) {
-               remain -= gcreservedsb*BAMBOO_SMEM_SIZE;
-               p += gcreservedsb*BAMBOO_SMEM_SIZE;
-       }*/
        for(int i = 0; i < NUMCORES4GC; i++) {
                BAMBOO_START_CRITICAL_SECTION();
                if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {
@@ -1771,12 +1658,18 @@ innernextSBlock:
                orig->blockbase = orig->base;
                orig->sblockindex = (orig->blockbase-BAMBOO_BASE_VA)/BAMBOO_SMEM_SIZE;
                sbchanged = true;
+               int blocknum = 0;
+               BLOCKINDEX(orig->base, &blocknum);
+               if(bamboo_smemtbl[blocknum] == 0) {
+                       // goto next block
+                       goto innernextSBlock;
+               }
        } else if(0 == (orig->blockbase%BAMBOO_SMEM_SIZE)) {
                orig->sblockindex += 1;
                sbchanged = true;
        } // if((orig->blockbase >= orig->bound) || (orig->ptr >= orig->bound)...
 
-       // check if this sblock should be omitted or have special start point
+       // check whether this sblock should be skipped or has a special start point
        if(gcsbstarttbl[orig->sblockindex] == -1) {
                // goto next sblock
 #ifdef DEBUG
@@ -1951,10 +1844,10 @@ innermoveobj:
                // check to see if remaining space is enough
                if(to->top + isize > to->bound) {
                        // fill 0 indicating the end of this block
-                       memset(to->ptr,  '\0', to->bound - to->top);
+                       BAMBOO_MEMSET_WH(to->ptr,  '\0', to->bound - to->top);
                        // fill the header of this block and then go to next block
        to->offset += to->bound - to->top;
-                       memset(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
+                       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
                        (*((int*)(to->base))) = to->offset;
                        nextBlock(to);
                        if(stopblock == to->numblocks) {
@@ -1969,10 +1862,11 @@ innermoveobj:
                        if((int)(orig->ptr) < (int)(to->ptr)+size) {
                          memmove(to->ptr, orig->ptr, size);
                        } else {
+                               //BAMBOO_WRITE_HINT_CACHE(to->ptr, size);
                                memcpy(to->ptr, orig->ptr, size);
                        }
                        // fill the remaining space with -2
-                       memset(to->ptr+size, -2, isize-size);
+                       BAMBOO_MEMSET_WH(to->ptr+size, -2, isize-size);
                }
                // store mapping info
                BAMBOO_START_CRITICAL_SECTION();
@@ -1981,7 +1875,6 @@ innermoveobj:
                //MGCHashadd_I(gcpointertbl, orig->ptr, to->ptr);
                BAMBOO_CLOSE_CRITICAL_SECTION();
          //}
-
 #ifdef DEBUG
                BAMBOO_DEBUGPRINT(0xcdce);
                BAMBOO_DEBUGPRINT_REG(orig->ptr);
@@ -1993,7 +1886,7 @@ innermoveobj:
                to->top += isize;
                if(to->top == to->bound) {
                        // fill the header of this block and then go to next block
-                       memset(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
+                       BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
                        (*((int*)(to->base))) = to->offset;
                        nextBlock(to);
                }
@@ -2095,7 +1988,7 @@ innercompact:
        // if no objs have been compact, do nothing, 
        // otherwise, fill the header of this block
        if(to->offset > BAMBOO_CACHE_LINE_SIZE) {
-               memset(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
+               BAMBOO_MEMSET_WH(to->base, '\0', BAMBOO_CACHE_LINE_SIZE);
                (*((int*)(to->base))) = to->offset;
        } else {
                to->offset = 0;
@@ -2248,6 +2141,9 @@ inline void * flushObj(void * objptr) {
 #ifdef DEBUG
        BAMBOO_DEBUGPRINT(0xe401);
 #endif
+       if(objptr == NULL) {
+               return NULL;
+       }
        void * dstptr = NULL;
        if(ISSHAREDOBJ(objptr)) {
 #ifdef DEBUG
@@ -2437,8 +2333,10 @@ inline void flush(struct garbagelist * stackptr) {
 #ifdef DEBUG
                                        BAMBOO_DEBUGPRINT_REG(objptr);
 #endif
-                                       ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = 
-                                               flushObj(objptr);
+                                       if(objptr != NULL) {
+                                               ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = 
+                                                       flushObj(objptr);
+                                       }
                                }
                        } else {
 #ifdef DEBUG
@@ -2452,11 +2350,12 @@ inline void flush(struct garbagelist * stackptr) {
 #endif
                                        unsigned int offset=pointer[i];
                                        void * objptr=*((void **)(((char *)ptr)+offset));
-
 #ifdef DEBUG
                                        BAMBOO_DEBUGPRINT_REG(objptr);
 #endif
-                                       *((void **)(((char *)ptr)+offset)) = flushObj(objptr);
+                                       if(objptr != NULL) {
+                                               *((void **)(((char *)ptr)+offset)) = flushObj(objptr);
+                                       }
                                } // for(i=1; i<=size; i++) 
                        } // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
                        // restore the mark field, indicating that this obj has been flushed
@@ -2477,7 +2376,6 @@ inline void flush(struct garbagelist * stackptr) {
                BAMBOO_DEBUGPRINT(0xe309);
 #endif
                void * ptr = gc_lobjdequeue(NULL, NULL);
-               //if(ISSHAREDOBJ(ptr)) {
                void * tptr = flushObj(ptr);
 #ifdef DEBUG
                BAMBOO_DEBUGPRINT(0xe30a);
@@ -2488,8 +2386,7 @@ inline void flush(struct garbagelist * stackptr) {
                if(tptr != NULL) {
                        ptr = tptr;
                }
-               //}
-               if(/*(!ISSHAREDOBJ(ptr)) || */(((int *)(ptr))[6] == COMPACTED)) {
+               if(((int *)(ptr))[6] == COMPACTED) {
                        int type = ((int *)(ptr))[0];
                        // scan all pointers in ptr
                        unsigned INTPTR * pointer;
@@ -2518,8 +2415,10 @@ inline void flush(struct garbagelist * stackptr) {
 #ifdef DEBUG
                                        BAMBOO_DEBUGPRINT_REG(objptr);
 #endif
-                                       ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = 
-                                               flushObj(objptr);
+                                       if(objptr != NULL) {
+                                               ((void **)(((char *)&ao->___length___)+sizeof(int)))[j] = 
+                                                       flushObj(objptr);
+                                       }
                                }
                        } else {
 #ifdef DEBUG
@@ -2537,13 +2436,13 @@ inline void flush(struct garbagelist * stackptr) {
 #ifdef DEBUG
                                        BAMBOO_DEBUGPRINT_REG(objptr);
 #endif
-                                       *((void **)(((char *)ptr)+offset)) = flushObj(objptr);
+                                       if(objptr != NULL) {
+                                               *((void **)(((char *)ptr)+offset)) = flushObj(objptr);
+                                       }
                                } // for(i=1; i<=size; i++) 
                        } // if (pointer==0) else if (((INTPTR)pointer)==1) else ()
                        // restore the mark field, indicating that this obj has been flushed
-                       //if(ISSHAREDOBJ(ptr)) {
-                               ((int *)(ptr))[6] = INIT;
-                       //}
+                       ((int *)(ptr))[6] = INIT;
                } // if(((int *)(ptr))[6] == COMPACTED)
        } // while(gc_lobjmoreItems())
 #ifdef DEBUG
@@ -2564,12 +2463,9 @@ inline void flush(struct garbagelist * stackptr) {
 inline void gc_collect(struct garbagelist * stackptr) {
        // core collector routine
        while(true) {
-               //BAMBOO_START_CRITICAL_SECTION();
                if(INITPHASE == gcphase) {
-                       //BAMBOO_CLOSE_CRITICAL_SECTION();
                        break;
                }
-               //BAMBOO_CLOSE_CRITICAL_SECTION();
        }
 #ifdef RAWPATH // TODO GC_DEBUG
        tprintf("Do initGC\n");
@@ -2578,12 +2474,9 @@ inline void gc_collect(struct garbagelist * stackptr) {
        //send init finish msg to core coordinator
        send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
        while(true) {
-               //BAMBOO_START_CRITICAL_SECTION();
                if(MARKPHASE == gcphase) {
-                       //BAMBOO_CLOSE_CRITICAL_SECTION();
                        break;
                }
-               //BAMBOO_CLOSE_CRITICAL_SECTION();
        }
 #ifdef RAWPATH // TODO GC_DEBUG
        tprintf("Start mark phase\n");
@@ -2597,12 +2490,9 @@ inline void gc_collect(struct garbagelist * stackptr) {
        tprintf("Finish compact phase\n");
 #endif
        while(true) {
-               //BAMBOO_START_CRITICAL_SECTION();
                if(FLUSHPHASE == gcphase) {
-                       //BAMBOO_CLOSE_CRITICAL_SECTION();
                        break;
                }
-               //BAMBOO_CLOSE_CRITICAL_SECTION();
        }
 #ifdef RAWPATH // TODO GC_DEBUG
        tprintf("Start flush phase\n");
@@ -2613,12 +2503,57 @@ inline void gc_collect(struct garbagelist * stackptr) {
 #endif
 
        while(true) {
-               //BAMBOO_START_CRITICAL_SECTION();
                if(FINISHPHASE == gcphase) {
-                       //BAMBOO_CLOSE_CRITICAL_SECTION();
                        break;
                }
-               //BAMBOO_CLOSE_CRITICAL_SECTION();
+       }
+#ifdef RAWPATH // TODO GC_DEBUG
+       tprintf("Finish gc!\n");
+#endif
+} // void gc_collect(struct garbagelist * stackptr)
+
+inline void gc_nocollect(struct garbagelist * stackptr) {
+       while(true) {
+               if(INITPHASE == gcphase) {
+                       break;
+               }
+       }
+#ifdef RAWPATH // TODO GC_DEBUG
+       tprintf("Do initGC\n");
+#endif
+       initGC();
+       //send init finish msg to core coordinator
+       send_msg_2(STARTUPCORE, GCFINISHINIT, BAMBOO_NUM_OF_CORE, false);
+       while(true) {
+               if(MARKPHASE == gcphase) {
+                       break;
+               }
+       }
+#ifdef RAWPATH // TODO GC_DEBUG
+       tprintf("Start mark phase\n");
+#endif
+       mark(true, stackptr);
+#ifdef RAWPATH // TODO GC_DEBUG
+       tprintf("Finish mark phase, wait for flush\n");
+#endif
+       // non-gc core collector routine
+       while(true) {
+               if(FLUSHPHASE == gcphase) {
+                       break;
+               }
+       }
+#ifdef RAWPATH // TODO GC_DEBUG
+       tprintf("Start flush phase\n");
+#endif
+       flush(stackptr);
+#ifdef RAWPATH // TODO GC_DEBUG
+       tprintf("Finish flush phase\n");
+#endif
+
+       while(true) {
+               if(FINISHPHASE == gcphase) {
+                       break;
+               }
        }
 #ifdef RAWPATH // TODO GC_DEBUG
        tprintf("Finish gc!\n");
@@ -2643,6 +2578,10 @@ inline void gc(struct garbagelist * stackptr) {
                        return;
                }
 
+#ifdef GC_PROFILE
+               gc_profileStart();
+#endif
+
 #ifdef RAWPATH // TODO GC_DEBUG
                tprintf("start gc! \n");
                //dumpSMem();
@@ -2652,7 +2591,8 @@ inline void gc(struct garbagelist * stackptr) {
                waitconfirm = false;
                waitconfirm = 0;
                gcphase = INITPHASE;
-               for(i = 1; i < NUMCORES4GC; i++) {
+               // Note: all cores need to init gc including non-gc cores
+               for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; i++) {
                        // send GC init messages to all cores
                        send_msg_1(i, GCSTARTINIT, false);
                }
@@ -2666,20 +2606,21 @@ inline void gc(struct garbagelist * stackptr) {
 
                gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
                while(true) {
-                       BAMBOO_START_CRITICAL_SECTION();
-                       if(gc_checkCoreStatus()) {
-                               BAMBOO_CLOSE_CRITICAL_SECTION();
+                       if(gc_checkAllCoreStatus()) {
                                break;
                        }
-                       BAMBOO_CLOSE_CRITICAL_SECTION();
                }
+#ifdef GC_PROFILE
+               gc_profileItem();
+#endif
 #ifdef RAWPATH // TODO GC_DEBUG
                tprintf("Start mark phase \n");
 #endif
                // all cores have finished compacting
                // restore the gcstatus of all cores
+               // Note: all cores have to do mark including non-gc cores
                gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-               for(i = 1; i < NUMCORES4GC; ++i) {
+               for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; ++i) {
                        gccorestatus[i] = 1;
                        // send GC start messages to all cores
                        send_msg_1(i, GCSTART, false);
@@ -2697,6 +2638,7 @@ inline void gc(struct garbagelist * stackptr) {
                        checkMarkStatue(); 
                }  // while(MARKPHASE == gcphase)
                // send msgs to all cores requiring large objs info
+               // Note: only need to ask gc cores, non-gc cores do not host any objs
                numconfirm = NUMCORES4GC - 1;
                for(i = 1; i < NUMCORES4GC; ++i) {
                        send_msg_1(i, GCLOBJREQUEST, false);
@@ -2711,6 +2653,9 @@ inline void gc(struct garbagelist * stackptr) {
                if(gcheaptop < gcmarkedptrbound) {
                        gcheaptop = gcmarkedptrbound;
                }
+#ifdef GC_PROFILE
+               gc_profileItem();
+#endif
 #ifdef RAWPATH // TODO GC_DEBUG
                tprintf("prepare to cache large objs \n");
                //dumpSMem();
@@ -2769,6 +2714,10 @@ inline void gc(struct garbagelist * stackptr) {
                        gcrequiredmems[i] = 0;
                }
 
+#ifdef GC_PROFILE
+               gc_profileItem();
+#endif
+
                // compact phase
                bool finalcompact = false;
                // initialize pointers for comapcting
@@ -2859,7 +2808,9 @@ inline void gc(struct garbagelist * stackptr) {
                        } // if(gctomove)
 
                } // while(COMPACTPHASE == gcphase) 
-       
+#ifdef GC_PROFILE
+               gc_profileItem();
+#endif
 #ifdef RAWPATH // TODO GC_DEBUG
                tprintf("prepare to move large objs \n");
                //dumpSMem();
@@ -2876,12 +2827,16 @@ inline void gc(struct garbagelist * stackptr) {
 
                gcphase = FLUSHPHASE;
                gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-               for(i = 1; i < NUMCORES4GC; ++i) {
+               // Note: all cores should flush their runtime data including non-gc 
+               //       cores
+               for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; ++i) {
                        // send start flush messages to all cores
                        gccorestatus[i] = 1;
                        send_msg_1(i, GCSTARTFLUSH, false);
                }
-
+#ifdef GC_PROFILE
+               gc_profileItem();
+#endif
 #ifdef RAWPATH // TODO GC_DEBUG
                tprintf("Start flush phase \n");
 #endif
@@ -2890,14 +2845,26 @@ inline void gc(struct garbagelist * stackptr) {
                gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
                while(FLUSHPHASE == gcphase) {
                        // check the status of all cores
-                       if(gc_checkCoreStatus()) {
+                       if(gc_checkAllCoreStatus()) {
                                break;
                        }
                } // while(FLUSHPHASE == gcphase)
                gcphase = FINISHPHASE;
 
+               // invalidate all shared mem pointers
+               // put it here as it takes time to inform all the other cores to 
+               // finish gc and it might cause problem when some core resumes 
+               // mutator earlier than the other cores
+               bamboo_cur_msp = NULL;
+               bamboo_smem_size = 0;
+               gcflag = false;
+               gcprocessing = false;
+
+#ifdef GC_PROFILE
+               gc_profileEnd();
+#endif
                gccorestatus[BAMBOO_NUM_OF_CORE] = 1;
-               for(i = 1; i < NUMCORES4GC; ++i) {
+               for(i = 1; i < NUMCORESACTIVE/*NUMCORES4GC*/; ++i) {
                        // send gc finish messages to all cores
                        send_msg_1(i, GCFINISH, false);
                        gccorestatus[i] = 1;
@@ -2906,18 +2873,116 @@ inline void gc(struct garbagelist * stackptr) {
                tprintf("gc finished \n");
                //dumpSMem();
 #endif
-       } else {
+       } else if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
                gcprocessing = true;
                gc_collect(stackptr);
-       }
 
-       // invalidate all shared mem pointers
-       bamboo_cur_msp = NULL;
-       bamboo_smem_size = 0;
+               // invalidate all shared mem pointers
+               bamboo_cur_msp = NULL;
+               bamboo_smem_size = 0;
 
-       gcflag = false;
-       gcprocessing = false;
+               gcflag = false;
+               gcprocessing = false;
+       } else {
+               // not a gc core, should wait for gcfinish msg
+         gcprocessing = true;
+               gc_nocollect(stackptr);
 
+               // invalidate all shared mem pointers
+               bamboo_cur_msp = NULL;
+               bamboo_smem_size = 0;
+
+               gcflag = false;
+               gcprocessing = false;
+       }
 } // void gc(struct garbagelist * stackptr)
 
+#ifdef GC_PROFILE
+inline void gc_profileStart(void) { // open a profiling record for the gc round that is starting
+  if(!gc_infoOverflow) { // nothing is recorded once the overflow flag has been set
+               GCInfo* gcInfo = RUNMALLOC(sizeof(struct gc_info)); // NOTE(review): result is used unchecked and is not freed in this block
+         gc_infoArray[gc_infoIndex] = gcInfo; // store the record in the current slot of gc_infoArray
+               gcInfo->index = 1; // time[0] is written on the next line, so the next free slot is 1
+               gcInfo->time[0] = BAMBOO_GET_EXE_TIME(); // time[0] = value of BAMBOO_GET_EXE_TIME() at start
+  }
+}
+
+inline void gc_profileItem(void) { // append one checkpoint timestamp to the record in the current slot
+  if(!gc_infoOverflow) { // nothing is recorded once the overflow flag has been set
+               GCInfo* gcInfo = gc_infoArray[gc_infoIndex]; // expects gc_profileStart to have stored a record here
+               gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME(); // NOTE(review): index is not checked against the size of time[]
+  }
+}
+
+inline void gc_profileEnd(void) { // write the final timestamp of the current record and move to the next slot
+  if(!gc_infoOverflow) { // nothing is recorded once the overflow flag has been set
+               GCInfo* gcInfo = gc_infoArray[gc_infoIndex]; // record stored in the current slot
+         gcInfo->time[gcInfo->index++] = BAMBOO_GET_EXE_TIME(); // NOTE(review): index is not checked against the size of time[]
+               gc_infoIndex++; // following records go into the next slot of gc_infoArray
+         if(gc_infoIndex == GCINFOLENGTH) { // every slot of gc_infoArray has been used
+                 gc_infoOverflow = true; // makes all three gc_profile* recorders no-ops from now on
+                 //taskInfoIndex = 0;
+         }
+  }
+}
+
+// output the recorded gc profiling data: for each record every timestamp with its delta to the previous one, then last-minus-first time, and finally the sum over all records
+void gc_outputProfileData() {
+#ifdef USEIO
+  int i,j;
+       unsigned long long totalgc = 0; // sum of (last - first) timestamps over all records
+
+  //printf("Start Time, End Time, Duration\n");
+  // output gc related info
+  for(i = 0; i < gc_infoIndex; i++) { // only slots below gc_infoIndex are read
+               GCInfo * gcInfo = gc_infoArray[i];
+               unsigned long long tmp = 0; // previous timestamp; starts at 0, so the first delta printed equals time[0]
+               for(j = 0; j < gcInfo->index; j++) {
+                       printf("%llu(%llu), ", gcInfo->time[j], (gcInfo->time[j]-tmp)); // values are unsigned long long, hence %llu
+                       tmp = gcInfo->time[j];
+               }
+               tmp = (tmp-gcInfo->time[0]); // last recorded timestamp minus the first one
+               printf(" ++ %llu \n", tmp);
+               totalgc += tmp;
+  }
+
+  if(gc_infoOverflow) { // the flag is set once gc_infoArray has been filled
+    printf("Caution: gc info overflow!\n");
+  }
+
+       printf("\n\n total gc time: %llu \n", totalgc);
+#else
+  int i = 0;
+  int j = 0;
+       unsigned long long totalgc = 0; // sum of (last - first) timestamps over all records
+
+  BAMBOO_DEBUGPRINT(0xdddd); // marker emitted before the records
+  // output gc related info
+  for(i= 0; i < gc_infoIndex; i++) { // only slots below gc_infoIndex are read
+               GCInfo * gcInfo = gc_infoArray[i];
+               unsigned long long tmp = 0; // previous timestamp; starts at 0, so the first delta emitted equals time[0]
+               BAMBOO_DEBUGPRINT(0xddda); // marker emitted before each record
+               for(j = 0; j < gcInfo->index; j++) {
+                       BAMBOO_DEBUGPRINT(gcInfo->time[j]); // NOTE(review): 64-bit value; confirm BAMBOO_DEBUGPRINT does not truncate it
+                       BAMBOO_DEBUGPRINT(gcInfo->time[j]-tmp);
+                       BAMBOO_DEBUGPRINT(0xdddb); // marker emitted after each timestamp/delta pair
+                       tmp = gcInfo->time[j];
+               }
+               tmp = (tmp-gcInfo->time[0]); // last recorded timestamp minus the first one
+               BAMBOO_DEBUGPRINT_REG(tmp);
+               BAMBOO_DEBUGPRINT(0xdddc); // marker emitted after each record
+               totalgc += tmp;
+  }
+       BAMBOO_DEBUGPRINT(0xdddd); // marker emitted before the total
+       BAMBOO_DEBUGPRINT_REG(totalgc);
+
+  if(gc_infoOverflow) { // the flag is set once gc_infoArray has been filled
+    BAMBOO_DEBUGPRINT(0xefee); // marker emitted only in the overflow case
+  }
+
+  BAMBOO_DEBUGPRINT(0xeeee); // final marker
+#endif
+}
+#endif  // #ifdef GC_PROFILE
+
 #endif
index 95329471f5c450de434dcd0fdfc5389fb04f2950..08ba12bca2c585439b939c31f60a9bfbc480daba 100644 (file)
 #ifdef GC_DEBUG
 #define BAMBOO_SMEM_SIZE_L (BAMBOO_SMEM_SIZE * 2)
 #else
-#define BAMBOO_SMEM_SIZE_L (32 * BAMBOO_SMEM_SIZE)
+#define BAMBOO_SMEM_SIZE_L (2 * BAMBOO_SMEM_SIZE)
 #endif
-#define BAMBOO_LARGE_SMEM_BOUND (BAMBOO_SMEM_SIZE_L*NUMCORES4GC) // NUMCORES=62
+#define BAMBOO_LARGE_SMEM_BOUND (BAMBOO_SMEM_SIZE_L*NUMCORES4GC) 
+         // let each gc core have one big block; this is very important 
+                                // for the computation of NUMBLOCKS(s, n), DO NOT change this!
 
 #define NUMPTRS 100
 
+// for GC profile
+#ifdef GC_PROFILE
+#define GCINFOLENGTH 100 // number of slots in gc_infoArray
+
+typedef struct gc_info { // one profiling record, filled by the gc_profile* routines
+  unsigned long long time[7]; // BAMBOO_GET_EXE_TIME() samples; time[0] is written by gc_profileStart
+       int index; // next free slot in time[]
+} GCInfo;
+
+GCInfo * gc_infoArray[GCINFOLENGTH]; // pointers to the records, one slot per gc_profileStart/gc_profileEnd pair
+int gc_infoIndex; // slot currently being filled; advanced by gc_profileEnd
+bool gc_infoOverflow; // set by gc_profileEnd when gc_infoIndex reaches GCINFOLENGTH
+#endif
+
 typedef enum {
        INIT = 0,     // 0
        DISCOVERED,   // 1
@@ -44,11 +60,11 @@ volatile GCPHASETYPE gcphase; // indicating GC phase
 int gccurr_heaptop;
 struct MGCHash * gcforwardobjtbl; // cache forwarded objs in mark phase
 // for mark phase termination
-int gccorestatus[NUMCORES4GC]; // records status of each core
-                            // 1: running gc
-                            // 0: stall
-int gcnumsendobjs[NUMCORES4GC]; // records how many objects sent out
-int gcnumreceiveobjs[NUMCORES4GC]; // records how many objects received
+int gccorestatus[NUMCORESACTIVE]; // records status of each core
+                                  // 1: running gc
+                                  // 0: stall
+int gcnumsendobjs[NUMCORESACTIVE]; // records how many objects sent out
+int gcnumreceiveobjs[NUMCORESACTIVE]; // records how many objects received
 bool gcbusystatus;
 int gcself_numsendobjs;
 int gcself_numreceiveobjs;
@@ -90,10 +106,6 @@ int gcreservedsb;  // number of reserved sblock for sbstarttbl
 int gcnumblock; // number of total blocks in the shared mem
 int gcbaseva; // base va for shared memory without reserved sblocks
 
-// table recording the number of used bytes in each block
-// Note: this table resides on master core's local heap
-int * gcsmemtbl;
-
 #define ISSHAREDOBJ(p) \
        ((((int)p)>gcbaseva)&&(((int)p)<(gcbaseva+(BAMBOO_SHARED_MEM_SIZE))))
 
@@ -163,6 +175,7 @@ int * gcsmemtbl;
 
 inline void gc(struct garbagelist * stackptr); // core coordinator routine
 inline void gc_collect(struct garbagelist* stackptr);//core collector routine
+inline void gc_nocollect(struct garbagelist* stackptr);//non-gc core collector routine
 inline void transferMarkResults_I();
 inline void gc_enqueue_I(void *ptr);
 inline void gc_lobjenqueue_I(void *ptr, int length, int host);
@@ -176,5 +189,12 @@ inline void * gc_lobjdequeue4(int * length, int * host);
 inline int gc_lobjmoreItems4();
 inline void gc_lobjqueueinit4();
 
+#ifdef GC_PROFILE
+INLINE void gc_profileStart(void);
+INLINE void gc_profileItem(void);
+INLINE void gc_profileEnd(void);
+void gc_outputProfileData();
+#endif
+
 #endif
 
index 14364342038845d1f8c69c97ce1def57488c2ca2..bdf4ab858e11cca2b8679de7f525b41bb3725ee0 100644 (file)
@@ -236,10 +236,10 @@ struct Queue * totransobjqueue; // queue to hold objs to be transferred
 #define BAMBOO_SMEM_SIZE (64 * 64) // (BAMBOO_PAGE_SIZE)
 #define BAMBOO_SHARED_MEM_SIZE ((BAMBOO_PAGE_SIZE) * (BAMBOO_NUM_PAGES))
 #else
-#define BAMBOO_NUM_PAGES (64 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5)  3G
+#define BAMBOO_NUM_PAGES (15 * 1024) //(64 * 4 * 0.75) //(1024 * 1024 * 3.5)  3G
 #define BAMBOO_PAGE_SIZE (16 * 1024)// * 1024)  // (4096)
 #define BAMBOO_SMEM_SIZE (16 * 1024)
-#define BAMBOO_SHARED_MEM_SIZE (1024 * 1024 * 1024)
+#define BAMBOO_SHARED_MEM_SIZE (1024 * 1024 * 240) //(1024 * 1024 * 1024)
 //(3.0 * 1024 * 1024 * 1024) // 3G// ((BAMBOO_PAGE_SIZE) * (BAMBOO_NUM_PAGES))
 #endif
 
@@ -272,7 +272,13 @@ struct freeMemList {
                                          // only maintain 1 fremmMemItem
 };
 
-struct freeMemList * bamboo_free_mem_list;
+// table recording the number of allocated bytes on each block
+// Note: this table resides on the bottom of the shared heap for all cores
+//       to access
+int * bamboo_smemtbl;
+int bamboo_free_block;
+//bool bamboo_smem_flushed;
+//struct freeMemList * bamboo_free_mem_list;
 int bamboo_reserved_smem; // reserved blocks on the top of the shared heap
                           // e.g. 20% of the heap and should not be allocated
                                                                                                        // otherwise gc is invoked
@@ -398,6 +404,8 @@ INLINE void send_msg_6(int targetcore,
                                                                                         unsigned long n4, 
                                                                                         unsigned long n5,
                                                                                         bool isinterrupton);
+INLINE void cache_msg_1(int targetcore, 
+                                                                                               unsigned long n0);
 INLINE void cache_msg_2(int targetcore, 
                                    unsigned long n0, 
                                                                                                unsigned long n1);
@@ -478,6 +486,11 @@ void outputProfileData();
 // BAMBOO_MSG_AVAIL(): checking if there are msgs coming in                //
 // BAMBOO_GCMSG_AVAIL(): checking if there are gcmsgs coming in            //
 // BAMBOO_GET_EXE_TIME(): rountine to get current clock cycle number       //
+// BAMBOO_MEMSET_WH(x, y, z): memset the specified region of memory (start //
+//                            address x, size z) to value y with write     //
+//                            hint, the processor will not fetch the       //
+//                            current content of the memory and directly   //
+//                            write                                        //
 //                                                                         //
 // runtime_arch.h should also define following global parameters:          //
 // bamboo_cpu2coords: map the cpu # to (x,y) coordinates                   //
index 0acb6cab7a6f9e779338b32047d1fc9e267deb77..6d1ee7639e657ebac33024f12ca61f0625592b92 100644 (file)
@@ -53,17 +53,23 @@ void initruntimedata() {
                        // initialize the profile data arrays
                        profilestatus[i] = 1;
 #endif
-    } // for(i = 0; i < NUMCORESACTIVE; ++i)
 #ifdef MULTICORE_GC
-               for(i = 0; i < NUMCORES4GC; ++i) {
                        gccorestatus[i] = 1;
                        gcnumsendobjs[i] = 0; 
       gcnumreceiveobjs[i] = 0;
+#endif
+    } // for(i = 0; i < NUMCORESACTIVE; ++i)
+#ifdef MULTICORE_GC
+               for(i = 0; i < NUMCORES4GC; ++i) {
                        gcloads[i] = 0;
                        gcrequiredmems[i] = 0;
                        gcstopblock[i] = 0;
                        gcfilledblocks[i] = 0;
     } // for(i = 0; i < NUMCORES4GC; ++i)
+#ifdef GC_PROFILE
+               gc_infoIndex = 0;
+               gc_infoOverflow = false;
+#endif
 #endif
                numconfirm = 0;
                waitconfirm = false; 
@@ -119,7 +125,8 @@ void initruntimedata() {
        gcmovepending = 0;
        gcblock2fill = 0;
        gcsbstarttbl = BAMBOO_BASE_VA;
-       gcsmemtbl = RUNMALLOC_I(sizeof(int)*gcnumblock);
+       bamboo_smemtbl = (void *)gcsbstarttbl
+               + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR); 
 #else
        // create the lock table, lockresult table and obj queue
   locktable.size = 20;
@@ -169,9 +176,6 @@ void disruntimedata() {
        freeRuntimeHash(gcpointertbl);
        //freeMGCHash(gcpointertbl);
        freeMGCHash(gcforwardobjtbl);
-       if(gcsmemtbl != NULL) {
-               RUNFREE(gcsmemtbl);
-       }
 #else
        freeRuntimeHash(lockRedirectTbl);
        freeRuntimeHash(objRedirectLockTbl);
@@ -446,6 +450,13 @@ void checkCoreStatus() {
                                                } // if(!allStall)
                                        } // while(true)
 #endif
+
+                                       // gc_profile mode, output gc profiling data
+#ifdef MULTICORE_GC
+#ifdef GC_PROFILE
+                                       gc_outputProfileData();
+#endif // #ifdef GC_PROFILE
+#endif // #ifdef MULTICORE_GC
                                        disruntimedata();
                                        terminate(); // All done.
                                } // if(!waitconfirm)
@@ -1203,181 +1214,160 @@ inline void addNewObjInfo(void * nobj) {
 #endif
 
 #ifdef MULTICORE_GC
-struct freeMemItem * findFreeMemChunk_I(int coren,
-                                                   int isize,
-                                                   int * tofindb) {
-       struct freeMemItem * freemem = bamboo_free_mem_list->head;
-       struct freeMemItem * prev = NULL;
+void * localmalloc_I(int coren,
+                                int isize,
+                                int * allocsize) {
+       void * mem = NULL;
        int i = 0;
        int j = 0;
-       *tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
-       // check available shared mem chunks
+       int tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
+       int totest = tofindb;
+       int bound = BAMBOO_SMEM_SIZE_L;
+       int foundsmem = 0;
+       int size = 0;
        do {
-               int foundsmem = 0;
-               switch(bamboo_smem_mode) {
-                       case SMEMLOCAL: {
-                               int startb = freemem->startblock;
-                               int endb = freemem->endblock;
-                               while(startb > *tofindb) {
-                                       i++;
-                                       if(2==i) {
-                                               i = 0;
-                                               j++;
-                                       }
-                                       *tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
-                               } // while(startb > tofindb)
-                               if(startb <= *tofindb) {
-                                       if((endb >= *tofindb) && (freemem->size >= isize)) {
-                                               foundsmem = 1;
-                                       } else if(*tofindb > gcnumblock-1) {
-                                               // no more local mem
-                                               foundsmem = 2;
-                                       } // if(endb >= tofindb) 
-                               } // if(startb <= tofindb)
-                               break;
-                       }
-
-                       case SMEMFIXED: {
-                               int startb = freemem->startblock;
-                               int endb = freemem->endblock;
-                               if(startb <= *tofindb) {
-                                       if((endb >= *tofindb)  && (freemem->size >= isize)) {
-                                               foundsmem = 1;
-                                       } 
-                               } else {
-                                       // use the global mem
-                                       if(((startb > NUMCORES4GC-1) && (freemem->size >= isize)) || 
-                                                       ((endb > NUMCORES4GC-1) && ((freemem->size-
-                                                               (gcbaseva+BAMBOO_LARGE_SMEM_BOUND-freemem->ptr))>=isize))) {
-                                               foundsmem = 1;
-                                       }
-                               }
-                               break;
-                       }
-
-                       case SMEMMIXED: {
-                               // TODO not supported yet
-                               BAMBOO_EXIT(0xe001);
-                               break;
-                       }
-
-                       case SMEMGLOBAL: {
-                   foundsmem = (freemem->size >= isize);
-                               break;
-                       }
-                       default:
-                               break;
-               }
-
-               if(1 == foundsmem) {
-                       // found one
-                       break;
-               } else if (2 == foundsmem) {
-                       // terminate, no more mem
-                       freemem = NULL;
-                       break;
-               }
-               if(freemem->size == 0) {
-                       // an empty item, remove it
-                       struct freeMemItem * toremove = freemem;
-                       freemem = freemem->next;
-                       if(prev == NULL ){
-                               // the head
-                               bamboo_free_mem_list->head = freemem;
+               bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+               int nsize = bamboo_smemtbl[totest];
+               bool islocal = true;
+               if(nsize < bound) {
+                       bool tocheck = true;
+                       // have some space in the block
+                       if(totest == tofindb) {
+                               // the first partition
+                               size = bound - nsize;
+                       } else if(nsize == 0) {
+                               // an empty partition, can be appended
+                               size += bound;
                        } else {
-                               prev->next = freemem;
-                       }
-                       // put it to the tail of the list for reuse
-                       if(bamboo_free_mem_list->backuplist == NULL) {
-                               //toremove->next = bamboo_free_mem_list->backuplist;
-                               bamboo_free_mem_list->backuplist = toremove;
-                               bamboo_free_mem_list->backuplist->next = NULL;
-                       } else {
-                               // free it
-                               RUNFREE(toremove);
+                               // not an empty partition, can not be appended
+                               // the last continuous block is not big enough, go to check the next
+                               // local block
+                               islocal = true;
+                               tocheck = false;
+                       } // if(totest == tofindb) else if(nsize == 0) else ...
+                       if(tocheck) {
+                               if(size >= isize) {
+                                       // have enough space in the block, malloc
+                                       foundsmem = 1;
+                                       break;
+                               } else {
+                                       // not enough space yet, try to append the next continuous block
+                                       islocal = false;
+                               } // if(size >= isize) else ...
+                       } // if(tocheck)
+               } // if(nsize < bound)
+               if(islocal) {
+                       // no space in the block, go to check the next block
+                       i++;
+                       if(2==i) {
+                               i = 0;
+                               j++;
                        }
+                       tofindb = totest = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
                } else {
-                       prev = freemem;
-                       freemem = freemem->next;
+                       totest += 1;
+               } // if(islocal) else ...
+               if(totest > gcnumblock-1-bamboo_reserved_smem) {
+                       // no more local mem, do not find suitable block
+                       foundsmem = 2;
+                       break;
+               } // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+       } while(true);
+
+       if(foundsmem == 1) {
+               // find suitable block
+               mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC)?
+                               (BAMBOO_SMEM_SIZE_L*tofindb):(BAMBOO_LARGE_SMEM_BOUND+
+                                       (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+               *allocsize = size;
+               // set bamboo_smemtbl
+               for(i = tofindb; i <= totest; i++) {
+                       bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
                }
-       } while(freemem != NULL);
-
-       return freemem;
-} // struct freeMemItem * findFreeMemChunk_I(int, int, int *)
-
-void * localmalloc_I(int tofindb,
-                                int isize,
-                                struct freeMemItem * freemem,
-                                int * allocsize) {
-       void * mem = NULL;
-       int startb = freemem->startblock;
-       int endb = freemem->endblock;
-       int tmpptr = gcbaseva+((tofindb<NUMCORES4GC)?tofindb*BAMBOO_SMEM_SIZE_L
-               :BAMBOO_LARGE_SMEM_BOUND+(tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE);
-       if((freemem->size+freemem->ptr-tmpptr)>=isize) {
-               mem = (tmpptr>freemem->ptr)?((void *)tmpptr):(freemem->ptr);
-       } else {
-               mem = (void *)(freemem->size+freemem->ptr-isize);
-       }
-       // check the remaining space in this block
-       int remain = (int)(mem-gcbaseva);
-       int bound = (BAMBOO_SMEM_SIZE);
-       if(remain < BAMBOO_LARGE_SMEM_BOUND) {
-               bound = (BAMBOO_SMEM_SIZE_L);
-       }
-       remain = bound - remain%bound;
-       if(remain < isize) {
-               // this object acrosses blocks
-               *allocsize = isize;
-       } else {
-               // round the asigned block to the end of the current block
-               *allocsize = remain;
-       }
-       if(freemem->ptr == (int)mem) {
-               freemem->ptr = ((void*)freemem->ptr) + (*allocsize);
-               freemem->size -= *allocsize;
-               BLOCKINDEX(freemem->ptr, &(freemem->startblock));
-       } else if((freemem->ptr+freemem->size) == ((int)mem+(*allocsize))) {
-               freemem->size -= *allocsize;
-               BLOCKINDEX(((int)mem)-1, &(freemem->endblock));
-       } else {
-               struct freeMemItem * tmp = 
-                       (struct freeMemItem *)RUNMALLOC_I(sizeof(struct freeMemItem));
-               tmp->ptr = (int)mem+*allocsize;
-               tmp->size = freemem->ptr+freemem->size-(int)mem-*allocsize;
-               BLOCKINDEX(tmp->ptr, &(tmp->startblock));
-               tmp->endblock = freemem->endblock;
-               tmp->next = freemem->next;
-               freemem->next = tmp;
-               freemem->size = (int)mem - freemem->ptr;
-               BLOCKINDEX(((int)mem-1), &(freemem->endblock));
+       } else if(foundsmem == 2) {
+               // no suitable block
+               *allocsize = 0;
        }
+
        return mem;
-} // void * localmalloc_I(int, int, struct freeMemItem *, int *)
+} // void * localmalloc_I(int, int, int *)
 
-void * globalmalloc_I(int isize,
-                                 struct freeMemItem * freemem,
+void * globalmalloc_I(int coren,
+                                 int isize,
                                  int * allocsize) {
-       void * mem = (void *)(freemem->ptr);
-       // check the remaining space in this block
-       int remain = (int)(mem-gcbaseva);
-       int bound = (BAMBOO_SMEM_SIZE);
-       if(remain < BAMBOO_LARGE_SMEM_BOUND) {
-               bound = (BAMBOO_SMEM_SIZE_L);
+       void * mem = NULL;
+       int tofindb = bamboo_free_block; //0;
+       int totest = tofindb;
+       int bound = BAMBOO_SMEM_SIZE_L;
+       int foundsmem = 0;
+       int size = 0;
+       if(tofindb > gcnumblock-1-bamboo_reserved_smem) {
+               *allocsize = 0;
+               return NULL;
        }
-       remain = bound - remain%bound;
-       if(remain < isize) {
-               // this object acrosses blocks
-               *allocsize = isize;
-       } else {
-               // round the asigned block to the end of the current block
-               *allocsize = remain;
+       do {
+               bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+               int nsize = bamboo_smemtbl[totest];
+               bool isnext = false;
+               if(nsize < bound) {
+                       bool tocheck = true;
+                       // have some space in the block
+                       if(totest == tofindb) {
+                               // the first partition
+                               size = bound - nsize;
+                       } else if(nsize == 0) {
+                               // an empty partition, can be appended
+                               size += bound;
+                       } else {
+                               // not an empty partition, can not be appended
+                               // the last continuous block is not big enough, start another block
+                               isnext = true;
+                               tocheck = false;
+                       } // if(totest == tofindb) else if(nsize == 0) else ...
+                       if(tocheck) {
+                               if(size >= isize) {
+                                       // have enough space in the block, malloc
+                                       foundsmem = 1;
+                                       break;
+                               } // if(size >= isize)
+                       } // if(tocheck)
+               } else {
+                       isnext = true;
+               }// if(nsize < bound) else ...
+               totest += 1;
+               if(totest > gcnumblock-1-bamboo_reserved_smem) {
+                       // no more blocks to test, no suitable block found
+                       foundsmem = 2;
+                       break;
+               } // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+               if(isnext) {
+                       // start another block
+                       tofindb = totest;
+               } // if(isnext)
+       } while(true);
+
+       if(foundsmem == 1) {
+               // find suitable block
+               mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC)?
+                               (BAMBOO_SMEM_SIZE_L*tofindb):(BAMBOO_LARGE_SMEM_BOUND+
+                                       (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+               *allocsize = size;
+               // set bamboo_smemtbl
+               for(int i = tofindb; i <= totest; i++) {
+                       bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+               }
+               if(tofindb == bamboo_free_block) {
+                       bamboo_free_block = totest+1;
+               }
+       } else if(foundsmem == 2) {
+               // no suitable block
+               *allocsize = 0;
+               mem = NULL;
        }
-       freemem->ptr = ((void*)freemem->ptr) + (*allocsize);
-       freemem->size -= *allocsize;
+
        return mem;
-} // void * globalmalloc_I(int, struct freeMemItem *, int *)
-#endif
+} // void * globalmalloc_I(int, int, int *)
+#endif // #ifdef MULTICORE_GC
 
 // malloc from the shared memory
 void * smemalloc_I(int coren,
@@ -1386,47 +1376,36 @@ void * smemalloc_I(int coren,
        void * mem = NULL;
 #ifdef MULTICORE_GC
        int isize = size+(BAMBOO_CACHE_LINE_SIZE);
-       int toallocate = (isize>(BAMBOO_SMEM_SIZE)) ? (isize):(BAMBOO_SMEM_SIZE);
-       // go through free mem list for suitable chunks
-       int tofindb = 0;
-       struct freeMemItem * freemem = findFreeMemChunk_I(coren, isize, &tofindb);
-
-       // allocate shared mem if available
-       if(freemem != NULL) {
-               switch(bamboo_smem_mode) {
-                       case SMEMLOCAL: {
-                               mem = localmalloc_I(tofindb, isize, freemem, allocsize);
-                               break;
-                       }
 
-                       case SMEMFIXED: {
-                               int startb = freemem->startblock;
-                               int endb = freemem->endblock;
-                               if(startb > tofindb) {
-                                       // malloc on global mem
-                                       mem = globalmalloc_I(isize, freemem, allocsize);
-                               } else {
-                                       // malloc on local mem
-                                       mem = localmalloc_I(tofindb, isize, freemem, allocsize);
-                               }
-                               break;
-                       }
+       // go through the bamboo_smemtbl for suitable partitions
+       switch(bamboo_smem_mode) {
+               case SMEMLOCAL: {
+                 mem = localmalloc_I(coren, isize, allocsize);
+                       break;
+         }
 
-                       case SMEMMIXED: {
-                               // TODO not supported yet
-                               BAMBOO_EXIT(0xe002);
-                               break;
-                       }
+               case SMEMFIXED: {
+                       // TODO not supported yet
+                       BAMBOO_EXIT(0xe001);
+                       break;
+               }
 
-                       case SMEMGLOBAL: {
-                               mem = globalmalloc_I(isize,freemem, allocsize);
-                               break;
-                       }
+               case SMEMMIXED: {
+                       // TODO not supported yet
+                       BAMBOO_EXIT(0xe002);
+                       break;
+               }
 
-                       default:
-                               break;
+               case SMEMGLOBAL: {
+                       mem = globalmalloc_I(coren, isize, allocsize);
+                       break;
                }
-       } else {
+
+               default:
+                       break;
+       }
+
+       if(mem == NULL) {
 #else
        int toallocate = (size>(BAMBOO_SMEM_SIZE)) ? (size):(BAMBOO_SMEM_SIZE);
        mem = mspace_calloc(bamboo_free_msp, 1, toallocate);
@@ -1862,15 +1841,28 @@ msg:
                  BAMBOO_DEBUGPRINT(0xe88a);
 #endif
 #endif
+                       int allocsize = 0;
+                 void * mem = NULL;
 #ifdef MULTICORE_GC
                        if(gcprocessing) {
                                // is currently doing gc, dump this msg
+                               if(INITPHASE == gcphase) {
+                                       // if still in the initphase of gc, send a startinit msg again
+                                       if(isMsgSending) {
+                                               cache_msg_1(msgdata[2], GCSTARTINIT);
+                                       } else {
+                                               send_msg_1(msgdata[2], GCSTARTINIT, true);
+                                       }
+                               }
                                break;
-                       }
+                       } 
 #endif
-                       int allocsize = 0;
-                 void * mem = smemalloc_I(msgdata[2], msgdata[1], &allocsize);
+                       mem = smemalloc_I(msgdata[2], msgdata[1], &allocsize);
                        if(mem == NULL) {
+                               // in this case, the gcflag of the startup core has been set
+                               // and the gc should be started later, then a GCSTARTINIT msg
+                               // will be sent to the requesting core to notify it to start gc
+                               // and try malloc again
                                break;
                        }
                        // send the start_va to request core
@@ -1902,6 +1894,7 @@ msg:
          } else {
 #ifdef MULTICORE_GC
                        // fill header to store the size of this mem block
+                       memset(msgdata[1], 0, BAMBOO_CACHE_LINE_SIZE);
                        (*((int*)msgdata[1])) = msgdata[2];
                  bamboo_smem_size = msgdata[2] - BAMBOO_CACHE_LINE_SIZE;
                        bamboo_cur_msp = msgdata[1] + BAMBOO_CACHE_LINE_SIZE;
@@ -1967,7 +1960,8 @@ msg:
                BAMBOO_DEBUGPRINT(0xe88c);
                BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
-               if(msgdata[1] < NUMCORES4GC) {
+               // All cores should do init GC
+               if(msgdata[1] < NUMCORESACTIVE) {
                        gccorestatus[msgdata[1]] = 0;
                }
        }
@@ -1981,7 +1975,8 @@ msg:
 #endif
                  BAMBOO_EXIT(0xb002);
                }
-               if(msgdata[1] < NUMCORES4GC) {
+               // all cores should do mark
+               if(msgdata[1] < NUMCORESACTIVE) {
                        gccorestatus[msgdata[1]] = 0;
                        gcnumsendobjs[msgdata[1]] = msgdata[2];
                        gcnumreceiveobjs[msgdata[1]] = msgdata[3];
@@ -2003,6 +1998,7 @@ msg:
                int filledblocks = msgdata[2];
                int heaptop = msgdata[3];
                int data4 = msgdata[4];
+               // only gc cores need to do compact
                if(cnum < NUMCORES4GC) {
                        if(COMPACTPHASE == gcphase) {
                                gcfilledblocks[cnum] = filledblocks;
@@ -2022,39 +2018,6 @@ msg:
                                }
                        } else {
                                gccorestatus[cnum] = 0;
-                               // check if there is pending move request
-                               /*if(gcmovepending > 0) {
-                                       int j;
-                                       for(j = 0; j < NUMCORES4GC; j++) {
-                                               if(gcrequiredmems[j]>0) {
-                                                       break;
-                                               }
-                                       }
-                                       if(j < NUMCORES4GC) {
-                                               // find match
-                                               int tomove = 0;
-                                               int startaddr = 0;
-                                               gcrequiredmems[j] = assignSpareMem_I(cnum, 
-                                                                                                                                                                                          gcrequiredmems[j], 
-                                                                                                                                                                                          &tomove, 
-                                                                                                                                                                                          &startaddr);
-                                               if(STARTUPCORE == j) {
-                                                       gcdstcore = cnum;
-                                                       gctomove = true;
-                                                       gcmovestartaddr = startaddr;
-                                                       gcblock2fill = tomove;
-                                               } else {
-                                                       if(isMsgSending) {
-                                                               cache_msg_4(j, GCMOVESTART, cnum, startaddr, tomove);
-                                                       } else {
-                                                               send_msg_4(j, GCMOVESTART, cnum, startaddr, tomove, true);
-                                                       }
-                                               } // if(STARTUPCORE == j)
-                                               if(gcrequiredmems[j] == 0) {
-                                                       gcmovepending--;
-                                               }
-                                       } // if(j < NUMCORES4GC)
-                               } // if(gcmovepending > 0) */
                        } // if(data4>0)
                } // if(cnum < NUMCORES4GC)
          break;
@@ -2070,7 +2033,8 @@ msg:
 #endif
                  BAMBOO_EXIT(0xb004);
                } 
-               if(msgdata[1] < NUMCORES4GC) {
+               // all cores should do flush
+               if(msgdata[1] < NUMCORESACTIVE) {
                  gccorestatus[msgdata[1]] = 0;
                }
          break;
@@ -2084,8 +2048,9 @@ msg:
 
        case GCMARKCONFIRM: {
                // received a marked phase finish confirm request msg
+               // all cores should do mark
                if((BAMBOO_NUM_OF_CORE == STARTUPCORE) 
-                               || (BAMBOO_NUM_OF_CORE > NUMCORES4GC - 1)) {
+                               || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
                  // wrong core to receive such msg
                  BAMBOO_EXIT(0xb005);
                } else {
@@ -2238,9 +2203,6 @@ msg:
        default:
                break;
        }
-       /*for(; msgdataindex > 0; --msgdataindex) {
-               msgdata[msgdataindex-1] = -1;
-       }*/
   memset(msgdata, '\0', sizeof(int) * msgdataindex);
        msgdataindex = 0;
        msglength = BAMBOO_MSG_BUF_LENGTH;
@@ -2604,22 +2566,6 @@ newtask:
          //clock2 = BAMBOO_GET_EXE_TIME();
 
          for(i = 0; i < runtime_locklen; i++) {
-         /*for(i = 0; i < numparams; i++) {
-                 void * param = currtpd->parameterArray[i];
-                 int * lock = 0;
-                 bool insert = true;
-                 if(((struct ___Object___ *)param)->type == STARTUPTYPE) {
-                         islock = false;
-                         taskpointerarray[i+OFFSET]=param;
-                         goto execute;
-                 }
-                 if(((struct ___Object___ *)param)->lock == NULL) {
-                         lock = (int *)param;
-                 } else {
-                         lock = (int *)(((struct ___Object___ *)param)->lock);
-                 }
-                 */
-
                  int * lock = (int *)(runtime_locks[i].redirectlock);
                  islock = true;
                  // require locks for this parameter if it is not a startup object
@@ -2667,18 +2613,9 @@ newtask:
                                BAMBOO_DEBUGPRINT_REG(lock);
 #endif
                                // check if has the lock already
-                               /*bool giveup = true;
-                               for(j = 0; j < runtime_locklen; j++) {
-                         if(runtime_locks[j].value == lock) {
-                                 giveup = false;
-                                 break;
-                         }
-                 }
-                               if(giveup) {*/
                          // can not get the lock, try later
                          // release all grabbed locks for previous parameters
                          for(j = 0; j < i; ++j) { 
-                         //for(j = 0; j < runtime_locklen; ++j) {
                                  lock = (int*)(runtime_locks[j].redirectlock);
                                  releasewritelock(lock);
                          }
@@ -2697,12 +2634,7 @@ newtask:
 #endif
                          goto newtask;
                                //}
-                 }/* else { // line 2794: if(grount == 0)
-                 // TODO
-                 runtime_locks[runtime_locklen].value = (int)lock;
-                 runtime_locks[runtime_locklen].redirectlock = (int)param;
-                 runtime_locklen++;
-                 }*/
+                 }
          } // line 2752:  for(i = 0; i < runtime_locklen; i++)
 
          /*long clock3;
@@ -3249,4 +3181,184 @@ void toiNext(struct tagobjectiterator *it,
     Objnext(&it->it);
   }
 }
+
+#ifdef PROFILE
+inline void profileTaskStart(char * taskname) {
+  if(!taskInfoOverflow) {
+         TaskInfo* taskInfo = RUNMALLOC(sizeof(struct task_info));
+         taskInfoArray[taskInfoIndex] = taskInfo;
+         taskInfo->taskName = taskname;
+         taskInfo->startTime = BAMBOO_GET_EXE_TIME();
+         taskInfo->endTime = -1;
+         taskInfo->exitIndex = -1;
+         taskInfo->newObjs = NULL;
+  }
+}
+
+inline void profileTaskEnd() {
+  if(!taskInfoOverflow) {
+         taskInfoArray[taskInfoIndex]->endTime = BAMBOO_GET_EXE_TIME();
+         taskInfoIndex++;
+         if(taskInfoIndex == TASKINFOLENGTH) {
+                 taskInfoOverflow = true;
+                 //taskInfoIndex = 0;
+         }
+  }
+}
+
+// output the profiling data
+void outputProfileData() {
+#ifdef USEIO
+  int i;
+  unsigned long long totaltasktime = 0;
+  unsigned long long preprocessingtime = 0;
+  unsigned long long objqueuecheckingtime = 0;
+  unsigned long long postprocessingtime = 0;
+  //int interruptiontime = 0;
+  unsigned long long other = 0;
+  unsigned long long averagetasktime = 0;
+  int tasknum = 0;
+
+  printf("Task Name, Start Time, End Time, Duration, Exit Index(, NewObj Name, Num)+\n");
+  // output task related info
+  for(i = 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    unsigned long long duration = tmpTInfo->endTime - tmpTInfo->startTime;
+    printf("%s, %lld, %lld, %lld, %lld", 
+                       tmpTInfo->taskName, tmpTInfo->startTime, tmpTInfo->endTime, 
+                       duration, tmpTInfo->exitIndex);
+       // summarize new obj info
+       if(tmpTInfo->newObjs != NULL) {
+               struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
+               struct RuntimeIterator * iter = NULL;
+               while(0 == isEmpty(tmpTInfo->newObjs)) {
+                       char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+                       if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+                               int num = 0;
+                               RuntimeHashget(nobjtbl, (int)objtype, &num);
+                               RuntimeHashremovekey(nobjtbl, (int)objtype);
+                               num++;
+                               RuntimeHashadd(nobjtbl, (int)objtype, num);
+                       } else {
+                               RuntimeHashadd(nobjtbl, (int)objtype, 1);
+                       }
+                       //printf(stderr, "new obj!\n");
+               }
+
+               // output all new obj info
+               iter = RuntimeHashcreateiterator(nobjtbl);
+               while(RunhasNext(iter)) {
+                       char * objtype = (char *)Runkey(iter);
+                       int num = Runnext(iter);
+                       printf(", %s, %d", objtype, num);
+               }
+       }
+       printf("\n");
+    if(strcmp(tmpTInfo->taskName, "tpd checking") == 0) {
+      preprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "post task execution") == 0) {
+      postprocessingtime += duration;
+    } else if(strcmp(tmpTInfo->taskName, "objqueue checking") == 0) {
+      objqueuecheckingtime += duration;
+    } else {
+      totaltasktime += duration;
+      averagetasktime += duration;
+      tasknum++;
+    }
+  }
+
+  if(taskInfoOverflow) {
+    printf("Caution: task info overflow!\n");
+  }
+
+  other = totalexetime-totaltasktime-preprocessingtime-postprocessingtime;
+  averagetasktime /= tasknum;
+
+  printf("\nTotal time: %lld\n", totalexetime);
+  printf("Total task execution time: %lld (%d%%)\n", totaltasktime, 
+                          (int)(((double)totaltasktime/(double)totalexetime)*100));
+  printf("Total objqueue checking time: %lld (%d%%)\n", 
+                          objqueuecheckingtime, 
+                                (int)(((double)objqueuecheckingtime/(double)totalexetime)*100));
+  printf("Total pre-processing time: %lld (%d%%)\n", preprocessingtime, 
+                          (int)(((double)preprocessingtime/(double)totalexetime)*100));
+  printf("Total post-processing time: %lld (%d%%)\n", postprocessingtime, 
+                          (int)(((double)postprocessingtime/(double)totalexetime)*100));
+  printf("Other time: %lld (%d%%)\n", other, 
+                          (int)(((double)other/(double)totalexetime)*100));
+
+  printf("\nAverage task execution time: %lld\n", averagetasktime);
+#else
+  int i = 0;
+  int j = 0;
+
+  BAMBOO_DEBUGPRINT(0xdddd);
+  // output task related info
+  for(i= 0; i < taskInfoIndex; i++) {
+    TaskInfo* tmpTInfo = taskInfoArray[i];
+    char* tmpName = tmpTInfo->taskName;
+    int nameLen = strlen(tmpName);
+    BAMBOO_DEBUGPRINT(0xddda);
+    for(j = 0; j < nameLen; j++) {
+      BAMBOO_DEBUGPRINT_REG(tmpName[j]);
+    }
+    BAMBOO_DEBUGPRINT(0xdddb);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->startTime);
+    BAMBOO_DEBUGPRINT_REG(tmpTInfo->endTime);
+       BAMBOO_DEBUGPRINT_REG(tmpTInfo->exitIndex);
+       if(tmpTInfo->newObjs != NULL) {
+               struct RuntimeHash * nobjtbl = allocateRuntimeHash(5);
+               struct RuntimeIterator * iter = NULL;
+               while(0 == isEmpty(tmpTInfo->newObjs)) {
+                       char * objtype = (char *)(getItem(tmpTInfo->newObjs));
+                       if(RuntimeHashcontainskey(nobjtbl, (int)(objtype))) {
+                               int num = 0;
+                               RuntimeHashget(nobjtbl, (int)objtype, &num);
+                               RuntimeHashremovekey(nobjtbl, (int)objtype);
+                               num++;
+                               RuntimeHashadd(nobjtbl, (int)objtype, num);
+                       } else {
+                               RuntimeHashadd(nobjtbl, (int)objtype, 1);
+                       }
+               }
+
+               // output all new obj info
+               iter = RuntimeHashcreateiterator(nobjtbl);
+               while(RunhasNext(iter)) {
+                       char * objtype = (char *)Runkey(iter);
+                       int num = Runnext(iter);
+                       int nameLen = strlen(objtype);
+                       BAMBOO_DEBUGPRINT(0xddda);
+                       for(j = 0; j < nameLen; j++) {
+                               BAMBOO_DEBUGPRINT_REG(objtype[j]);
+                       }
+                       BAMBOO_DEBUGPRINT(0xdddb);
+                       BAMBOO_DEBUGPRINT_REG(num);
+               }
+       }
+    BAMBOO_DEBUGPRINT(0xdddc);
+  }
+
+  if(taskInfoOverflow) {
+    BAMBOO_DEBUGPRINT(0xefee);
+  }
+
+  // output interrupt related info
+  /*for(i = 0; i < interruptInfoIndex; i++) {
+       InterruptInfo* tmpIInfo = interruptInfoArray[i];
+       BAMBOO_DEBUGPRINT(0xddde);
+       BAMBOO_DEBUGPRINT_REG(tmpIInfo->startTime);
+       BAMBOO_DEBUGPRINT_REG(tmpIInfo->endTime);
+       BAMBOO_DEBUGPRINT(0xdddf);
+     }
+
+     if(interruptInfoOverflow) {
+       BAMBOO_DEBUGPRINT(0xefef);
+     }*/
+
+  BAMBOO_DEBUGPRINT(0xeeee);
+#endif
+}
+#endif  // #ifdef PROFILE
+
 #endif
index 931382bded42b298badd6fbeb4d5140ce7346c8f..04195913193484aaded4815e85cb5400ad287898 100755 (executable)
@@ -72,6 +72,7 @@ echo -o binary
 echo -nojava do not run bristlecone compiler
 echo -instructionfailures inject code for instructionfailures
 echo -profile build with profile options
+echo -gcprofile build with gcprofile options
 echo -accurateprofile build with accurate profile information including pre/post task processing info
 echo "-useio use standard io to output profiling data (should be used together with -raw and -profile), it only works with single core version"
 echo "-enable-assertions execute assert statements during compilation"
@@ -117,6 +118,7 @@ RAWCONFIG=''
 DEBUGFLAG=false
 RAWPATHFLAG=false
 PROFILEFLAG=false
+GCPROFILEFLAG=false
 ACCURATEPROFILEFLAG=false
 USEIOFLAG=false
 INTERRUPTFLAG=false
@@ -277,6 +279,9 @@ elif [[ $1 = '-profile' ]]
 then
 PROFILEFLAG=true
 EXTRAOPTIONS="$EXTRAOPTIONS -pg"
+elif [[ $1 = '-gcprofile' ]]
+then
+GCPROFILEFLAG=true
 elif [[ $1 = '-accurateprofile' ]]
 then
 ACCURATEPROFILEFLAG=true
@@ -299,11 +304,11 @@ JAVAOPTS="$JAVAOPTS -multicore"
 elif [[ $1 = '-numcore' ]]
 then
 JAVAOPTS="$JAVAOPTS -numcore $2"
-GCCORES="GC_$2"
 shift
 elif [[ $1 = '-numcore4gc' ]]
 then
 JAVAOPTS="$JAVAOPTS -numcore4gc $2"
+GCCORES="GC_$2"
 shift
 elif [[ $1 = '-raw' ]]
 then
@@ -689,6 +694,11 @@ then #MULTICOREGC version
 TILERACFLAGS="${TILERACFLAGS} -DMULTICORE_GC -D${GCCORES}"
 fi
 
+if $GCPROFILEFLAG
+then # GC_PROFILE version
+TILERACFLAGS="${TILERACFLAGS} -DGC_PROFILE"
+fi
+
 cp $ROBUSTROOT/Tilera/Runtime/$MAKEFILE ./Makefile
 cp $ROBUSTROOT/Tilera/Runtime/$SIMHVC ./sim.hvc
 cp $ROBUSTROOT/Tilera/Runtime/$PCIHVC ./pci.hvc