some bug fixes

[IRC.git] / Robust / src / Runtime / multicoretask.c
diff --git a/Robust/src/Runtime/multicoretask.c b/Robust/src/Runtime/multicoretask.c

index 7ba29a8518b56a29423172f4f88347ca162471a2..f7a830b5cdd146f274c04d14ee3578539bf6828f 100644 (file)
--- a/Robust/src/Runtime/multicoretask.c
+++ b/Robust/src/Runtime/multicoretask.c
@@ -25,20 +25,163 @@ int enqueuetasks_I(struct parameterwrapper *parameter,
                     int numenterflags);
  
  #ifdef MULTICORE_GC
+#ifdef SMEMF
+#define NUM_CORES2TEST 5 // entries per core2test row: the core itself plus up to 4 other cores
+#ifdef GC_1
+int core2test[1][NUM_CORES2TEST] = { // single-core build: core 0 only, every other slot is -1 (skipped by fixedmalloc_I)
+  {0, -1, -1, -1, -1}
+};
+#elif defined GC_56
+int core2test[56][NUM_CORES2TEST] = { // row i = {i, i-7, i+7, i-1, i+1}; -1 = no neighbour in that slot (presumably a 7-wide core grid)
+  { 0, -1,  7, -1,  1}, { 1, -1,  8,  0,  2}, { 2, -1,  9,  1,  3},
+  { 3, -1, 10,  2,  4}, { 4, -1, 11,  3,  5}, { 5, -1, 12,  4,  6},
+  { 6, -1, 13,  5, -1}, { 7,  0, 14, -1,  8}, { 8,  1, 15,  7,  9},
+  { 9,  2, 16,  8, 10}, {10,  3, 17,  9, 11}, {11,  4, 18, 10, 12},
+  {12,  5, 19, 11, 13}, {13,  6, 20, 12, -1}, {14,  7, 21, -1, 15},
+  {15,  8, 22, 14, 16}, {16,  9, 23, 15, 17}, {17, 10, 24, 16, 18},
+  {18, 11, 25, 17, 19}, {19, 12, 26, 18, 20}, {20, 13, 27, 19, -1},
+  {21, 14, 28, -1, 22}, {22, 15, 29, 21, 23}, {23, 16, 30, 22, 24},
+  {24, 17, 31, 23, 25}, {25, 18, 32, 24, 26}, {26, 19, 33, 25, 27},
+  {27, 20, 34, 26, -1}, {28, 21, 35, -1, 29}, {29, 22, 36, 28, 30},
+  {30, 23, 37, 29, 31}, {31, 24, 38, 30, 32}, {32, 25, 39, 31, 33},
+  {33, 26, 40, 32, 34}, {34, 27, 41, 33, -1}, {35, 28, 42, -1, 36},
+  {36, 29, 43, 35, 37}, {37, 30, 44, 36, 38}, {38, 31, 45, 37, 39},
+  {39, 32, 46, 38, 40}, {40, 33, 47, 39, 41}, {41, 34, 48, 40, -1},
+  {42, 35, 49, -1, 43}, {43, 36, 50, 42, 44}, {44, 37, 51, 43, 45},
+  {45, 38, 52, 44, 46}, {46, 39, 53, 45, 47}, {47, 40, 54, 46, 48},
+  {48, 41, 55, 47, -1}, {49, 42, -1, -1, 50}, {50, 43, -1, 49, 51},
+  {51, 44, -1, 50, 52}, {52, 45, -1, 51, 53}, {53, 46, -1, 52, 54},
+  {54, 47, -1, 53, 55}, {55, 48, -1, 54, -1}
+};
+#elif defined GC_62
+int core2test[62][NUM_CORES2TEST] = { // row i = {i, neighbour one grid row before, one grid row after, i-1, i+1}; -1 = none (grid layout assumed)
+  { 0, -1,  6, -1,  1}, { 1, -1,  7,  0,  2}, { 2, -1,  8,  1,  3},
+  { 3, -1,  9,  2,  4}, { 4, -1, 10,  3,  5}, { 5, -1, 11,  4, -1},
+  { 6,  0, 14, -1,  7}, { 7,  1, 15,  6,  8}, { 8,  2, 16,  7,  9},
+  { 9,  3, 17,  8, 10}, {10,  4, 18,  9, 11}, {11,  5, 19, 10, 12},
+  {12, -1, 20, 11, 13}, {13, -1, 21, 12, -1}, {14,  6, 22, -1, 15},
+  {15,  7, 23, 14, 16}, {16,  8, 24, 15, 17}, {17,  9, 25, 16, 18},
+  {18, 10, 26, 17, 19}, {19, 11, 27, 18, 20}, {20, 12, 28, 19, 21},
+  {21, 13, 29, 20, -1}, {22, 14, 30, -1, 23}, {23, 15, 31, 22, 24},
+  {24, 16, 32, 23, 25}, {25, 17, 33, 24, 26}, {26, 18, 34, 25, 27},
+  {27, 19, 35, 26, 28}, {28, 20, 36, 27, 29}, {29, 21, 37, 28, -1},
+  {30, 22, 38, -1, 31}, {31, 23, 39, 30, 32}, {32, 24, 40, 31, 33},
+  {33, 25, 41, 32, 34}, {34, 26, 42, 33, 35}, {35, 27, 43, 34, 36},
+  {36, 28, 44, 35, 37}, {37, 29, 45, 36, -1}, {38, 30, 46, -1, 39},
+  {39, 31, 47, 38, 40}, {40, 32, 48, 39, 41}, {41, 33, 49, 40, 42},
+  {42, 34, 50, 41, 43}, {43, 35, 51, 42, 44}, {44, 36, 52, 43, 45},
+  {45, 37, 53, 44, -1}, {46, 38, 54, -1, 47}, {47, 39, 55, 46, 48},
+  {48, 40, 56, 47, 49}, {49, 41, 57, 48, 50}, {50, 42, 58, 49, 51},
+  {51, 43, 59, 50, 52}, {52, 44, 60, 51, 53}, {53, 45, 61, 52, -1},
+  {54, 46, -1, -1, 55}, {55, 47, -1, 54, 56}, {56, 48, -1, 55, 57},
+  {57, 49, -1, 56, 58}, {58, 50, -1, 57, 59}, {59, 51, -1, 58, 60},
+  {60, 52, -1, 59, 61}, {61, 53, -1, 60, -1}
+};
+#endif // GC_1
+#elif defined SMEMM
+unsigned int gcmem_mixed_threshold = 0; // usage level at which mixedmalloc_I reports failure instead of falling back to globalmalloc_I
+unsigned int gcmem_mixed_usedmem = 0; // increased by mixedmalloc_I by the size of each region it reserves itself
+#define NUM_CORES2TEST 9 // entries per core2test row: the core itself plus up to 8 other cores
+#ifdef GC_1
+int core2test[1][NUM_CORES2TEST] = { // single-core build: core 0 only, every other slot is -1 (skipped by mixedmalloc_I)
+  {0, -1, -1, -1, -1, -1, -1, -1, -1}
+};
+#elif defined GC_56
+int core2test[56][NUM_CORES2TEST] = { // row i = {i, i-7, i+7, i-1, i+1, i-14, i+14, i-2, i+2}; -1 = no neighbour in that slot (presumably a 7-wide core grid)
+  { 0, -1,  7, -1,  1, -1, 14, -1,  2}, { 1, -1,  8,  0,  2, -1, 15, -1,  3}, 
+  { 2, -1,  9,  1,  3, -1, 16,  0,  4}, { 3, -1, 10,  2,  4, -1, 17,  1,  5}, 
+  { 4, -1, 11,  3,  5, -1, 18,  2,  6}, { 5, -1, 12,  4,  6, -1, 19,  3, -1},
+  { 6, -1, 13,  5, -1, -1, 20,  4, -1}, { 7,  0, 14, -1,  8, -1, 21, -1,  9}, 
+  { 8,  1, 15,  7,  9, -1, 22, -1, 10}, { 9,  2, 16,  8, 10, -1, 23,  7, 11}, 
+  {10,  3, 17,  9, 11, -1, 24,  8, 12}, {11,  4, 18, 10, 12, -1, 25,  9, 13},
+  {12,  5, 19, 11, 13, -1, 26, 10, -1}, {13,  6, 20, 12, -1, -1, 27, 11, -1}, 
+  {14,  7, 21, -1, 15,  0, 28, -1, 16}, {15,  8, 22, 14, 16,  1, 29, -1, 17}, 
+  {16,  9, 23, 15, 17,  2, 30, 14, 18}, {17, 10, 24, 16, 18,  3, 31, 15, 19},
+  {18, 11, 25, 17, 19,  4, 32, 16, 20}, {19, 12, 26, 18, 20,  5, 33, 17, -1}, 
+  {20, 13, 27, 19, -1,  6, 34, 18, -1}, {21, 14, 28, -1, 22,  7, 35, -1, 23}, 
+  {22, 15, 29, 21, 23,  8, 36, -1, 24}, {23, 16, 30, 22, 24,  9, 37, 21, 25},
+  {24, 17, 31, 23, 25, 10, 38, 22, 26}, {25, 18, 32, 24, 26, 11, 39, 23, 27}, 
+  {26, 19, 33, 25, 27, 12, 40, 24, -1}, {27, 20, 34, 26, -1, 13, 41, 25, -1}, 
+  {28, 21, 35, -1, 29, 14, 42, -1, 30}, {29, 22, 36, 28, 30, 15, 43, -1, 31},
+  {30, 23, 37, 29, 31, 16, 44, 28, 32}, {31, 24, 38, 30, 32, 17, 45, 29, 33}, 
+  {32, 25, 39, 31, 33, 18, 46, 30, 34}, {33, 26, 40, 32, 34, 19, 47, 31, -1}, 
+  {34, 27, 41, 33, -1, 20, 48, 32, -1}, {35, 28, 42, -1, 36, 21, 49, -1, 37},
+  {36, 29, 43, 35, 37, 22, 50, -1, 38}, {37, 30, 44, 36, 38, 23, 51, 35, 39}, 
+  {38, 31, 45, 37, 39, 24, 52, 36, 40}, {39, 32, 46, 38, 40, 25, 53, 37, 41}, 
+  {40, 33, 47, 39, 41, 26, 54, 38, -1}, {41, 34, 48, 40, -1, 27, 55, 39, -1},
+  {42, 35, 49, -1, 43, 28, -1, -1, 44}, {43, 36, 50, 42, 44, 29, -1, -1, 45}, 
+  {44, 37, 51, 43, 45, 30, -1, 42, 46}, {45, 38, 52, 44, 46, 31, -1, 43, 47}, 
+  {46, 39, 53, 45, 47, 32, -1, 44, 48}, {47, 40, 54, 46, 48, 33, -1, 45, -1},
+  {48, 41, 55, 47, -1, 34, -1, 46, -1}, {49, 42, -1, -1, 50, 35, -1, -1, 51}, 
+  {50, 43, -1, 49, 51, 36, -1, -1, 52}, {51, 44, -1, 50, 52, 37, -1, 49, 53}, 
+  {52, 45, -1, 51, 53, 38, -1, 50, 54}, {53, 46, -1, 52, 54, 39, -1, 51, 55},
+  {54, 47, -1, 53, 55, 40, -1, 52, -1}, {55, 48, -1, 54, -1, 41, -1, 53, -1}
+};
+#elif defined GC_62
+int core2test[62][NUM_CORES2TEST] = { // row i = {i, one row before, one row after, i-1, i+1, two rows before, two rows after, i-2, i+2}; -1 = none (grid layout assumed)
+  { 0, -1,  6, -1,  1, -1, 14, -1,  2}, { 1, -1,  7,  0,  2, -1, 15, -1,  3},
+  { 2, -1,  8,  1,  3, -1, 16,  0,  4}, { 3, -1,  9,  2,  4, -1, 17,  1,  5},
+  { 4, -1, 10,  3,  5, -1, 18,  2, -1}, { 5, -1, 11,  4, -1, -1, 19,  3, -1},
+  { 6,  0, 14, -1,  7, -1, 22, -1,  8}, { 7,  1, 15,  6,  8, -1, 23, -1,  9},
+  { 8,  2, 16,  7,  9, -1, 24,  6, 10}, { 9,  3, 17,  8, 10, -1, 25,  7, 11},
+  {10,  4, 18,  9, 11, -1, 26,  8, 12}, {11,  5, 19, 10, 12, -1, 27,  9, 13},
+  {12, -1, 20, 11, 13, -1, 28, 10, -1}, {13, -1, 21, 12, -1, -1, 29, 11, -1},
+  {14,  6, 22, -1, 15,  0, 30, -1, 16}, {15,  7, 23, 14, 16,  1, 31, -1, 17},
+  {16,  8, 24, 15, 17,  2, 32, 14, 18}, {17,  9, 25, 16, 18,  3, 33, 15, 19},
+  {18, 10, 26, 17, 19,  4, 34, 16, 20}, {19, 11, 27, 18, 20,  5, 35, 17, 21},
+  {20, 12, 28, 19, 21, -1, 36, 18, -1}, {21, 13, 29, 20, -1, -1, 37, 19, -1},
+  {22, 14, 30, -1, 23,  6, 38, -1, 24}, {23, 15, 31, 22, 24,  7, 39, -1, 25},
+  {24, 16, 32, 23, 25,  8, 40, 22, 26}, {25, 17, 33, 24, 26,  9, 41, 23, 27},
+  {26, 18, 34, 25, 27, 10, 42, 24, 28}, {27, 19, 35, 26, 28, 11, 43, 25, 29},
+  {28, 20, 36, 27, 29, 12, 44, 26, -1}, {29, 21, 37, 28, -1, 13, 45, 27, -1},
+  {30, 22, 38, -1, 31, 14, 46, -1, 32}, {31, 23, 39, 30, 32, 15, 47, -1, 33},
+  {32, 24, 40, 31, 33, 16, 48, 30, 34}, {33, 25, 41, 32, 34, 17, 49, 31, 35},
+  {34, 26, 42, 33, 35, 18, 50, 32, 36}, {35, 27, 43, 34, 36, 19, 51, 33, 37},
+  {36, 28, 44, 35, 37, 20, 52, 34, -1}, {37, 29, 45, 36, -1, 21, 53, 35, -1},
+  {38, 30, 46, -1, 39, 22, 54, -1, 40}, {39, 31, 47, 38, 40, 23, 55, -1, 41},
+  {40, 32, 48, 39, 41, 24, 56, 38, 42}, {41, 33, 49, 40, 42, 25, 57, 39, 43},
+  {42, 34, 50, 41, 43, 26, 58, 40, 44}, {43, 35, 51, 42, 44, 27, 59, 41, 45},
+  {44, 36, 52, 43, 45, 28, 60, 42, -1}, {45, 37, 53, 44, -1, 29, 61, 43, -1},
+  {46, 38, 54, -1, 47, 30, -1, -1, 48}, {47, 39, 55, 46, 48, 31, -1, -1, 49},
+  {48, 40, 56, 47, 49, 32, -1, 46, 50}, {49, 41, 57, 48, 50, 33, -1, 47, 51},
+  {50, 42, 58, 49, 51, 34, -1, 48, 52}, {51, 43, 59, 50, 52, 35, -1, 49, 53},
+  {52, 44, 60, 51, 53, 36, -1, 50, -1}, {53, 45, 61, 52, -1, 37, -1, 51, -1},
+  {54, 46, -1, -1, 55, 38, -1, -1, 56}, {55, 47, -1, 54, 56, 39, -1, -1, 57},
+  {56, 48, -1, 55, 57, 40, -1, 54, 58}, {57, 49, -1, 56, 58, 41, -1, 55, 59},
+  {58, 50, -1, 57, 59, 42, -1, 56, 60}, {59, 51, -1, 58, 60, 43, -1, 57, 61},
+  {60, 52, -1, 59, 61, 44, -1, 58, -1}, {61, 53, -1, 60, -1, 45, -1, 59, -1}
+};
+#endif // GC_1
+#endif
+
  inline __attribute__((always_inline))
  void setupsmemmode(void) {
  #ifdef SMEML
+  // Local mode: each core is only given memory chunks from its own local
+  // shared memory. When a core has used that up, gc is started.
    bamboo_smem_mode = SMEMLOCAL;
  #elif defined SMEMF
+  // Fixed mode: a core's own local shared memory is tried first. When it is
+  // used up, the shared memory that belongs to its neighbours is tried; if
+  // that also fails, gc is started.
    bamboo_smem_mode = SMEMFIXED;
  #elif defined SMEMM
+  // Mixed mode: a core's own local shared memory is tried first. When it is
+  // used up, the shared memory that belongs to its neighbours is tried.
+  // If that fails too, the current memory allocation rate is checked:
+  // if it has already reached the threshold, gc is started; otherwise
+  // shared memory is allocated globally. If all the shared memory has
+  // been used up, gc is started.
    bamboo_smem_mode = SMEMMIXED;
  #elif defined SMEMG
+  // Global mode: all memory chunks are allocated globally, without regard
+  // to the host cores. When all the shared memory is used up, gc is started.
    bamboo_smem_mode = SMEMGLOBAL;
  #else
    // defaultly using local mode
    //bamboo_smem_mode = SMEMLOCAL;
-  bamboo_smem_mode = SMEMGLOBAL;
+  //bamboo_smem_mode = SMEMGLOBAL;
+  //bamboo_smem_mode = SMEMFIXED; // NOTE(review): no mode is assigned on this path any more - confirm that is intended
  #endif
  } // void setupsmemmode(void)
  #endif
@@ -59,8 +202,8 @@ void initruntimedata() {
  #endif
  #ifdef MULTICORE_GC
        gccorestatus[i] = 1;
-      gcnumsendobjs[i] = 0;
-      gcnumreceiveobjs[i] = 0;
+      gcnumsendobjs[0][i] = gcnumsendobjs[1][i] = 0;
+      gcnumreceiveobjs[0][i] = gcnumreceiveobjs[1][i] = 0;
  #endif
      } // for(i = 0; i < NUMCORESACTIVE; ++i)
  #ifdef MULTICORE_GC
@@ -73,6 +216,8 @@ void initruntimedata() {
  #ifdef GC_PROFILE
      gc_infoIndex = 0;
      gc_infoOverflow = false;
+       gc_num_livespace = 0;
+       gc_num_freespace = 0;
  #endif
  #endif
      numconfirm = 0;
@@ -100,7 +245,7 @@ void initruntimedata() {
    outmsglast = 0;
    outmsgleft = 0;
    isMsgHanging = false;
-  isMsgSending = false;
+  //isMsgSending = false;
  
    smemflag = true;
    bamboo_cur_msp = NULL;
@@ -108,6 +253,7 @@ void initruntimedata() {
    totransobjqueue = createQueue_I();
  
  #ifdef MULTICORE_GC
+  bamboo_smem_zero_top = NULL;
    gcflag = false;
    gcprocessing = false;
    gcphase = FINISHPHASE;
@@ -115,13 +261,16 @@ void initruntimedata() {
    gcself_numsendobjs = 0;
    gcself_numreceiveobjs = 0;
    gcmarkedptrbound = 0;
-  //mgchashCreate(2000, 0.75);
+#ifdef LOCALHASHTBL_TEST
    gcpointertbl = allocateRuntimeHash_I(20);
-  //gcpointertbl = allocateMGCHash(20);
+#else
+  gcpointertbl = mgchashCreate_I(2000, 0.75);
+#endif
+  //gcpointertbl = allocateMGCHash_I(20);
    gcforwardobjtbl = allocateMGCHash_I(20, 3);
    gcobj2map = 0;
    gcmappedobj = 0;
-  gcismapped = false;
+  //gcismapped = false;
    gcnumlobjs = 0;
    gcheaptop = 0;
    gctopcore = 0;
@@ -132,7 +281,34 @@ void initruntimedata() {
    gcblock2fill = 0;
    gcsbstarttbl = BAMBOO_BASE_VA;
    bamboo_smemtbl = (void *)gcsbstarttbl
-                   + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR);
+               + (BAMBOO_SHARED_MEM_SIZE/BAMBOO_SMEM_SIZE)*sizeof(INTPTR);
+  if(BAMBOO_NUM_OF_CORE < NUMCORES4GC) {
+       int t_size = ((BAMBOO_RMSP_SIZE)-sizeof(mgcsharedhashtbl_t)*2
+               -128*sizeof(size_t))/sizeof(mgcsharedhashlistnode_t)-2;
+       int kk = 0;
+       unsigned int tmp_k = 1 << (sizeof(int)*8 -1);
+       while(((t_size & tmp_k) == 0) && (kk < sizeof(int)*8)) {
+         t_size = t_size << 1;
+         kk++;
+       }
+       t_size = tmp_k >> kk;
+       gcsharedptbl = mgcsharedhashCreate_I(t_size,0.30);//allocateGCSharedHash_I(20);
+  } else {
+       gcsharedptbl = NULL;
+  }
+  BAMBOO_MEMSET_WH(gcrpointertbls,0,sizeof(mgcsharedhashtbl_t *)*NUMCORES4GC);
+         //sizeof(struct RuntimeHash *)*NUMCORES4GC);
+#ifdef SMEMM
+  gcmem_mixed_threshold = (unsigned int)((BAMBOO_SHARED_MEM_SIZE
+               -bamboo_reserved_smem*BAMBOO_SMEM_SIZE)*0.8);
+  gcmem_mixed_usedmem = 0;
+#endif
+#ifdef GC_PROFILE//_S
+  gc_num_obj = 0;
+  gc_num_liveobj = 0;
+  gc_num_forwardobj = 0;
+  gc_num_profiles = NUMCORESACTIVE - 1;
+#endif
  #else
    // create the lock table, lockresult table and obj queue
    locktable.size = 20;
@@ -180,10 +356,15 @@ void initruntimedata() {
  inline __attribute__((always_inline))
  void disruntimedata() {
  #ifdef MULTICORE_GC
-  //mgchashDelete();
+#ifdef LOCALHASHTBL_TEST
    freeRuntimeHash(gcpointertbl);
+#else
+  mgchashDelete(gcpointertbl);
+#endif
    //freeMGCHash(gcpointertbl);
    freeMGCHash(gcforwardobjtbl);
+  // for mapping info structures
+  //freeRuntimeHash(gcrcoretbl);
  #else
    freeRuntimeHash(lockRedirectTbl);
    freeRuntimeHash(objRedirectLockTbl);
@@ -240,11 +421,6 @@ bool checkObjQueue() {
      getwritelock_I(obj);
      while(!lockflag) {
        BAMBOO_WAITING_FOR_LOCK(0);
-         // check for outgoing sends
-         if (isMsgHanging) {
-               extern inline void send_hanging_msg(bool);
-               send_hanging_msg(true);
-         } 
      }             // while(!lockflag)
      grount = lockresult;
  #ifdef DEBUG
@@ -403,8 +579,8 @@ void checkCoreStatus() {
           for(i = 1; i < NUMCORESACTIVE; ++i) {
             corestatus[i] = 1;
             // send status confirm msg to core i
-           send_msg_1(i, STATUSCONFIRM);
-         }                               // for(i = 1; i < NUMCORESACTIVE; ++i)
+           send_msg_1(i, STATUSCONFIRM, false);
+         }   // for(i = 1; i < NUMCORESACTIVE; ++i)
           return;
         } else {
           // all the core status info are the latest
@@ -423,7 +599,9 @@ void checkCoreStatus() {
  
           BAMBOO_DEBUGPRINT(BAMBOO_GET_EXE_TIME() - bamboo_start_time);
           //BAMBOO_DEBUGPRINT_REG(total_num_t6); // TODO for test
+#ifndef BAMBOO_MEMPROF
           BAMBOO_DEBUGPRINT(0xbbbbbbbb);
+#endif
  #endif
           // profile mode, send msgs to other cores to request pouring
           // out progiling data
@@ -434,7 +612,7 @@ void checkCoreStatus() {
  #endif
           for(i = 1; i < NUMCORESACTIVE; ++i) {
             // send profile request msg to core i
-           send_msg_2(i, PROFILEOUTPUT, totalexetime);
+           send_msg_2(i, PROFILEOUTPUT, totalexetime, false);
           } // for(i = 1; i < NUMCORESACTIVE; ++i)
           // pour profiling data on startup core
           outputProfileData();
@@ -454,8 +632,8 @@ void checkCoreStatus() {
               BAMBOO_DEBUGPRINT(0xe000 + profilestatus[i]);
  #endif
               if(profilestatus[i] != 0) {
-               allStall = false;
-               break;
+                       allStall = false;
+                       break;
               }
             }  // for(i = 0; i < NUMCORESACTIVE; ++i)
             if(!allStall) {
@@ -469,11 +647,14 @@ void checkCoreStatus() {
             } else {
               BAMBOO_ENTER_CLIENT_MODE_FROM_RUNTIME();
               break;
-           }                                     // if(!allStall)
-         }                               // while(true)
+           }  // if(!allStall)
+         }  // while(true)
  #endif
  
           // gc_profile mode, ourput gc prfiling data
+#ifdef BAMBOO_MEMPROF
+         //terminatememprof();
+#endif // #ifdef BAMBOO_MEMPROF
  #ifdef MULTICORE_GC
  #ifdef GC_PROFILE
           gc_outputProfileData();
@@ -522,6 +703,7 @@ inline void run(void * arg) {
    BAMBOO_DEBUGPRINT_REG(corenum);
    BAMBOO_DEBUGPRINT(STARTUPCORE);
  #endif
+ //BAMBOO_DEBUGPRINT(0xeeee); // TODO
  
    // initialize runtime data structures
    initruntimedata();
@@ -624,7 +806,7 @@ inline void run(void * arg) {
  #endif
               // send stall msg
               send_msg_4(STARTUPCORE, TRANSTALL, BAMBOO_NUM_OF_CORE,
-                        self_numsendobjs, self_numreceiveobjs);
+                        self_numsendobjs, self_numreceiveobjs, false);
               sendStall = true;
               isfirst = true;
               busystatus = false;
@@ -1041,7 +1223,7 @@ void enqueueObject(void * vptr,
             //slotid is parameter->tagarray[2*i];
             int tagid=parameter->tagarray[2*i+1];
             if (tagid!=tagptr->flag)
-             goto nextloop;                                           /*We don't have this tag */
+             goto nextloop;           /*We don't have this tag */
           }
         } else {                         //multiple tags
           struct ArrayObject * ao=(struct ArrayObject *) tagptr;
@@ -1106,7 +1288,7 @@ void enqueueObject_I(void * vptr,
        /* Check tags */
        if (parameter->numbertags>0) {
         if (tagptr==NULL)
-         goto nextloop;                               //that means the object has no tag
+         goto nextloop;      //that means the object has no tag
         //but that param needs tag
         else if(tagptr->type==TAGTYPE) {                         //one tag
           //struct ___TagDescriptor___ * tag=(struct ___TagDescriptor___*) tagptr;
@@ -1114,7 +1296,7 @@ void enqueueObject_I(void * vptr,
             //slotid is parameter->tagarray[2*i];
             int tagid=parameter->tagarray[2*i+1];
             if (tagid!=tagptr->flag)
-             goto nextloop;                                           /*We don't have this tag */
+             goto nextloop;            /*We don't have this tag */
           }
         } else {                         //multiple tags
           struct ArrayObject * ao=(struct ArrayObject *) tagptr;
@@ -1237,13 +1419,16 @@ inline void addNewObjInfo(void * nobj) {
  #endif
  
  #ifdef MULTICORE_GC
+// Only allocate local mem chunks to each core.
+// If a core has used up its local shared memory, start gc.
  void * localmalloc_I(int coren,
                       int isize,
                       int * allocsize) {
    void * mem = NULL;
+  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
    int i = 0;
    int j = 0;
-  int tofindb = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
+  int tofindb = gc_core2block[2*gccorenum+i]+(NUMCORES4GC*2)*j;
    int totest = tofindb;
    int bound = BAMBOO_SMEM_SIZE_L;
    int foundsmem = 0;
@@ -1256,56 +1441,56 @@ void * localmalloc_I(int coren,
        bool tocheck = true;
        // have some space in the block
        if(totest == tofindb) {
-       // the first partition
-       size = bound - nsize;
+               // the first partition
+               size = bound - nsize;
        } else if(nsize == 0) {
-       // an empty partition, can be appended
-       size += bound;
+               // an empty partition, can be appended
+               size += bound;
        } else {
-       // not an empty partition, can not be appended
-       // the last continuous block is not big enough, go to check the next
-       // local block
-       islocal = true;
-       tocheck = false;
-      }                   // if(totest == tofindb) else if(nsize == 0) else ...
+               // not an empty partition, can not be appended
+               // the last continuous block is not big enough, go to check the next
+               // local block
+               islocal = true;
+               tocheck = false;
+      } // if(totest == tofindb) else if(nsize == 0) else ...
        if(tocheck) {
-       if(size >= isize) {
-         // have enough space in the block, malloc
-         foundsmem = 1;
-         break;
-       } else {
-         // no enough space yet, try to append next continuous block
-         islocal = false;
-       }                         // if(size > isize) else ...
-      }                   // if(tocheck)
-    }             // if(nsize < bound)
+               if(size >= isize) {
+                 // have enough space in the block, malloc
+                 foundsmem = 1;
+                 break;
+               } else {
+                 // no enough space yet, try to append next continuous block
+                 islocal = false;
+               }  // if(size > isize) else ...
+      }  // if(tocheck)
+    } // if(nsize < bound)
      if(islocal) {
        // no space in the block, go to check the next block
        i++;
        if(2==i) {
-       i = 0;
-       j++;
+               i = 0;
+               j++;
        }
-      tofindb = totest = gc_core2block[2*coren+i]+(NUMCORES4GC*2)*j;
+      tofindb = totest = gc_core2block[2*gccorenum+i]+(NUMCORES4GC*2)*j;
      } else {
        totest += 1;
-    }             // if(islocal) else ...
+    }  // if(islocal) else ...
      if(totest > gcnumblock-1-bamboo_reserved_smem) {
        // no more local mem, do not find suitable block
        foundsmem = 2;
        break;
-    }             // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
    } while(true);
  
    if(foundsmem == 1) {
      // find suitable block
      mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
-                                            (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
-                                                                            (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
      *allocsize = size;
      // set bamboo_smemtbl
      for(i = tofindb; i <= totest; i++) {
-      bamboo_smemtbl[i]=(i<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
      }
    } else if(foundsmem == 2) {
      // no suitable block
@@ -1315,6 +1500,222 @@ void * localmalloc_I(int coren,
    return mem;
  } // void * localmalloc_I(int, int, int *)
  
+#ifdef SMEMF
+// Fixed-mode allocator: search the blocks of core `coren` first, then the
+// blocks of every core listed for it in core2test[] (-1 entries are skipped).
+// coren     - requesting core; values >= NUMCORES4GC are folded with '%'.
+// isize     - smallest region size that satisfies the request.
+// allocsize - out: size actually reserved, or 0 when nothing was found.
+// Returns the start of the reserved region, or NULL when the search fails.
+void * fixedmalloc_I(int coren,
+                     int isize,
+                     int * allocsize) {
+  void * mem = NULL;
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
+  int tofindb = gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+  int totest = tofindb;
+  int bound = BAMBOO_SMEM_SIZE_L;
+  int foundsmem = 0;
+  int size = 0;
+  do {
+    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    int nsize = bamboo_smemtbl[totest];
+    bool islocal = true;
+    if(nsize < bound) {
+      bool tocheck = true;
+      // have some space in the block
+      if(totest == tofindb) {
+        // the first partition
+        size = bound - nsize;
+      } else if(nsize == 0) {
+        // an empty partition, can be appended
+        size += bound;
+      } else {
+        // not an empty partition, can not be appended
+        // the last continuous block is not big enough, go to check the next
+        // local block
+        islocal = true;
+        tocheck = false;
+      } // if(totest == tofindb) else if(nsize == 0) else ...
+      if(tocheck) {
+        if(size >= isize) {
+          // have enough space in the block, malloc
+          foundsmem = 1;
+          break;
+        } else {
+          // no enough space yet, try to append next continuous block
+          // TODO may consider to go to next local block?
+          islocal = false;
+        }  // if(size >= isize) else ...
+      }  // if(tocheck)
+    } // if(nsize < bound)
+    if(islocal) {
+      // no space in the block, go to check the next block
+      i++;
+      if(2==i) {
+        i = 0;
+        j++;
+      }
+      tofindb=totest=
+        gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    } else {
+      totest += 1;
+    }  // if(islocal) else ...
+    if(totest > gcnumblock-1-bamboo_reserved_smem) {
+      // no more local mem, do not find suitable block on local mem
+      // try to malloc shared memory assigned to the neighbour cores
+      do{
+        k++;
+        if(k >= NUM_CORES2TEST) {
+          // no more memory available on either coren or its neighbour cores
+          foundsmem = 2;
+          goto memsearchresult;
+        }
+      } while(core2test[gccorenum][k] == -1);
+      i = 0;
+      j = 0;
+      tofindb=totest=gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+  } while(true);
+
+memsearchresult:
+  if(foundsmem == 1) {
+    // find suitable block
+    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+    *allocsize = size;
+    // mark every block in [tofindb, totest] as completely used
+    for(i = tofindb; i <= totest; i++) {
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+    }
+  } else if(foundsmem == 2) {
+    // no suitable block
+    *allocsize = 0;
+  }
+
+  return mem;
+} // void * fixedmalloc_I(int, int, int *)
+#endif // #ifdef SMEMF
+
+#ifdef SMEMM
+// Mixed-mode allocator: search the blocks of core `coren`, then the blocks of
+// every core listed for it in core2test[] (-1 entries are skipped). If that
+// fails and gcmem_mixed_usedmem is still below gcmem_mixed_threshold, fall
+// back to globalmalloc_I. Every successful reservation, local or global, is
+// added to gcmem_mixed_usedmem. On failure *allocsize is set to 0 and NULL
+// is returned; otherwise *allocsize holds the size actually reserved.
+void * mixedmalloc_I(int coren,
+                     int isize,
+                     int * allocsize) {
+  void * mem = NULL;
+  int i = 0;
+  int j = 0;
+  int k = 0;
+  int gccorenum = (coren < NUMCORES4GC) ? (coren) : (coren % NUMCORES4GC);
+  int tofindb = gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+  int totest = tofindb;
+  int bound = BAMBOO_SMEM_SIZE_L;
+  int foundsmem = 0;
+  int size = 0;
+  do {
+    bound = (totest < NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+    int nsize = bamboo_smemtbl[totest];
+    bool islocal = true;
+    if(nsize < bound) {
+      bool tocheck = true;
+      // have some space in the block
+      if(totest == tofindb) {
+        // the first partition
+        size = bound - nsize;
+      } else if(nsize == 0) {
+        // an empty partition, can be appended
+        size += bound;
+      } else {
+        // not an empty partition, can not be appended
+        // the last continuous block is not big enough, go to check the next
+        // local block
+        islocal = true;
+        tocheck = false;
+      } // if(totest == tofindb) else if(nsize == 0) else ...
+      if(tocheck) {
+        if(size >= isize) {
+          // have enough space in the block, malloc
+          foundsmem = 1;
+          break;
+        } else {
+          // no enough space yet, try to append next continuous block
+          // TODO may consider to go to next local block?
+          islocal = false;
+        }  // if(size >= isize) else ...
+      }  // if(tocheck)
+    } // if(nsize < bound)
+    if(islocal) {
+      // no space in the block, go to check the next block
+      i++;
+      if(2==i) {
+        i = 0;
+        j++;
+      }
+      tofindb=totest=gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    } else {
+      totest += 1;
+    }  // if(islocal) else ...
+    if(totest > gcnumblock-1-bamboo_reserved_smem) {
+      // no more local mem, do not find suitable block on local mem
+      // try to malloc shared memory assigned to the neighbour cores
+      do{
+        k++;
+        if(k >= NUM_CORES2TEST) {
+          if(gcmem_mixed_usedmem >= gcmem_mixed_threshold) {
+            // no more memory available on either coren or its neighbour cores
+            foundsmem = 2;
+            goto memmixedsearchresult;
+          } else {
+            // try allocate globally, and count what was obtained
+            mem = globalmalloc_I(coren, isize, allocsize);
+            if(mem != NULL) {
+              gcmem_mixed_usedmem += *allocsize;
+            }
+            return mem;
+          }
+        }
+      } while(core2test[gccorenum][k] == -1);
+      i = 0;
+      j = 0;
+      tofindb=totest=gc_core2block[2*core2test[gccorenum][k]+i]+(NUMCORES4GC*2)*j;
+    }  // if(totest > gcnumblock-1-bamboo_reserved_smem) ...
+  } while(true);
+
+memmixedsearchresult:
+  if(foundsmem == 1) {
+    // find suitable block
+    mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+    *allocsize = size;
+    // mark every block in [tofindb, totest] as completely used
+    for(i = tofindb; i <= totest; i++) {
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
+    }
+    gcmem_mixed_usedmem += size;
+    if(tofindb == bamboo_free_block) {
+      bamboo_free_block = totest+1;
+    }
+  } else if(foundsmem == 2) {
+    // no suitable block
+    *allocsize = 0;
+  }
+
+  return mem;
+} // void * mixedmalloc_I(int, int, int *)
+#endif // #ifdef SMEMM
+
+// Allocate all the memory chunks globally, do not consider the host cores
+// When all the shared memory are used up, start gc.
  void * globalmalloc_I(int coren,
                        int isize,
                        int * allocsize) {
@@ -1325,6 +1726,7 @@ void * globalmalloc_I(int coren,
    int foundsmem = 0;
    int size = 0;
    if(tofindb > gcnumblock-1-bamboo_reserved_smem) {
+       // Out of shared memory
      *allocsize = 0;
      return NULL;
    }
@@ -1336,24 +1738,24 @@ void * globalmalloc_I(int coren,
        bool tocheck = true;
        // have some space in the block
        if(totest == tofindb) {
-       // the first partition
-       size = bound - nsize;
+               // the first partition
+               size = bound - nsize;
        } else if(nsize == 0) {
-       // an empty partition, can be appended
-       size += bound;
+               // an empty partition, can be appended
+               size += bound;
        } else {
-       // not an empty partition, can not be appended
-       // the last continuous block is not big enough, start another block
-       isnext = true;
-       tocheck = false;
-      }                   // if(totest == tofindb) else if(nsize == 0) else ...
+               // not an empty partition, can not be appended
+               // the last continuous block is not big enough, start another block
+               isnext = true;
+               tocheck = false;
+      }  // if(totest == tofindb) else if(nsize == 0) else ...
        if(tocheck) {
-       if(size >= isize) {
-         // have enough space in the block, malloc
-         foundsmem = 1;
-         break;
-       }                         // if(size > isize)
-      }                   // if(tocheck)
+               if(size >= isize) {
+                 // have enough space in the block, malloc
+                 foundsmem = 1;
+                 break;
+               }  // if(size > isize)
+      }   // if(tocheck)
      } else {
        isnext = true;
      }            // if(nsize < bound) else ...
@@ -1366,18 +1768,18 @@ void * globalmalloc_I(int coren,
      if(isnext) {
        // start another block
        tofindb = totest;
-    }             // if(islocal)
+    } // if(isnext)
    } while(true);
  
    if(foundsmem == 1) {
      // find suitable block
      mem = gcbaseva+bamboo_smemtbl[tofindb]+((tofindb<NUMCORES4GC) ?
-                                            (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
-                                                                            (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
+          (BAMBOO_SMEM_SIZE_L*tofindb) : (BAMBOO_LARGE_SMEM_BOUND+
+          (tofindb-NUMCORES4GC)*BAMBOO_SMEM_SIZE));
      *allocsize = size;
      // set bamboo_smemtbl
      for(int i = tofindb; i <= totest; i++) {
-      bamboo_smemtbl[i]=(i<NUMCORES4GC) ? BAMBOO_SMEM_SIZE_L : BAMBOO_SMEM_SIZE;
+      bamboo_smemtbl[i]=(i<NUMCORES4GC)?BAMBOO_SMEM_SIZE_L:BAMBOO_SMEM_SIZE;
      }
      if(tofindb == bamboo_free_block) {
        bamboo_free_block = totest+1;
@@ -1408,14 +1810,22 @@ void * smemalloc_I(int coren,
    }
  
    case SMEMFIXED: {
-    // TODO not supported yet
-    BAMBOO_EXIT(0xe001);
+#ifdef SMEMF
+       mem = fixedmalloc_I(coren, isize, allocsize);
+#else
+       // not supported yet
+       BAMBOO_EXIT(0xe001);
+#endif
      break;
    }
  
    case SMEMMIXED: {
-    // TODO not supported yet
+#ifdef SMEMM
+       mem = mixedmalloc_I(coren, isize, allocsize);
+#else
+       // not supported yet
      BAMBOO_EXIT(0xe002);
+#endif
      break;
    }
  
@@ -1488,6 +1898,7 @@ INLINE int checkMsgLength_I(int size) {
  #ifdef MULTICORE_GC
    case GCSTARTINIT:
    case GCSTART:
+  case GCSTARTMAPINFO:
    case GCSTARTFLUSH:
    case GCFINISH:
    case GCMARKCONFIRM:
@@ -1502,9 +1913,10 @@ INLINE int checkMsgLength_I(int size) {
    case PROFILEFINISH:
  #ifdef MULTICORE_GC
    case GCSTARTCOMPACT:
+  case GCMARKEDOBJ:
    case GCFINISHINIT:
+  case GCFINISHMAPINFO:
    case GCFINISHFLUSH:
-  case GCMARKEDOBJ:
  #endif
      {
        msglength = 2;
@@ -1516,6 +1928,7 @@ INLINE int checkMsgLength_I(int size) {
  #ifdef MULTICORE_GC
    case GCMAPREQUEST:
    case GCMAPINFO:
+  case GCMAPTBL:
    case GCLOBJMAPPING:
  #endif
      {
@@ -1533,6 +1946,9 @@ INLINE int checkMsgLength_I(int size) {
  #ifdef MULTICORE_GC
    case GCFINISHMARK:
    case GCMOVESTART:
+#ifdef GC_PROFILE//_S
+  case GCPROFILES:
+#endif
  #endif
      {
        msglength = 4;
@@ -1572,7 +1988,10 @@ INLINE int checkMsgLength_I(int size) {
    default:
    {
      BAMBOO_DEBUGPRINT_REG(type);
+       BAMBOO_DEBUGPRINT_REG(size);
      BAMBOO_DEBUGPRINT_REG(msgdataindex);
+       BAMBOO_DEBUGPRINT_REG(msgdatalast);
+       BAMBOO_DEBUGPRINT_REG(msgdatafull);
      int i = 6;
      while(i-- > 0) {
        BAMBOO_DEBUGPRINT(msgdata[msgdataindex+i]);
@@ -1596,7 +2015,7 @@ INLINE int checkMsgLength_I(int size) {
  
  INLINE void processmsg_transobj_I() {
    MSG_INDEXINC_I();
-  struct transObjInfo * transObj = RUNMALLOC_I(sizeof(struct transObjInfo));
+  struct transObjInfo * transObj=RUNMALLOC_I(sizeof(struct transObjInfo));
    int k = 0;
  #ifdef DEBUG
  #ifndef CLOSE_PRINT
@@ -1622,7 +2041,7 @@ INLINE void processmsg_transobj_I() {
      //BAMBOO_DEBUGPRINT_REG(transObj->queues[2*k]);
  #endif
  #endif
-    transObj->queues[2*k+1] = msgdata[msgdataindex];             //[3+2*k+1];
+    transObj->queues[2*k+1] = msgdata[msgdataindex];        //[3+2*k+1];
      MSG_INDEXINC_I();
  #ifdef DEBUG
  #ifndef CLOSE_PRINT
@@ -1694,7 +2113,7 @@ INLINE void processmsg_lockrequest_I() {
    int data4 = msgdata[msgdataindex];       // request core
    MSG_INDEXINC_I();
    // -1: redirected, 0: approved, 1: denied
-  int deny = processlockrequest(locktype, data3, data2, data4, data4, true);
+  int deny=processlockrequest(locktype, data3, data2, data4, data4, true);
    if(deny == -1) {
      // this lock request is redirected
      return;
@@ -1702,11 +2121,11 @@ INLINE void processmsg_lockrequest_I() {
      // send response msg
      // for 32 bit machine, the size is always 4 words, cache the msg first
      int tmp = deny==1 ? LOCKDENY : LOCKGROUNT;
-    //if(isMsgSending) {
+    if(BAMBOO_CHECK_SEND_MODE()) {
      cache_msg_4(data4, tmp, locktype, data2, data3);
-    /*} else {
-            send_msg_4(data4, tmp, locktype, data2, data3);
-       }*/
+    } else {
+    send_msg_4(data4, tmp, locktype, data2, data3, true);
+    }
    }
  }
  
@@ -1802,13 +2221,13 @@ INLINE void processmsg_redirectlock_I() {
    } else {
      // send response msg
      // for 32 bit machine, the size is always 4 words, cache the msg first
-    //if(isMsgSending) {
+    if(BAMBOO_CHECK_SEND_MODE()) {
      cache_msg_4(data4, deny==1 ? REDIRECTDENY : REDIRECTGROUNT,
                  data1, data2, data3);
-    /*} else {
-            send_msg_4(data4, deny==1?REDIRECTDENY:REDIRECTGROUNT,
-                                                     data1, data2, data3);
-       }*/
+    } else {
+    send_msg_4(data4, deny==1?REDIRECTDENY:REDIRECTGROUNT,
+               data1, data2, data3, true);
+    }
    }
  }
  
@@ -1902,11 +2321,11 @@ INLINE void processmsg_profileoutput_I() {
    MSG_INDEXINC_I();
    outputProfileData();
    // cache the msg first
-  //if(isMsgSending) {
+  if(BAMBOO_CHECK_SEND_MODE()) {
    cache_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE);
-  /*} else {
-          send_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE);
-     }*/
+  } else {
+  send_msg_2(STARTUPCORE, PROFILEFINISH, BAMBOO_NUM_OF_CORE, true);
+  }
  }
  
  INLINE void processmsg_profilefinish_I() {
@@ -1941,15 +2360,15 @@ INLINE void processmsg_statusconfirm_I() {
  #endif
  #endif
      // cache the msg first
-    //if(isMsgSending) {
+    if(BAMBOO_CHECK_SEND_MODE()) {
      cache_msg_5(STARTUPCORE, STATUSREPORT,
                  busystatus ? 1 : 0, BAMBOO_NUM_OF_CORE,
                  self_numsendobjs, self_numreceiveobjs);
-    /*} else {
-            send_msg_5(STARTUPCORE, STATUSREPORT, busystatus?1:0,
-                                                     BAMBOO_NUM_OF_CORE, self_numsendobjs,
-                                                     self_numreceiveobjs);
-       }*/
+    } else {
+    send_msg_5(STARTUPCORE, STATUSREPORT, busystatus?1:0,
+               BAMBOO_NUM_OF_CORE, self_numsendobjs,
+               self_numreceiveobjs, true);
+    }
    }
  }
  
@@ -2028,22 +2447,22 @@ INLINE void processmsg_memrequest_I() {
        if(INITPHASE == gcphase) {
         // if still in the initphase of gc, send a startinit msg again,
         // cache the msg first
-       //if(isMsgSending) {
+       if(BAMBOO_CHECK_SEND_MODE()) {
         cache_msg_1(data2, GCSTARTINIT);
-       /*} else {
-               send_msg_1(data2, GCSTARTINIT);
-          }*/
+       } else {
+       send_msg_1(data2, GCSTARTINIT, true);
+       }
        }
      } else {
  #endif
      mem = smemalloc_I(data2, data1, &allocsize);
      if(mem != NULL) {
        // send the start_va to request core, cache the msg first
-      //if(isMsgSending) {
+      if(BAMBOO_CHECK_SEND_MODE()) {
        cache_msg_3(data2, MEMRESPONSE, mem, allocsize);
-      /*} else {
-              send_msg_3(data2, MEMRESPONSE, mem, allocsize);
-         }*/
+      } else {
+      send_msg_3(data2, MEMRESPONSE, mem, allocsize, true);
+      }
      } // if mem == NULL, the gcflag of the startup core has been set
      // and the gc should be started later, then a GCSTARTINIT msg
      // will be sent to the requesting core to notice it to start gc
@@ -2081,13 +2500,18 @@ INLINE void processmsg_memresponse_I() {
    if(data2 == 0) {
      bamboo_smem_size = 0;
      bamboo_cur_msp = 0;
+#ifdef MULTICORE_GC
+       bamboo_smem_zero_top = 0;
+#endif
    } else {
  #ifdef MULTICORE_GC
      // fill header to store the size of this mem block
-    memset(data1, 0, BAMBOO_CACHE_LINE_SIZE);
+    BAMBOO_MEMSET_WH(data1, '\0', BAMBOO_CACHE_LINE_SIZE); 
+       //memset(data1, 0, BAMBOO_CACHE_LINE_SIZE);
      (*((int*)data1)) = data2;
      bamboo_smem_size = data2 - BAMBOO_CACHE_LINE_SIZE;
      bamboo_cur_msp = data1 + BAMBOO_CACHE_LINE_SIZE;
+       bamboo_smem_zero_top = bamboo_cur_msp;
  #else
      bamboo_smem_size = data2;
      bamboo_cur_msp =(void*)(data1);
@@ -2109,6 +2533,7 @@ INLINE void processmsg_gcstartinit_I() {
      bamboo_smem_size = 0;
      bamboo_cur_msp = NULL;
      smemflag = true;
+       bamboo_smem_zero_top = NULL;
    }
  }
  
@@ -2128,6 +2553,10 @@ INLINE void processmsg_gcstartcompact_I() {
    gcphase = COMPACTPHASE;
  }
  
+INLINE void processmsg_gcstartmapinfo_I() {  // handler for a GCSTARTMAPINFO msg
+  gcphase = MAPPHASE;  // record that gc has entered the map phase
+}
+
  INLINE void processmsg_gcstartflush_I() {
    gcphase = FLUSHPHASE;
  }
@@ -2171,8 +2600,16 @@ INLINE void processmsg_gcfinishmark_I() {
    // all cores should do mark
    if(data1 < NUMCORESACTIVE) {
      gccorestatus[data1] = 0;
-    gcnumsendobjs[data1] = data2;
-    gcnumreceiveobjs[data1] = data3;
+       int entry_index = 0;
+       if(waitconfirm)  {
+         // phase 2
+         entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+       } else {
+         // phase 1
+         entry_index = gcnumsrobjs_index;
+       }
+    gcnumsendobjs[entry_index][data1] = data2;
+    gcnumreceiveobjs[entry_index][data1] = data3;
    }
  }
  
@@ -2206,11 +2643,11 @@ INLINE void processmsg_gcfinishcompact_I() {
        int dstcore = 0;
        if(gcfindSpareMem_I(&startaddr, &tomove, &dstcore, data4, cnum)) {
         // cache the msg first
-       //if(isMsgSending) {
+       if(BAMBOO_CHECK_SEND_MODE()) {
         cache_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove);
-       /*} else {
-                     send_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove);
-             }*/
+       } else {
+       send_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove, true);
+       }
        }
      } else {
        gccorestatus[cnum] = 0;
@@ -2218,6 +2655,25 @@ INLINE void processmsg_gcfinishcompact_I() {
    }       // if(cnum < NUMCORES4GC)
  }
  
+INLINE void processmsg_gcfinishmapinfo_I() {  // handler for a GCFINISHMAPINFO msg
+  int data1 = msgdata[msgdataindex];  // payload word; presumably the sender's core number -- confirm
+  MSG_INDEXINC_I();
+  // received a map phase finish msg
+  if(BAMBOO_NUM_OF_CORE != STARTUPCORE) {
+    // only the startup core may receive this msg; any other core
+    // prints data1 (unless CLOSE_PRINT is defined) and exits with 0xb004
+#ifndef CLOSE_PRINT
+    BAMBOO_DEBUGPRINT_REG(data1);
+#endif
+    BAMBOO_EXIT(0xb004);
+  }
+  // clear the gccorestatus entry for data1; values >= NUMCORES4GC are ignored
+  if(data1 < NUMCORES4GC) {  // NOTE(review): a negative data1 is not rejected
+    gccorestatus[data1] = 0;
+  }
+}
+
+
  INLINE void processmsg_gcfinishflush_I() {
    int data1 = msgdata[msgdataindex];
    MSG_INDEXINC_I();
@@ -2228,7 +2684,7 @@ INLINE void processmsg_gcfinishflush_I() {
  #ifndef CLOSE_PRINT
      BAMBOO_DEBUGPRINT_REG(data1);
  #endif
-    BAMBOO_EXIT(0xb004);
+    BAMBOO_EXIT(0xb005);
    }
    // all cores should do flush
    if(data1 < NUMCORESACTIVE) {
@@ -2240,18 +2696,18 @@ INLINE void processmsg_gcmarkconfirm_I() {
    if((BAMBOO_NUM_OF_CORE == STARTUPCORE)
       || (BAMBOO_NUM_OF_CORE > NUMCORESACTIVE - 1)) {
      // wrong core to receive such msg
-    BAMBOO_EXIT(0xb005);
+    BAMBOO_EXIT(0xb006);
    } else {
      // send response msg, cahce the msg first
-    //if(isMsgSending) {
+    if(BAMBOO_CHECK_SEND_MODE()) {
      cache_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
                  gcbusystatus, gcself_numsendobjs,
                  gcself_numreceiveobjs);
-    /*} else {
-            send_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
-                                                     gcbusystatus, gcself_numsendobjs,
-                                                     gcself_numreceiveobjs);
-       }*/
+    } else {
+    send_msg_5(STARTUPCORE, GCMARKREPORT, BAMBOO_NUM_OF_CORE,
+               gcbusystatus, gcself_numsendobjs,
+               gcself_numreceiveobjs, true);
+    }
    }
  }
  
@@ -2270,14 +2726,21 @@ INLINE void processmsg_gcmarkreport_I() {
  #ifndef CLOSE_PRINT
      BAMBOO_DEBUGPRINT_REG(data2);
  #endif
-    BAMBOO_EXIT(0xb006);
+    BAMBOO_EXIT(0xb007);
    } else {
+       int entry_index = 0;
      if(waitconfirm) {
+         // phase 2
        numconfirm--;
-    }
+         entry_index = (gcnumsrobjs_index == 0) ? 1 : 0;
+    } else {
+         // can never reach here
+         // phase 1
+         entry_index = gcnumsrobjs_index;
+       }
      gccorestatus[data1] = data2;
-    gcnumsendobjs[data1] = data3;
-    gcnumreceiveobjs[data1] = data4;
+    gcnumsendobjs[entry_index][data1] = data3;
+    gcnumreceiveobjs[entry_index][data1] = data4;
    }
  }
  
@@ -2290,7 +2753,9 @@ INLINE void processmsg_gcmarkedobj_I() {
      // set the flag as DISCOVERED
      ((int *)data1)[6] = DISCOVERED;
      gc_enqueue_I(data1);
-  }
+  } 
+  // set the remote flag
+  ((int *)data1)[6] |= REMOTEM;
    gcself_numreceiveobjs++;
    gcbusystatus = true;
  }
@@ -2312,19 +2777,22 @@ INLINE void processmsg_gcmaprequest_I() {
    void * dstptr = NULL;
    int data1 = msgdata[msgdataindex];
    MSG_INDEXINC_I();
-  //dstptr = mgchashSearch(msgdata[1]);
  #ifdef GC_PROFILE
-  unsigned long long ttime = BAMBOO_GET_EXE_TIME();
+  // TODO unsigned long long ttime = BAMBOO_GET_EXE_TIME();
  #endif
+#ifdef LOCALHASHTBL_TEST
    RuntimeHashget(gcpointertbl, data1, &dstptr);
+#else
+  dstptr = mgchashSearch(gcpointertbl, data1);
+#endif
+  //MGCHashget(gcpointertbl, data1, &dstptr);
  #ifdef GC_PROFILE
-  flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
+  // TODO flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
  #endif
    int data2 = msgdata[msgdataindex];
    MSG_INDEXINC_I();
-  //MGCHashget(gcpointertbl, msgdata[1], &dstptr);
  #ifdef GC_PROFILE
-  unsigned long long ttimei = BAMBOO_GET_EXE_TIME();
+  // TODO unsigned long long ttimei = BAMBOO_GET_EXE_TIME();
  #endif
    if(NULL == dstptr) {
      // no such pointer in this core, something is wrong
@@ -2332,7 +2800,7 @@ INLINE void processmsg_gcmaprequest_I() {
      BAMBOO_DEBUGPRINT_REG(data1);
      BAMBOO_DEBUGPRINT_REG(data2);
  #endif
-    BAMBOO_EXIT(0xb007);
+    BAMBOO_EXIT(0xb009);
      //assume that the object was not moved, use the original address
      /*if(isMsgSending) {
              cache_msg_3(msgdata[2], GCMAPINFO, msgdata[1], msgdata[1]);
@@ -2341,14 +2809,14 @@ INLINE void processmsg_gcmaprequest_I() {
         }*/
    } else {
      // send back the mapping info, cache the msg first
-    //if(isMsgSending) {
+    if(BAMBOO_CHECK_SEND_MODE()) {
      cache_msg_3(data2, GCMAPINFO, data1, (int)dstptr);
-    /*} else {
-            send_msg_3(data2, GCMAPINFO, data1, (int)dstptr);
-       }*/
+    } else {
+    send_msg_3(data2, GCMAPINFO, data1, (int)dstptr, true);
+    }
    }
  #ifdef GC_PROFILE
-  flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimei;
+  // TODO flushstalltime_i += BAMBOO_GET_EXE_TIME()-ttimei;
    //num_mapinforequest_i++;
  #endif
  }
@@ -2359,27 +2827,30 @@ INLINE void processmsg_gcmapinfo_I() {
  #endif
    int data1 = msgdata[msgdataindex];
    MSG_INDEXINC_I();
-  if(data1 != gcobj2map) {
-    // obj not matched, something is wrong
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT_REG(gcobj2map);
-    BAMBOO_DEBUGPRINT_REG(msgdata[1]);
+  gcmappedobj = msgdata[msgdataindex];  // [2]
+  MSG_INDEXINC_I();
+#ifdef LOCALHASHTBL_TEST
+  RuntimeHashadd_I(gcpointertbl, data1, gcmappedobj);
+#else
+  mgchashInsert_I(gcpointertbl, data1, gcmappedobj);
  #endif
-    BAMBOO_EXIT(0xb008);
-  } else {
-    gcmappedobj = msgdata[msgdataindex];                     // [2]
-    MSG_INDEXINC_I();
-    //mgchashReplace_I(msgdata[1], msgdata[2]);
-    //mgchashInsert_I(gcobj2map, gcmappedobj);
-    RuntimeHashadd_I(gcpointertbl, gcobj2map, gcmappedobj);
-    //MGCHashadd_I(gcpointertbl, gcobj2map, gcmappedobj);
+  //MGCHashadd_I(gcpointertbl, data1, gcmappedobj);
+  if(data1 == gcobj2map) {
+       gcismapped = true;
    }
-  gcismapped = true;
  #ifdef GC_PROFILE
    //flushstalltime += BAMBOO_GET_EXE_TIME() - ttime;
  #endif
  }
  
+INLINE void processmsg_gcmaptbl_I() {  // handler for a GCMAPTBL msg
+  int data1 = msgdata[msgdataindex];  // 1st payload word: stored below as a table pointer
+  MSG_INDEXINC_I();
+  int data2 = msgdata[msgdataindex];  // 2nd payload word: slot index into gcrpointertbls
+  MSG_INDEXINC_I();
+  gcrpointertbls[data2] = (mgcsharedhashtbl_t *)data1; // NOTE(review): data2 is not bounds-checked
+}
+
  INLINE void processmsg_gclobjinfo_I() {
    numconfirm--;
  
@@ -2391,7 +2862,7 @@ INLINE void processmsg_gclobjinfo_I() {
  #ifndef CLOSE_PRINT
      BAMBOO_DEBUGPRINT_REG(data2);
  #endif
-    BAMBOO_EXIT(0xb009);
+    BAMBOO_EXIT(0xb00b);
    }
    // store the mark result info
    int cnum = data2;
@@ -2418,10 +2889,29 @@ INLINE void processmsg_gclobjmapping_I() {
    MSG_INDEXINC_I();
    int data2 = msgdata[msgdataindex];
    MSG_INDEXINC_I();
-  //mgchashInsert_I(msgdata[1], msgdata[2]);
+#ifdef LOCALHASHTBL_TEST
    RuntimeHashadd_I(gcpointertbl, data1, data2);
-  //MGCHashadd_I(gcpointertbl, msgdata[1], msgdata[2]);
+#else
+  mgchashInsert_I(gcpointertbl, data1, data2);
+#endif
+  //MGCHashadd_I(gcpointertbl, data1, data2);
+  mgcsharedhashInsert_I(gcsharedptbl, data1, data2);
+}
+
+#ifdef GC_PROFILE//_S
+INLINE void processmsg_gcprofiles_I() {  // handler for a GCPROFILES msg
+  int data1 = msgdata[msgdataindex];  // 1st payload word: added to gc_num_obj
+  MSG_INDEXINC_I();
+  int data2 = msgdata[msgdataindex];  // 2nd payload word: added to gc_num_liveobj
+  MSG_INDEXINC_I();
+  int data3 = msgdata[msgdataindex];  // 3rd payload word: added to gc_num_forwardobj
+  MSG_INDEXINC_I();
+  gc_num_obj += data1;
+  gc_num_liveobj += data2;
+  gc_num_forwardobj += data3;
+  gc_num_profiles--;  // presumably the number of profile reports still awaited -- confirm
  }
+#endif
  #endif // #ifdef MULTICORE_GC
  
  // receive object transferred from other cores
@@ -2451,7 +2941,7 @@ processmsg:
    if((size == 0) || (checkMsgLength_I(size) == -1)) {
      // not a whole msg
      // have new coming msg
-    if(BAMBOO_MSG_AVAIL() != 0) {
+    if((BAMBOO_MSG_AVAIL() != 0) && !msgdatafull) {
        goto msg;
      } else {
        return -1;
@@ -2594,6 +3084,12 @@ processmsg:
        break;
      }                     // case GCSTARTCOMPACT
  
+       case GCSTARTMAPINFO: {
+      // received a map phase start msg
+      processmsg_gcstartmapinfo_I();
+      break;
+    }                     // case GCSTARTMAPINFO
+
      case GCSTARTFLUSH: {
        // received a flush phase start msg
        processmsg_gcstartflush_I();
@@ -2616,6 +3112,11 @@ processmsg:
        break;
      }                     // case GCFINISHCOMPACT
  
+       case GCFINISHMAPINFO: {
+      processmsg_gcfinishmapinfo_I();
+      break;
+    }                     // case GCFINISHMAPINFO
+
      case GCFINISHFLUSH: {
        processmsg_gcfinishflush_I();
        break;
@@ -2662,7 +3163,13 @@ processmsg:
        break;
      }                     // case GCMAPINFO
  
-    case GCLOBJREQUEST: {
+    case GCMAPTBL: {
+      // received a mapping tbl response msg
+      processmsg_gcmaptbl_I();
+      break;
+    }                     // case GCMAPTBL
+       
+       case GCLOBJREQUEST: {
        // received a large objs info request msg
        transferMarkResults_I();
        break;
@@ -2680,16 +3187,24 @@ processmsg:
        break;
      }                     // case GCLOBJMAPPING
  
+#ifdef GC_PROFILE//_S
+       case GCPROFILES: {
+      // received a gcprofiles msg
+      processmsg_gcprofiles_I();
+      break;
+    }
+#endif
  #endif // #ifdef MULTICORE_GC
  
      default:
        break;
      }             // switch(type)
-                  //memset(msgdata, '\0', sizeof(int) * msgdataindex);
-                  //msgdataindex = 0;
+    //memset(msgdata, '\0', sizeof(int) * msgdataindex);
+    //msgdataindex = 0;
      msglength = BAMBOO_MSG_BUF_LENGTH;
      // TODO
      //printf("++ msg: %x \n", type);
+
      if(msgdataindex != msgdatalast) {
        // still have available msg
        goto processmsg;
@@ -2703,7 +3218,7 @@ processmsg:
      // have new coming msg
      if(BAMBOO_MSG_AVAIL() != 0) {
        goto msg;
-    }
+    } // TODO
  
  #ifdef PROFILE
  /*if(isInterrupt) {
@@ -2929,7 +3444,7 @@ void releasewritelock_r(void * lock, void * redirectlock) {
      // reside on this core
      if(!RuntimeHashcontainskey(locktbl, reallock)) {
        // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa011);
+      BAMBOO_EXIT(0xa00b);
      } else {
        int rwlock_obj = 0;
        struct LockValue * lockvalue = NULL;
@@ -2956,7 +3471,7 @@ void releasewritelock_r(void * lock, void * redirectlock) {
      // send lock release with redirect info msg
      // for 32 bit machine, the size is always 4 words
      send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock,
-               (int)redirectlock);
+               (int)redirectlock, false);
    }
  }
  #endif
@@ -3073,20 +3588,10 @@ newtask:
  #endif
        while(!lockflag) {
         BAMBOO_WAITING_FOR_LOCK(0);
-       // check for outgoing sends
-    if (isMsgHanging) {
-      extern inline void send_hanging_msg(bool);
-      send_hanging_msg(true);
-    } 
           }
  #ifndef INTERRUPT
        if(reside) {
         while(BAMBOO_WAITING_FOR_LOCK(0) != -1) {
-         // check for outgoing sends
-         if (isMsgHanging) {
-               extern inline void send_hanging_msg(bool);
-               send_hanging_msg(true);
-         } 
         }
        }
  #endif