From 73b1f4b8d531caef574e78431481837a7490bcd3 Mon Sep 17 00:00:00 2001
From: jzhou
Date: Thu, 20 Aug 2009 01:29:31 +0000
Subject: [PATCH] finish GC code and fix some bugs

---
 Robust/src/IR/Flat/BuildCode.java     |  10 +-
 Robust/src/Main/Main.java             |  10 +-
 Robust/src/Runtime/RAW/task_arch.c    |  52 ---
 Robust/src/Runtime/multicoregarbage.c | 640 ++++++++++++++++++--------
 Robust/src/Runtime/multicoregarbage.h |  24 +-
 Robust/src/Runtime/multicoreruntime.c |   4 +
 Robust/src/Runtime/multicoreruntime.h |  26 +-
 Robust/src/Runtime/multicoretask.c    | 156 +++++--
 Robust/src/buildscript                |  19 +-
 9 files changed, 631 insertions(+), 310 deletions(-)

diff --git a/Robust/src/IR/Flat/BuildCode.java b/Robust/src/IR/Flat/BuildCode.java
index 240e6c5f..d0db3db7 100644
--- a/Robust/src/IR/Flat/BuildCode.java
+++ b/Robust/src/IR/Flat/BuildCode.java
@@ -594,7 +594,7 @@ public class BuildCode {
     } else {
       outclassdefs.println(" int version;");
       outclassdefs.println(" int * lock;");  // lock entry for this obj
-      outclassdefs.println(" void * mutex;");
+      outclassdefs.println(" int mutex;");
       outclassdefs.println(" int lockcount;");
       if(state.MULTICOREGC) {
         outclassdefs.println(" int marked;");
@@ -1303,13 +1303,13 @@ public class BuildCode {
     if((!state.MULTICORE) || (cn.getSymbol().equals("TagDescriptor"))) {
       classdefout.println(" void * flagptr;");
     } else if (state.MULTICORE) {
-      if(state.MULTICOREGC) {
-        classdefout.println(" int marked;");
-      }
       classdefout.println(" int version;");
       classdefout.println(" int * lock;");  // lock entry for this obj
-      classdefout.println(" void * mutex;");
+      classdefout.println(" int mutex;");
       classdefout.println(" int lockcount;");
+      if(state.MULTICOREGC) {
+        classdefout.println(" int marked;");
+      }
     }
     if (state.OPTIONAL) {
       classdefout.println(" int numfses;");
diff --git a/Robust/src/Main/Main.java b/Robust/src/Main/Main.java
index 6df19dfc..b1b34092 100644
--- a/Robust/src/Main/Main.java
+++ b/Robust/src/Main/Main.java
@@ -401,16 +401,16 @@ public class Main {
       if(isDistributeInfo) {
         mcImplSynthesis.distribution(isDisAll, startnum);
       } else {
-        double timeStartAnalysis = (double) System.nanoTime();
+        //double timeStartAnalysis = (double) System.nanoTime();
         mcImplSynthesis.setScheduleThreshold(20);
         mcImplSynthesis.setProbThreshold(0);
         mcImplSynthesis.setGenerateThreshold(30);
         Vector scheduling = mcImplSynthesis.synthesis();
-        double timeEndAnalysis = (double) System.nanoTime();
-        double dt = (timeEndAnalysis - timeStartAnalysis)/(Math.pow( 10.0, 9.0 ) );
-        System.err.println("The analysis took" + dt + "sec.");
-        System.exit(0);
+        //double timeEndAnalysis = (double) System.nanoTime();
+        //double dt = (timeEndAnalysis - timeStartAnalysis)/(Math.pow( 10.0, 9.0 ) );
+        //System.err.println("The analysis took" + dt + "sec.");
+        //System.exit(0);

         // generate multicore codes
         if(state.MULTICORE) {
diff --git a/Robust/src/Runtime/RAW/task_arch.c b/Robust/src/Runtime/RAW/task_arch.c
index 7751ece9..40c42655 100644
--- a/Robust/src/Runtime/RAW/task_arch.c
+++ b/Robust/src/Runtime/RAW/task_arch.c
@@ -838,9 +838,7 @@ bool getreadlock(void * ptr) {
   return true;
 }

-void releasewritelock_r(void * lock, void * redirectlock);
 bool getreadlock_I_r(void * ptr, void * redirectlock, int core, bool cache);
-
 bool getwritelock_I_r(void* lock, void* redirectlock, int core, bool cache);

 void releasereadlock(void * ptr) {
@@ -1063,56 +1061,6 @@ void releasewritelock(void * ptr) {
   }
 }

-void releasewritelock_r(void * lock, void * redirectlock) {
-  int targetcore = 0;
-  int reallock = (int)lock;
-  targetcore = (reallock >> 5) % BAMBOO_TOTALCORE;
-
-#ifdef DEBUG
-  BAMBOO_DEBUGPRINT(0xe671);
-  BAMBOO_DEBUGPRINT_REG((int)lock);
-  BAMBOO_DEBUGPRINT_REG(reallock);
-  BAMBOO_DEBUGPRINT_REG(targetcore);
-#endif
-
-  if(targetcore == BAMBOO_NUM_OF_CORE) {
-    BAMBOO_START_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf001);
-#endif
-    // reside on this core
-    if(!RuntimeHashcontainskey(locktbl, reallock)) {
-      // no locks for this object, something is wrong
-      BAMBOO_EXIT(0xa01d);
-    } else {
-      int rwlock_obj = 0;
-      struct LockValue * lockvalue = NULL;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT(0xe672);
-#endif
-      RuntimeHashget(locktbl, reallock, &rwlock_obj);
-      lockvalue = (struct LockValue *)rwlock_obj;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-      lockvalue->value++;
-      lockvalue->redirectlock = (int)redirectlock;
-#ifdef DEBUG
-      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
-#endif
-    }
-    BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
-#ifdef DEBUG
-    BAMBOO_DEBUGPRINT(0xf000);
-#endif
-    return;
-  } else {
-    // send lock release with redirect info msg
-    // for 32 bit machine, the size is always 4 words
-    send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock, (int)redirectlock);
-  }
-}
-
 bool getwritelock_I(void * ptr) {
   int targetcore = 0;
   lockobj = (int)ptr;
diff --git a/Robust/src/Runtime/multicoregarbage.c b/Robust/src/Runtime/multicoregarbage.c
index d609d2a8..22a6783e 100644
--- a/Robust/src/Runtime/multicoregarbage.c
+++ b/Robust/src/Runtime/multicoregarbage.c
@@ -250,7 +250,18 @@ inline void transferMarkResults() {
     BAMBOO_DEBUGPRINT(0xffff);
 #endif
   } // if(isMsgSending)
-} // void transferMarkResults()
+} // void transferMarkResults()
+
+inline bool gc_checkCoreStatus() {
+  bool allStall = true;
+  for(int i = 0; i < NUMCORES; ++i) {
+    if(gccorestatus[i] != 0) {
+      allStall = false;
+      break;
+    } // if(gccorestatus[i] != 0)
+  } // for(i = 0; i < NUMCORES; ++i)
+  return allStall;
+}

 inline void checkMarkStatue() {
   if((!waitconfirm) || (waitconfirm && (numconfirm == 0))) {
     gcnumsendobjs[BAMBOO_NUM_OF_CORE] = gcself_numsendobjs;
     gcnumreceiveobjs[BAMBOO_NUM_OF_CORE] = gcself_numreceiveobjs;
     // check the status of all cores
-    bool allStall = true;
-    for(i = 0; i < NUMCORES; ++i) {
-      if(gccorestatus[i] != 0) {
-        allStall = false;
-        break;
-      } // if(gccorestatus[i] != 0)
-    } // for(i = 0; i < NUMCORES; ++i)
+    bool allStall = gc_checkCoreStatus();
     if(allStall) {
       // check if the sum of send objs and receive obj are the same
       // yes->check if the info is the latest; no->go on executing
@@ -344,14 +349,58 @@ inline bool preGC() {
   } // if((!waitconfirm) ||
 } // bool preGC()

+inline void initGC() {
+  int i;
+  for(i = 0; i < NUMCORES; ++i) {
+    gccorestatus[i] = 1;
+    gcnumsendobjs[i] = 0;
+    gcnumreceiveobjs[i] = 0;
+    gcloads[i] = 0;
+    gcrequiredmems[i] = 0;
+    gcfilledblocks[i] = 0;
+    gcstopblock[i] = 0;
+  } // for(i = 0; i < NUMCORES; ++i)
+  gcself_numsendobjs = 0;
+  gcself_numreceiveobjs = 0;
+  gcmarkedptrbound = 0;
+  gcobj2map = 0;
+  gcmappedobj = 0;
+  gcismapped = false;
+  gcnumlobjs = 0;
+  gcheaptop = 0;
+  gctopcore = 0;
+  gcheapdirection = 1;
+  gcreservedsb = 0;
+  gcmovestartaddr = 0;
+  gctomove = false;
+  gcblock2fill = 0;
+  gcmovepending = 0;
+
+  // initialize queue
+  if (gchead==NULL) {
+    gcheadindex=0;
+    gctailindex=0;
+    gctailindex2 = 0;
+    gchead=gctail=gctail2=malloc(sizeof(struct pointerblock));
+  }
+  // initialize the large obj queues
+  if (gclobjhead==NULL) {
+    gclobjheadindex=0;
+    gclobjtailindex=0;
+    gclobjtailindex2 = 0;
+    gclobjhead=gclobjtail=gclobjtail2=
+      malloc(sizeof(struct lobjpointerblock));
+  }
+} // void initGC()
+
 // compute load balance for all cores
 inline int loadbalance(int heaptop) {
   // compute load balance
   int i;
   // get the total loads
-  gcloads[0]+=BAMBOO_SMEM_SIZE*gcreservedsb;//reserved sblocks for sbstartbl
-  int tloads = gcloads[0];
+  gcloads[STARTUPCORE]+=
+    BAMBOO_SMEM_SIZE*gcreservedsb;//reserved sblocks for sbstartbl
+  int tloads = gcloads[STARTUPCORE];
   for(i = 1; i < NUMCORES; i++) {
     tloads += gcloads[i];
   }
@@ -409,8 +458,22 @@ inline bool cacheLObjs()

 inline void moveLObjs() {
   // find current heap top
+  // flush all gcloads to indicate the real heap top on one core
+  // previously it represented the next available ptr on a core
+  if((gcloads[0] > BAMBOO_BASE_VA+BAMBOO_SMEM_SIZE_L)
+     && (gcloads[0] % BAMBOO_SMEM_SIZE == 0)) {
+    // edge of a block, check if this is exactly the heaptop
+    BASEPTR(0, gcfilledblocks[0]-1, &gcloads[0]);
+    gcloads[0]+=(gcfilledblocks[0]>1?BAMBOO_SMEM_SIZE:BAMBOO_SMEM_SIZE_L);
+  }
   int tmpheaptop = gcloads[0];
   for(int i = 1; i < NUMCORES; i++) {
+    if((gcloads[i] > BAMBOO_BASE_VA+BAMBOO_SMEM_SIZE_L)
+       && (gcloads[i] % BAMBOO_SMEM_SIZE == 0)) {
+      // edge of a block, check if this is exactly the heaptop
+      BASEPTR(i, gcfilledblocks[i]-1, &gcloads[i]);
+      gcloads[i]+=(gcfilledblocks[i]>1?BAMBOO_SMEM_SIZE:BAMBOO_SMEM_SIZE_L);
+    }
     if(tmpheaptop < gcloads[i]) {
       tmpheaptop = gcloads[i];
     }
@@ -470,7 +533,7 @@ inline void moveLObjs()

 inline void updateFreeMemList() {
   int i = 0;
-  int tmptop = gcloads[0];
+  int tmptop = gcloads[0];
   struct freeMemItem * tochange = bamboo_free_mem_list->head;
   if(tochange == NULL) {
     bamboo_free_mem_list->head = tochange =
@@ -507,137 +570,6 @@
   bamboo_free_mem_list->tail = tochange;
 } // void updateFreeMemList()

-inline void gc(struct garbagelist * stackptr) {
-  // check if do gc
-  if(!gcflag) {
-    return;
-  }
-
-  // core coordinator routine
-  if(0 == BAMBOO_NUM_OF_CORE) {
-    if(!preGC()) {
-      // not ready to do gc
-      gcflag = true;
-      return;
-    }
-
-    gcprocessing = true;
-    int i = 0;
-    waitconfirm = false;
-    waitconfirm = 0;
-    gcphase = MARKPHASE;
-    for(i = 1; i < NUMCORES - 1; i++) {
-      // send GC start messages to all cores
-      send_msg_1(i, GCSTART);
-    }
-    bool isfirst = true;
-    bool allStall = false;
-
-    // mark phase
-    while(MARKPHASE == gcphase) {
-      mark(isfirst, stackptr);
-      if(isfirst) {
-        isfirst = false;
-      }
-
-      // check gcstatus
-      checkMarkStatue();
-    } // while(MARKPHASE == gcphase)
-    // send msgs to all cores requiring large objs info
-    numconfirm = NUMCORES - 1;
-    for(i = 1; i < NUMCORES; ++i) {
-      send_msg_1(i, GCLOBJREQUEST);
-    }
-    while(numconfirm != 0) {} // wait for responses
-    if(!cacheLObjs()) {
-      // no enough space to cache large objs
-      BAMBOO_EXIT(0xd001);
-    }
-    int numpbc = loadbalance();
-
-    if((gcheapdirection) && (0 <= gctopcore)
-       || ((!gcheapdirection) && (0 == gctopcore))) {
-      gcstopblock = numpbc + 1;
-    } else {
-      gcstopblock = numpbc;
-    }
-    for(i = 1; i < NUMCORES; ++i) {
-      //send start compact messages to all cores
-      if((gcheapdirection) && (i <= gctopcore)
-         || ((!gcheapdirection) && (i >= gctopcore))) {
-        send_msg_2(i, GCSTARTCOMPACT, numpbc+1);
-      } else {
-        send_msg_2(i, GCSTARTCOMPACT, numpbc);
-      }
-    }
-
-    // compact phase
-    compact();
-    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-    while(COMPACTPHASE == gcphase) {
-      // check the status of all cores
-      allStall = true;
-      for(i = 0; i < NUMCORES; ++i) {
-        if(gccorestatus[i] != 0) {
-          allStall = false;
-          break;
-        }
-      }
-      if(allStall) {
-        // restore the gcstatus of all cores
-        for(i = 0; i < NUMCORES; ++i) {
-          gccorestatus[i] = 1;
-        }
-        break;
-      }
-    } // while(COMPACTPHASE == gcphase)
-    // move largeObjs
-    moveLObjs();
-
-    gcphase = FLUSHPHASE;
-    for(i = 1; i < NUMCORES; ++i) {
-      // send start flush messages to all cores
-      send_msg_1(i, GCSTARTFLUSH);
-    }
-
-    // flush phase
-    flush();
-    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-    while(FLUSHPHASE == gcphase) {
-      // check the status of all cores
-      allStall = true;
-      for(i = 0; i < NUMCORES; ++i) {
-        if(gccorestatus[i] != 0) {
-          allStall = false;
-          break;
-        }
-      }
-      if(allStall) {
-        break;
-      }
-    } // while(FLUSHPHASE == gcphase)
-    gcphase = FINISHPHASE;
-    for(i = 1; i < NUMCORES; ++i) {
-      // send gc finish messages to all cores
-      send_msg_1(i, GCFINISH);
-    }
-
-    // need to create free memory list
-    updateFreeMemList();
-  } else {
-    gcprocessing = true;
-    gc_collect(stackptr);
-  }
-
-  // invalidate all shared mem pointers
-  bamboo_cur_msp = NULL;
-  bamboo_smem_size = 0;
-
-  gcflag = false;
-  gcprocessing = false;
-
-} // void gc(struct garbagelist * stackptr)
-
 // enqueue root objs
 inline void tomark(struct garbagelist * stackptr) {
   if(MARKPHASE != gcphase) {
@@ -790,6 +722,126 @@ inline void mark(bool isfirst,
 } // while(MARKPHASE == gcphase)
 } // mark()

+inline void compact2Heaptop() {
+  // no cores with spare mem and some cores are blocked with pending move
+  // find the current heap top and make them move to the heap top
+  int p;
+  if(gcheapdirection) {
+    gctopcore++;
+  } else {
+    gctopcore--;
+  }
+  int numblocks = gcfilledblocks[gctopcore];
+  BASEPTR(gctopcore, numblocks, &p);
+  int b;
+  BLOCKINDEX(p, &b);
+  int remain = (b<NUMCORES) ?
+               ((b+1)*BAMBOO_SMEM_SIZE_L+BAMBOO_BASE_VA-p) :
+               ((b-NUMCORES+1)*BAMBOO_SMEM_SIZE+BAMBOO_LARGE_SMEM_BOUND+BAMBOO_BASE_VA-p);
+  int i;
+  for(i = 0; i < NUMCORES; i++) {
+    if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0)) {
+      int memneed = gcrequiredmems[i] + BAMBOO_CACHE_LINE_SIZE;
+      if(STARTUPCORE == i) {
+        gctomove = true;
+        gcmovestartaddr = p;
+        gcdstcore = gctopcore;
+        gcblock2fill = numblocks + 1;
+      } else {
+        send_msg_4(i, GCMOVESTART, gctopcore, p, numblocks + 1);
+      }
+      if(memneed < remain) {
+        p += memneed;
+        gcrequiredmems[i] = 0;
+        gcmovepending--;
+        gcloads[gctopcore] += memneed;
+      } else {
+        // next available block
+        p += remain;
+        gcfilledblocks[gctopcore] += 1;
+        int newbase = 0;
+        BASEPTR(gctopcore, gcfilledblocks[gctopcore], &newbase);
+        gcloads[gctopcore] = newbase;
+        gcrequiredmems[i] -= remain - BAMBOO_CACHE_LINE_SIZE;
+        gcstopblock[gctopcore]++;
+        if(gcheapdirection) {
+          gctopcore++;
+        } else {
+          gctopcore--;
+        }
+        numblocks = gcstopblock[gctopcore];
+        BASEPTR(gctopcore, numblocks, &p);
+        BLOCKINDEX(p, &b);
+        remain = (b<NUMCORES) ?
+                 ((b+1)*BAMBOO_SMEM_SIZE_L+BAMBOO_BASE_VA-p) :
+                 ((b-NUMCORES+1)*BAMBOO_SMEM_SIZE+BAMBOO_LARGE_SMEM_BOUND+BAMBOO_BASE_VA-p);
+      } // if(memneed < remain) else ...
+    } // if((gccorestatus[i] != 0) && (gcrequiredmems[i] > 0))
+  } // for(i = 0; i < NUMCORES; i++)
+} // void compact2Heaptop()
+
+inline void resolvePendingMoveRequest() {
+  int i;
+  int j;
+  bool nosparemem = true;
+  bool haspending = false;
+  bool hasrunning = false;
+  bool noblock = false;
+  int dstcore = 0;
+  int sourcecore = 0;
+  for(i = j = 0; (i < NUMCORES) && (j < NUMCORES);) {
+    if(nosparemem) {
+      // check if there are cores with spare mem
+      if(gccorestatus[i] == 0) {
+        // finished working, check if it still has spare mem
+        if(gcfilledblocks[i] < gcstopblock[i]) {
+          // still have spare mem
+          nosparemem = false;
+          dstcore = i;
+        } else {
+          i++;
+        } // if(gcfilledblocks[i] < gcstopblock[i]) else ...
+      }
+    } // if(nosparemem)
+    if(!haspending) {
+      if(gccorestatus[j] != 0) {
+        // not finished, check if it has pending move requests
+        if((gcfilledblocks[j]==gcstopblock[j])&&(gcrequiredmems[j]>0)) {
+          sourcecore = j;
+          haspending = true;
+        } else {
+          j++;
+          hasrunning = true;
+        } // if((gcfilledblocks[i] == gcstopblock[i])...) else ...
+      } // if(gccorestatus[i] == 0) else ...
+    } // if(!haspending)
+    if(!nosparemem && haspending) {
+      // find match
+      int tomove = 0;
+      int startaddr = 0;
+      gcrequiredmems[dstcore] = assignSpareMem(sourcecore,
+                                               gcrequiredmems[dstcore],
+                                               &tomove,
+                                               &startaddr);
+      if(STARTUPCORE == dstcore) {
+        gcdstcore = sourcecore;
+        gctomove = true;
+        gcmovestartaddr = startaddr;
+        gcblock2fill = tomove;
+      } else {
+        send_msg_4(dstcore, GCMOVESTART, sourcecore, startaddr, tomove);
+      }
+      if(gcrequiredmems[dstcore] == 0) {
+        gcmovepending--;
+      }
+      nosparemem = true;
+      haspending = false;
+      noblock = true;
+    }
+  } // for(i = 0; i < NUMCORES; i++)
+
+  if(!hasrunning && !noblock) {
+    gcphase = SUBTLECOMPACTPHASE;
+    compact2Heaptop();
+  }
+
+} // void resolvePendingMoveRequest()
+
 struct moveHelper {
   int numblocks;       // block num for heap
   INTPTR base;         // base virtual address of current heap block
@@ -951,99 +1003,110 @@ innermoveobj:
   return false;
 } //bool moveobj(struct moveHelper* orig,struct moveHelper* to,int* endaddr)

+inline int assignSpareMem(int sourcecore,
+                          int requiredmem,
+                          int * tomove,
+                          int * startaddr) {
+  int b = 0;
+  BLOCKINDEX(gcloads[sourcecore], &b);
+  int boundptr = (b<NUMCORES) ?
+                 ((b+1)*BAMBOO_SMEM_SIZE_L+BAMBOO_BASE_VA) :
+                 ((b-NUMCORES+1)*BAMBOO_SMEM_SIZE+BAMBOO_LARGE_SMEM_BOUND+BAMBOO_BASE_VA);
+  int remain = boundptr - gcloads[sourcecore];
+  int memneed = requiredmem + BAMBOO_CACHE_LINE_SIZE;
+  *startaddr = gcloads[sourcecore];
+  *tomove = gcfilledblocks[sourcecore] + 1;
+  if(memneed < remain) {
+    gcloads[sourcecore] += memneed;
+    return 0;
+  } else {
+    // next available block
+    gcfilledblocks[sourcecore] += 1;
+    int newbase = 0;
+    BASEPTR(sourcecore, gcfilledblocks[sourcecore], &newbase);
+    gcloads[sourcecore] = newbase;
+    return requiredmem-remain;
+  }
+} // int assignSpareMem(...)
+
+inline bool findSpareMem(int * startaddr,
+                         int * tomove,
+                         int * dstcore,
+                         int requiredmem,
+                         int requiredcore) {
+  int i;
+  for(i = 0; i < NUMCORES; i++) {
+    if((gccorestatus[i] == 0) && (gcfilledblocks[i] < gcstopblock[i])) {
+      // found a finished core with spare mem, assign it to the request
+      *dstcore = i;
+      assignSpareMem(i, requiredmem, tomove, startaddr);
+      return true;
+    }
+  }
+  // no spare mem available right now, hold the request
+  gcrequiredmems[requiredcore] = requiredmem;
+  gcmovepending++;
+  return false;
+} // bool findSpareMem(...)
+
+inline bool compacthelper(struct moveHelper * orig,
+                          struct moveHelper * to,
+                          int * filledblocks,
+                          INTPTR * heaptopptr,
+                          bool * localcompact) {
+  int curr_heaptop = 0;
 innercompact:
   do {
     bool stop = moveobj(orig, to, &curr_heaptop);
     if(stop) {
       break;
     }
   } while(orig->ptr < gcmarkedptrbound);
   // fill the header of this block
   (*((int*)(to->base))) = to->offset;
-  heaptopptr = to->ptr;
+  if(*localcompact) {
+    *heaptopptr = to->ptr;
+    *filledblocks = to->numblocks;
+  }
   // send msgs to core coordinator indicating that the compact is finishing
   // send compact finish message to core coordinator
   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
-    gcnumblocks[0] = to->numblocks;
+    gcfilledblocks[BAMBOO_NUM_OF_CORE] = *filledblocks;
+    gcloads[BAMBOO_NUM_OF_CORE] = *heaptopptr;
     if(orig->ptr < gcmarkedptrbound) {
       // ask for more mem
       gctomove = false;
-      if(findSpareMem(&gcmovestartaddr, &gcstopblock, curr_heaptop)) {
+      if(findSpareMem(&gcmovestartaddr, &gcblock2fill, &gcdstcore,
+                      curr_heaptop, BAMBOO_NUM_OF_CORE)) {
         gctomove = true;
       } else {
-        // TODO hold the request
+        return false;
       }
     } else {
       gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
-      gcloads[BAMBOO_NUM_OF_CORE] = to->ptr;
+      return true;
     }
   } else {
     if(orig->ptr < gcmarkedptrbound) {
       // ask for more mem
       gctomove = false;
       send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
-                 to->numblocks, 0, curr_heaptop);
+                 *filledblocks, *heaptopptr, curr_heaptop);
     } else {
       // finish compacting
       send_msg_5(STARTUPCORE, GCFINISHCOMPACT, BAMBOO_NUM_OF_CORE,
-                 to->numblocks, 1, to->ptr);
+                 *filledblocks, *heaptopptr, 0);
     }
   } // if(STARTUPCORE == BAMBOO_NUM_OF_CORE)

   if(orig->ptr < gcmarkedptrbound) {
     // still have unpacked obj
     while(!gctomove) {};
+    gctomove = false;
     to->ptr = gcmovestartaddr;
-    to->numblocks = gcstopblock - 1;
+    to->numblocks = gcblock2fill - 1;
     to->bound = (to->numblocks==0)?
                 BAMBOO_SMEM_SIZE_L:
                 BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
@@ -1055,9 +1118,33 @@ innercompact:
     to->offset = BAMBOO_CACHE_LINE_SIZE;
     to->ptr += to->offset;   // for header
     to->top += to->offset;
+    if(gcdstcore == BAMBOO_NUM_OF_CORE) {
+      *localcompact = true;
+    } else {
+      *localcompact = false;
+    }
     goto innercompact;
   }
-  // TODO finish?
+  return true;
+} // void compacthelper()
+
+inline void compact() {
+  if(COMPACTPHASE != gcphase) {
+    BAMBOO_EXIT(0xb003);
+  }
+
+  // initialize pointers for compacting
+  struct moveHelper * orig =
+    (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+  struct moveHelper * to =
+    (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+
+  initOrig_Dst(orig, to);
+
+  int filledblocks = 0;
+  INTPTR heaptopptr = 0;
+  bool localcompact = true;
+  compacthelper(orig, to, &filledblocks, &heaptopptr, &localcompact);
   RUNFREE(orig);
   RUNFREE(to);
@@ -1135,4 +1222,183 @@ inline void gc_collect(struct garbagelist * stackptr)
   while(FINISHPHASE != gcphase) {}
 } // void gc_collect(struct garbagelist * stackptr)

+inline void gc(struct garbagelist * stackptr) {
+  // check if do gc
+  if(!gcflag) {
+    return;
+  }
+
+  // core coordinator routine
+  if(0 == BAMBOO_NUM_OF_CORE) {
+    if(!preGC()) {
+      // not ready to do gc
+      gcflag = true;
+      return;
+    }
+
+    initGC();
+
+    gcprocessing = true;
+    int i = 0;
+    waitconfirm = false;
+    numconfirm = 0;
+    gcphase = MARKPHASE;
+    for(i = 1; i < NUMCORES; i++) {
+      // send GC start messages to all cores
+      send_msg_1(i, GCSTART);
+    }
+    bool isfirst = true;
+    bool allStall = false;
+
+    // mark phase
+    while(MARKPHASE == gcphase) {
+      mark(isfirst, stackptr);
+      if(isfirst) {
+        isfirst = false;
+      }
+
+      // check gcstatus
+      checkMarkStatue();
+    } // while(MARKPHASE == gcphase)
+    // send msgs to all cores requiring large objs info
+    numconfirm = NUMCORES - 1;
+    for(i = 1; i < NUMCORES; ++i) {
+      send_msg_1(i, GCLOBJREQUEST);
+    }
+    while(numconfirm != 0) {} // wait for responses
+    // cache all large objs
+    if(!cacheLObjs()) {
+      // not enough space to cache large objs
+      BAMBOO_EXIT(0xd001);
+    }
+    // predict number of blocks to fill for each core
+    int numpbc = loadbalance();
+    for(i = 0; i < NUMCORES; ++i) {
+      //send start compact messages to all cores
+      if((gcheapdirection) && (i < gctopcore)
+         || ((!gcheapdirection) && (i > gctopcore))) {
+        gcstopblock[i] = numpbc + 1;
+        if(i != STARTUPCORE) {
+          send_msg_2(i, GCSTARTCOMPACT, numpbc+1);
+        }
+      } else {
+        gcstopblock[i] = numpbc;
+        if(i != STARTUPCORE) {
+          send_msg_2(i, GCSTARTCOMPACT, numpbc);
+        }
+      }
+      // init some data structures for compact phase
+      gcloads[i] = 0;
+      gcfilledblocks[i] = 0;
+      gcrequiredmems[i] = 0;
+    }
+
+    // compact phase
+    bool finalcompact = false;
+    // initialize pointers for compacting
+    struct moveHelper * orig =
+      (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+    struct moveHelper * to =
+      (struct moveHelper *)RUNMALLOC(sizeof(struct moveHelper));
+    initOrig_Dst(orig, to);
+    int filledblocks = 0;
+    INTPTR heaptopptr = 0;
+    bool localcompact = true;
+    bool finishcompact = false;
+    bool iscontinue = true;
+    while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
+      if((!finishcompact) && iscontinue) {
+        finishcompact = compacthelper(orig, to, &filledblocks,
+                                      &heaptopptr, &localcompact);
+      }
+
+      if(gc_checkCoreStatus()) {
+        // all cores have finished compacting
+        // restore the gcstatus of all cores
+        for(i = 0; i < NUMCORES; ++i) {
+          gccorestatus[i] = 1;
+        }
+        break;
+      } else {
+        // check if there are spare mem for pending move requests
+        if(COMPACTPHASE == gcphase) {
+          resolvePendingMoveRequest();
+        } else {
+          compact2Heaptop();
+        }
+      } // if(gc_checkCoreStatus()) else ...
+
+      if(gctomove) {
+        to->ptr = gcmovestartaddr;
+        to->numblocks = gcblock2fill - 1;
+        to->bound = (to->numblocks==0)?
+                    BAMBOO_SMEM_SIZE_L:
+                    BAMBOO_SMEM_SIZE_L+BAMBOO_SMEM_SIZE*to->numblocks;
+        BASEPTR(BAMBOO_NUM_OF_CORE, to->numblocks, &(to->base));
+        to->offset = to->ptr - to->base;
+        to->top = (to->numblocks==0)?
+                  (to->offset):(to->bound-BAMBOO_SMEM_SIZE+to->offset);
+        to->base = to->ptr;
+        to->offset = BAMBOO_CACHE_LINE_SIZE;
+        to->ptr += to->offset;   // for header
+        to->top += to->offset;
+        if(gcdstcore == BAMBOO_NUM_OF_CORE) {
+          localcompact = true;
+        } else {
+          localcompact = false;
+        }
+        gctomove = false;
+        iscontinue = true;
+      } else if(!finishcompact) {
+        // still pending
+        iscontinue = false;
+      } // if(gctomove)
+
+    } // while(COMPACTPHASE == gcphase)
+    // move largeObjs
+    moveLObjs();
+
+    gcphase = FLUSHPHASE;
+    for(i = 1; i < NUMCORES; ++i) {
+      // send start flush messages to all cores
+      send_msg_1(i, GCSTARTFLUSH);
+    }
+
+    // flush phase
+    flush();
+    gccorestatus[BAMBOO_NUM_OF_CORE] = 0;
+    while(FLUSHPHASE == gcphase) {
+      // check the status of all cores
+      allStall = true;
+      for(i = 0; i < NUMCORES; ++i) {
+        if(gccorestatus[i] != 0) {
+          allStall = false;
+          break;
+        }
+      }
+      if(allStall) {
+        break;
+      }
+    } // while(FLUSHPHASE == gcphase)
+    gcphase = FINISHPHASE;
+    for(i = 1; i < NUMCORES; ++i) {
+      // send gc finish messages to all cores
+      send_msg_1(i, GCFINISH);
+    }
+
+    // need to create free memory list
+    updateFreeMemList();
+  } else {
+    gcprocessing = true;
+    gc_collect(stackptr);
+  }
+
+  // invalidate all shared mem pointers
+  bamboo_cur_msp = NULL;
+  bamboo_smem_size = 0;
+
+  gcflag = false;
+  gcprocessing = false;
+
+} // void gc(struct garbagelist * stackptr)
+
 #endif
diff --git a/Robust/src/Runtime/multicoregarbage.h b/Robust/src/Runtime/multicoregarbage.h
index 5908fae9..f081e765 100644
--- a/Robust/src/Runtime/multicoregarbage.h
+++ b/Robust/src/Runtime/multicoregarbage.h
@@ -3,12 +3,7 @@
 #include "Queue.h"

 // data structures for GC
-#define BAMBOO_NUM_PAGES 1024 * 512
-#define BAMBOO_PAGE_SIZE 4096
-#define BAMBOO_SHARED_MEM_SIZE BAMBOO_PAGE_SIZE * BAMBOO_NUM_PAGES
-#define BAMBOO_BASE_VA 0xd000000
-#define BAMBOO_SMEM_SIZE 16 * BAMBOO_PAGE_SIZE
-#define BAMBOO_SMEM_SIZE_L 512 * BAMBOO_PAGE_SIZE
+#define BAMBOO_SMEM_SIZE_L 32 * BAMBOO_SMEM_SIZE
 #define BAMBOO_LARGE_SMEM_BOUND BAMBOO_SMEM_SIZE_L*NUMCORES  // NUMCORES=62

 #define NUMPTRS 100
@@ -58,12 +53,13 @@ int gclobjtailindex2=0;
 struct lobjpointerblock *gclobjspare=NULL;
 int gcnumlobjs = 0;

-enum GCPHASETYPE {
+typedef enum {
   MARKPHASE = 0x0,      // 0x0
   COMPACTPHASE,         // 0x1
-  FLUSHPHASE,           // 0x2
-  FINISHPHASE           // 0x3
-};
+  SUBTLECOMPACTPHASE,   // 0x2
+  FLUSHPHASE,           // 0x3
+  FINISHPHASE           // 0x4
+} GCPHASETYPE;

 volatile bool gcflag;
 volatile bool gcprocessing;
@@ -86,11 +82,15 @@ bool gcheapdirection; // 0: decrease; 1: increase

 // compact instruction
 INTPTR gcmarkedptrbound;
-int gcstopblock; // indicate when to stop compact phase
-int gcnumblocks[NUMCORES]; // indicate how many blocks have been fulfilled
+int gcblock2fill;
+int gcstopblock[NUMCORES];    // indicate when to stop compact phase
+int gcfilledblocks[NUMCORES]; // indicate how many blocks have been fulfilled
 // move instruction;
 INTPTR gcmovestartaddr;
+int gcdstcore;
 bool gctomove;
+int gcrequiredmems[NUMCORES]; // record pending mem requests
+int gcmovepending;

 // mapping of old address to new address
 struct RuntimeHash * gcpointertbl;
diff --git a/Robust/src/Runtime/multicoreruntime.c b/Robust/src/Runtime/multicoreruntime.c
index 0e84d7e3..3b1c9cf4 100644
--- a/Robust/src/Runtime/multicoreruntime.c
+++ b/Robust/src/Runtime/multicoreruntime.c
@@ -194,6 +194,7 @@ void * allocate_new(void * ptr, int type) {
   v->type=type;
   v->version = 0;
   v->lock = NULL;
+  initlock(v);
   return v;
 }
@@ -208,6 +209,7 @@ struct ArrayObject * allocate_newarray(void * ptr, int type, int length) {
     return NULL;
   }
   v->___length___=length;
+  initlock(v);
   return v;
 }
@@ -218,6 +220,7 @@ void * allocate_new(int type) {
   v->version = 0;
   //v->numlocks = 0;
   v->lock = NULL;
+  initlock(v);
   return v;
 }
@@ -230,6 +233,7 @@ struct ArrayObject * allocate_newarray(int type, int length) {
   //v->numlocks = 0;
   v->lock = NULL;
   v->___length___=length;
+  initlock(v);
   return v;
 }
 #endif
diff --git a/Robust/src/Runtime/multicoreruntime.h b/Robust/src/Runtime/multicoreruntime.h
index 15255eaf..64cae114 100644
--- a/Robust/src/Runtime/multicoreruntime.h
+++ b/Robust/src/Runtime/multicoreruntime.h
@@ -130,7 +130,7 @@ volatile bool isMsgSending;
  *          20 + orig large obj ptr + new large obj ptr
  *            (size is always 3 * sizeof(int))
  */
-enum MSGTYPE {
+typedef enum {
   TRANSOBJ = 0x0,     // 0x0
   TRANSTALL,          // 0x1
   LOCKREQUEST,        // 0x2
@@ -167,7 +167,7 @@
   GCLOBJMAPPING,      // 0x20
 #endif
   MSGEND
-};
+} MSGTYPE;

 // data structures of status for termination
 int corestatus[NUMCORES];   // records status of each core
@@ -202,6 +202,12 @@ bool lockflag;
 struct Queue objqueue;

 // data structures for shared memory allocation
+#define BAMBOO_NUM_PAGES 1024 * 512
+#define BAMBOO_PAGE_SIZE 4096
+#define BAMBOO_SHARED_MEM_SIZE BAMBOO_PAGE_SIZE * BAMBOO_NUM_PAGES
+#define BAMBOO_BASE_VA 0xd000000
+#define BAMBOO_SMEM_SIZE 16 * BAMBOO_PAGE_SIZE
+
 #ifdef MULTICORE_GC
 #include "multicoregarbage.h"
@@ -221,15 +227,9 @@
 struct freeMemList * bamboo_free_mem_list;
 INTPTR bamboo_cur_msp;
 int bamboo_smem_size;
 #else
-#define BAMBOO_NUM_PAGES 1024 * 512
-#define BAMBOO_PAGE_SIZE 4096
-#define BAMBOO_SHARED_MEM_SIZE BAMBOO_PAGE_SIZE * BAMBOO_PAGE_SIZE
-#define BAMBOO_BASE_VA 0xd000000
-#define BAMBOO_SMEM_SIZE 16 * BAMBOO_PAGE_SIZE
-
 bool smemflag;
 mspace bamboo_free_msp;
-mspace bamboo_cur_msp;
+INTPTR bamboo_cur_msp;
 int bamboo_smem_size;
 #endif
@@ -285,6 +285,7 @@ inline void initialization(void) __attribute__((always_inline));
 inline void initCommunication(void) __attribute__((always_inline));
 inline void fakeExecution(void) __attribute__((always_inline));
 inline void terminate(void) __attribute__((always_inline));
+inline void initlock(struct ___Object___ * v) __attribute__((always_inline));

 // lock related functions
 bool getreadlock(void* ptr);
@@ -293,6 +294,9 @@ bool getwritelock(void* ptr);
 void releasewritelock(void* ptr);
 bool getwritelock_I(void* ptr);
 void releasewritelock_I(void * ptr);
+#ifndef MULTICORE_GC
+void releasewritelock_r(void * lock, void * redirectlock);
+#endif

 /* this function is to process lock requests.
  * can only be invoked in receiveObject() */
 // if return -1: the lock request is redirected
@@ -303,11 +307,11 @@ inline int processlockrequest(int locktype,
                               int obj,
                               int requestcore,
                               int rootrequestcore,
-                              bool cache) __attribute_((always_inline));
+                              bool cache) __attribute__((always_inline));
 inline void processlockrelease(int locktype,
                                int lock,
                                int redirectlock,
-                               bool isredirect) __attribute_((always_inline));
+                               bool isredirect) __attribute__((always_inline));

 // msg related functions
 inline void send_hanging_msg() __attribute__((always_inline));
diff --git a/Robust/src/Runtime/multicoretask.c b/Robust/src/Runtime/multicoretask.c
index 924f7021..71d8acc5 100644
--- a/Robust/src/Runtime/multicoretask.c
+++ b/Robust/src/Runtime/multicoretask.c
@@ -19,6 +19,7 @@ int enqueuetasks_I(struct parameterwrapper *parameter,
                    int numenterflags);

 inline void initruntimedata() {
+  int i;
   // initialize the arrays
   if(STARTUPCORE == BAMBOO_NUM_OF_CORE) {
     // startup core to initialize corestatus[]
@@ -35,6 +36,8 @@ inline void initruntimedata() {
       gcnumsendobjs[i] = 0;
       gcnumreceiveobjs[i] = 0;
       gcloads[i] = 0;
+      gcrequiredmems[i] = 0;
+      gcstopblock[i] = 0;
 #endif
     } // for(i = 0; i < NUMCORES; ++i)
     numconfirm = 0;
@@ -82,27 +85,20 @@ inline void initruntimedata() {
   gcheaptop = 0;
   gctopcore = 0;
   gcheapdirection = 1;
-  gcstopblock = 0;
   gcreservedsb = 0;
   gcmovestartaddr = 0;
   gctomove = false;
-  gcstopblock = 0;
-
-  // initialize queue
-  if (gchead==NULL) {
-    gcheadindex=0;
-    gctailindex=0;
-    gctailindex2 = 0;
-    gchead=gctail=gctail2=malloc(sizeof(struct pointerblock));
-  }
-  // initialize the large obj queues
-  if (gclobjhead==NULL) {
-    gclobjheadindex=0;
-    gclobjtailindex=0;
-    gclobjtailindex2 = 0;
-    gclobjhead=gclobjtail=gclobjtail2=
-      malloc(sizeof(struct lobjpointerblock));
-  }
+  gchead = gctail = gctail2 = NULL;
+  gclobjhead = gclobjtail = gclobjtail2 = NULL;
+  gcheadindex=0;
+  gctailindex=0;
+  gctailindex2 = 0;
+  gclobjheadindex=0;
+  gclobjtailindex=0;
+  gclobjtailindex2 = 0;
+  gcmovepending = 0;
+  gcblock2fill = 0;
 #else
   // create the lock table, lockresult table and obj queue
   locktable.size = 20;
@@ -282,7 +278,7 @@ objqueuebreak:
   return rflag;
 }

-inline void checkCoreStatue() {
+inline void checkCoreStatus() {
   bool allStall = false;
   int i = 0;
   int sumsendobj = 0;
@@ -824,7 +820,6 @@ struct ___TagDescriptor___ * allocate_tag(void *ptr,
 struct ___TagDescriptor___ * allocate_tag(int index) {
   struct ___TagDescriptor___ * v=FREEMALLOC(classsize[TAGTYPE]);
 #endif
-  struct ___TagDescriptor___ * v=FREEMALLOC(classsize[TAGTYPE]);
   v->type=TAGTYPE;
   v->flag=index;
   return v;
@@ -1167,7 +1162,7 @@ void * smemalloc(int size,
   if(freemem != NULL) {
     void * mem = (void *)(freemem->ptr);
     *allocsize = size;
-    freemem->ptr += size;
+    freemem->ptr = ((void*)freemem->ptr) + size;
     freemem->size -= size;
     // check how many blocks it acrosses
     int b = 0;
@@ -1700,7 +1695,7 @@ msg:
   case GCSTARTCOMPACT: {
     // a compact phase start msg
-    gcstopblock = msgdata[1];
+    gcblock2fill = msgdata[1];
     gcphase = COMPACTPHASE;
     break;
   }
@@ -1737,23 +1732,57 @@ msg:
       BAMBOO_DEBUGPRINT_REG(msgdata[1]);
 #endif
       BAMBOO_EXIT(0xb006);
-    }
-    if(msgdata[1] < NUMCORES) {
-      gcnumblocks[msgdata[1]] = msgdata[2];
-      if(msgdata[3] == 0) {
+    }
+    int cnum = msgdata[1];
+    int filledblocks = msgdata[2];
+    int heaptop = msgdata[3];
+    int data4 = msgdata[4];
+    if(cnum < NUMCORES) {
+      if(COMPACTPHASE == gcphase) {
+        gcfilledblocks[cnum] = filledblocks;
+        gcloads[cnum] = heaptop;
+      }
+      if(data4 > 0) {
         // ask for more mem
         int startaddr = 0;
         int tomove = 0;
-        if(findSpareMem(&startaddr, &tomove, msgdata[2])) {
-          send_msg_4(msgdata[1], GCMOVESTART, k, startaddr, tomove);
-        } else {
-          // TODO if not success
+        int dstcore = 0;
+        if(findSpareMem(&startaddr, &tomove, &dstcore, data4, cnum)) {
+          send_msg_4(cnum, GCMOVESTART, dstcore, startaddr, tomove);
         }
       } else {
-        gccorestatus[msgdata[1]] = 0;
-        gcloads[msgdata[1]] = msgdata[4];
-      }
-    }
+        gccorestatus[cnum] = 0;
+        // check if there is pending move request
+        if(gcmovepending > 0) {
+          int j;
+          for(j = 0; j < NUMCORES; j++) {
+            if(gcrequiredmems[j]>0) {
+              break;
+            }
+          }
+          if(j < NUMCORES) {
+            // find match
+            int tomove = 0;
+            int startaddr = 0;
+            gcrequiredmems[j] = assignSpareMem(cnum,
+                                               gcrequiredmems[j],
+                                               &tomove,
+                                               &startaddr);
+            if(STARTUPCORE == j) {
+              gcdstcore = cnum;
+              gctomove = true;
+              gcmovestartaddr = startaddr;
+              gcblock2fill = tomove;
+            } else {
+              send_msg_4(j, GCMOVESTART, cnum, startaddr, tomove);
+            } // if(STARTUPCORE == j)
+            if(gcrequiredmems[j] == 0) {
+              gcmovepending--;
+            }
+          } // if(j < NUMCORES)
+        } // if(gcmovepending > 0)
+      } // if(data4 > 0) else ...
+    } // if(cnum < NUMCORES)
     break;
   }
@@ -1829,8 +1858,9 @@ msg:
   case GCMOVESTART: {
     // received a start moving objs msg
     gctomove = true;
+    gcdstcore = msgdata[1];
     gcmovestartaddr = msgdata[2];
-    gcstopblock = msgdata[3];
+    gcblock2fill = msgdata[3];
     break;
   }
@@ -2122,6 +2152,58 @@ backtrackinc:

 int containstag(struct ___Object___ *ptr,
                 struct ___TagDescriptor___ *tag);

+#ifndef MULTICORE_GC
+void releasewritelock_r(void * lock, void * redirectlock) {
+  int targetcore = 0;
+  int reallock = (int)lock;
+  targetcore = (reallock >> 5) % BAMBOO_TOTALCORE;
+
+#ifdef DEBUG
+  BAMBOO_DEBUGPRINT(0xe671);
+  BAMBOO_DEBUGPRINT_REG((int)lock);
+  BAMBOO_DEBUGPRINT_REG(reallock);
+  BAMBOO_DEBUGPRINT_REG(targetcore);
+#endif
+
+  if(targetcore == BAMBOO_NUM_OF_CORE) {
+    BAMBOO_START_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf001);
+#endif
+    // reside on this core
+    if(!RuntimeHashcontainskey(locktbl, reallock)) {
+      // no locks for this object, something is wrong
+      BAMBOO_EXIT(0xa01d);
+    } else {
+      int rwlock_obj = 0;
+      struct LockValue * lockvalue = NULL;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT(0xe672);
+#endif
+      RuntimeHashget(locktbl, reallock, &rwlock_obj);
+      lockvalue = (struct LockValue *)rwlock_obj;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+      lockvalue->value++;
+      lockvalue->redirectlock = (int)redirectlock;
+#ifdef DEBUG
+      BAMBOO_DEBUGPRINT_REG(lockvalue->value);
+#endif
+    }
+    BAMBOO_CLOSE_CRITICAL_SECTION_LOCK();
+#ifdef DEBUG
+    BAMBOO_DEBUGPRINT(0xf000);
+#endif
+    return;
+  } else {
+    // send lock release with redirect info msg
+    // for 32 bit machine, the size is always 4 words
+    send_msg_4(targetcore, REDIRECTRELEASE, 1, (int)lock, (int)redirectlock);
+  }
+}
+#endif
+
 void executetasks() {
   void * taskpointerarray[MAXTASKPARAMS+OFFSET];
   int numparams=0;
@@ -2424,18 +2506,22 @@ execute:
       BAMBOO_DEBUGPRINT(0xe999);
 #endif
     for(i = 0; i < locklen; ++i) {
-      void * ptr = (void *)(locks[i].redirectlock);
+      void * ptr = (void *)(locks[i].redirectlock);
       int * lock = (int *)(locks[i].value);
 #ifdef DEBUG
       BAMBOO_DEBUGPRINT_REG((int)ptr);
       BAMBOO_DEBUGPRINT_REG((int)lock);
 #endif
+#ifndef MULTICORE_GC
       if(RuntimeHashcontainskey(lockRedirectTbl, (int)lock)) {
        int redirectlock;
        RuntimeHashget(lockRedirectTbl, (int)lock, &redirectlock);
        RuntimeHashremovekey(lockRedirectTbl, (int)lock);
        releasewritelock_r(lock, (int *)redirectlock);
      } else {
+#else
+      {
+#endif
        releasewritelock(ptr);
      }
    }
diff --git a/Robust/src/buildscript b/Robust/src/buildscript
index 6d9ae67e..d87399f2 100755
--- a/Robust/src/buildscript
+++ b/Robust/src/buildscript
@@ -40,6 +40,7 @@ echo "-tilera generate tilera version binary (should be used together with -multicore)"
 echo "-tileraconfig config tilera simulator/pci as nxm (should be used together with -tilera)"
 echo "-raw generate raw version binary (should be used together with -multicore)"
 echo "-rawconfig config raw simulator as 4xn (should be used together with -raw)"
+echo "-multicoregc enable garbage collection in multicore version"
 echo -threadsimulate generate multi-thread simulate version binary
 echo -optional enable optional
 echo -debug generate debug symbols
@@ -83,6 +84,7 @@ RECOVERFLAG=false
 MLP_ON=false
 MLPDEBUG=false
 MULTICOREFLAG=false
+MULTICOREGCFLAG=false
 RAWFLAG=false
 TILERAFLAG=false
 TILERACONFIG=''
@@ -237,6 +239,10 @@ elif [[ $1 = '-multicore' ]]
 then
 MULTICOREFLAG=true
 JAVAOPTS="$JAVAOPTS -multicore"
+elif [[ $1 = '-multicoregc' ]]
+then
+MULTICOREGCFLAG=true
+JAVAOPTS="$JAVAOPTS -multicoregc"
 elif [[ $1 = '-numcore' ]]
 then
 JAVAOPTS="$JAVAOPTS -numcore $2"
@@ -604,10 +610,18 @@ then #INTERRUPT version
 TILERACFLAGS="${TILERACFLAGS} -DINTERRUPT"
 fi #INTERRUPT version

+if $MULTICOREGCFLAG
+then #MULTICORE_GC version
+TILERACFLAGS="${TILERACFLAGS} -DMULTICORE_GC"
+fi #MULTICORE_GC version
+
 cp $ROBUSTROOT/Tilera/Runtime/$MAKEFILE ./Makefile
 cp $ROBUSTROOT/Tilera/Runtime/$SIMHVC ./sim.hvc
 cp $ROBUSTROOT/Tilera/Runtime/$PCIHVC ./pci.hvc
 cp $ROBUSTROOT/Tilera/Runtime/bamboo-vmlinux-pci.hvc ./bamboo-vmlinux-pci.hvc
+cp ../Tilera/Runtime/*.c ./
+cp ../Tilera/Runtime/*.h ./
+cp ../Tilera/lib/* ./
 cp ../Runtime/multicoretask.c ./
 cp ../Runtime/multicoreruntime.c ./
 cp ../Runtime/Queue.c ./
@@ -619,6 +633,7 @@ cp ../Runtime/SimpleHash.c ./
 cp ../Runtime/ObjectHash.c ./
 cp ../Runtime/socket.c ./
 cp ../Runtime/mem.c ./
+cp ../Runtime/multicoregarbage.c ./
 cp ../Runtime/GenericHashtable.h ./
 cp ../Runtime/mem.h ./
 cp ../Runtime/multicoreruntime.h ./
@@ -627,9 +642,7 @@ cp ../Runtime/ObjectHash.h ./
 cp ../Runtime/Queue.h ./
 cp ../Runtime/runtime.h ./
 cp ../Runtime/SimpleHash.h ./
-cp ../Tilera/Runtime/*.c ./
-cp ../Tilera/Runtime/*.h ./
-cp ../Tilera/lib/* ./
+cp ../Runtime/multicoregarbage.h ./
 cp ../tmpbuilddirectory/*.c ./
 cp ../tmpbuilddirectory/*.h ./
-- 
2.34.1
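
Note (commentary, not part of the commit itself): with this patch applied, the Tilera build enables the collector by defining MULTICORE_GC, which links in multicoregarbage.c and swaps the mspace allocator for the free-memory-list allocator. A build that exercises the new GC path would be invoked roughly as follows; the benchmark file name and the core/config values are only illustrative:

    ./buildscript -multicore -numcore 56 -tilera -tileraconfig 8x7 -multicoregc MyBenchmark.java

The coordinator-side control flow this commit completes can be summarized by the sketch below. It only restates the structure of gc() in multicoregarbage.c; compact_own_blocks() is a hypothetical stand-in for the compacthelper() call with its moveHelper bookkeeping:

    /* Sketch of the compact-phase coordination on the startup core.
     * gcphase advances MARKPHASE -> COMPACTPHASE -> SUBTLECOMPACTPHASE
     * (entered only when no core has spare blocks left) -> FLUSHPHASE
     * -> FINISHPHASE. */
    while((COMPACTPHASE == gcphase) || (SUBTLECOMPACTPHASE == gcphase)) {
      compact_own_blocks();          /* compacthelper() on this core */
      if(gc_checkCoreStatus()) {
        break;                       /* every gccorestatus[i] == 0: done */
      }
      if(COMPACTPHASE == gcphase) {
        resolvePendingMoveRequest(); /* match spare blocks to pending moves */
      } else {
        compact2Heaptop();           /* no spare blocks: pack to the heap top */
      }
    }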