changes
[IRC.git] / Robust / src / Runtime / bamboo / multicorecache.c
1 #ifdef GC_CACHE_ADAPT
2 #include "multicorecache.h"
3 #include "multicoremsg.h"
4 #include "multicoregc.h"
5 #include "multicoregcprofile.h"
6
7 void cacheadapt_finish_compact(void *toptr) {
8   unsigned int dstpage=((unsigned INTPTR)(toptr-gcbaseva))>>BAMBOO_PAGE_SIZE_BITS;
9   unsigned int * newtable=&gccachesamplingtbl_r[dstpage*NUMCORESACTIVE];
10
11   for(int core = 0; core < NUMCORESACTIVE; core++) {
12     (*newtable)=(*newtable)>>6;
13     newtable++;
14   }  
15 }
16
17 void cacheadapt_finish_src_page(void *srcptr, void *tostart, void *tofinish) {
18   unsigned int srcpage=((unsigned INTPTR)(srcptr-gcbaseva))>>BAMBOO_PAGE_SIZE_BITS;
19   unsigned int dstpage=((unsigned INTPTR)(tostart-gcbaseva))>>BAMBOO_PAGE_SIZE_BITS;
20   unsigned int numbytes=tofinish-tostart;
21   
22   unsigned int * oldtable=&gccachesamplingtbl[srcpage*NUMCORESACTIVE];
23   unsigned int * newtable=&gccachesamplingtbl_r[dstpage*NUMCORESACTIVE];
24   
25   unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
26
27   for(int core = 0; core < NUMCORESACTIVE; core++) {
28     (*newtable)+=page64th*(*oldtable);
29     newtable++;
30     oldtable++;
31   }  
32 }
33
34 /* Bytes needed equal to zero is a special case...  It means that we should finish the dst page */
35
36 void cacheadapt_finish_dst_page(void *origptr, void *tostart, void *toptr, unsigned int bytesneeded) {
37   unsigned int numbytes=toptr-tostart;
38
39   void *tobound=(void *)((((unsigned INTPTR)toptr-1)&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
40   void *origbound=(void *)((((unsigned INTPTR)origptr)&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
41   
42   unsigned int topage=((unsigned INTPTR)(toptr-1-gcbaseva))>>BAMBOO_PAGE_SIZE_BITS; 
43   unsigned int origpage=((unsigned INTPTR)(origptr-gcbaseva))>>BAMBOO_PAGE_SIZE_BITS;
44
45   unsigned int * totable=&gccachesamplingtbl_r[topage*NUMCORESACTIVE];
46   unsigned int * origtable=&gccachesamplingtbl[origpage*NUMCORESACTIVE];
47
48   //handler
49   unsigned int remaintobytes=(bytesneeded==0)?0:(tobound-toptr);
50   unsigned int remainorigbytes=origbound-origptr;
51
52   do {
53     //round source bytes down....don't want to close out page if not necessary
54     remainorigbytes=(remainorigbytes>bytesneeded)?bytesneeded:remainorigbytes;
55
56     if (remaintobytes<=remainorigbytes) {
57       //Need to close out to page
58
59       numbytes+=remaintobytes;
60       unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
61
62       for(int core = 0; core < NUMCORESACTIVE; core++) {
63         (*totable)=(*totable+page64th*(*origtable))>>6;
64         totable++;
65         origtable++;
66       }
67       toptr+=remaintobytes;
68       origptr+=remaintobytes;
69       bytesneeded-=remaintobytes;
70       topage++;//to page is definitely done
71       tobound+=BAMBOO_PAGE_SIZE;
72       origpage=((unsigned INTPTR)(origptr-gcbaseva))>>BAMBOO_PAGE_SIZE_BITS;//handle exact match case
73       origbound=(void *) ((((unsigned INTPTR)origptr)&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
74     } else {
75       //Finishing off orig page
76
77       numbytes+=remainorigbytes;
78       unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
79       
80       for(int core = 0; core < NUMCORESACTIVE; core++) {
81         (*totable)+=page64th*(*origtable);
82         totable++;
83         origtable++;
84       }
85       toptr+=remainorigbytes;
86       origptr+=remainorigbytes;
87       bytesneeded-=remainorigbytes;
88       origpage++;//just orig page is done
89       origbound+=BAMBOO_PAGE_SIZE;
90     }
91     totable=&gccachesamplingtbl_r[topage*NUMCORESACTIVE];
92     origtable=&gccachesamplingtbl[origpage*NUMCORESACTIVE];
93     
94     remaintobytes=tobound-toptr;
95     remainorigbytes=origbound-origptr;
96     
97     numbytes=0;
98   } while(bytesneeded!=0);
99 }
100
// Prepare for cache adaptation:
//   -- flush the shared heap (L2 flush)
//   -- clean the dtlb entries
//   -- install the dtlb handler matching the upcoming stage:
//      the gc handler when isgccachestage is true, the mutator handler
//      otherwise
void cacheAdapt_gc(bool isgccachestage) {
  // flush the shared heap
  BAMBOO_CACHE_FLUSH_L2();

  // clean the dtlb entries
  BAMBOO_CLEAN_DTLB();

  if(isgccachestage) {
    bamboo_install_dtlb_handler_for_gc();
  } else {
    bamboo_install_dtlb_handler_for_mutator();
  }
}
118
119 // the master core decides how to adapt cache strategy for the mutator 
120 // according to collected statistic data
121
// Find the core that accesses page #page_index the most.
//   page_index  : page whose row of gccachesamplingtbl_r is scanned
//   hottestcore : lvalue, set to the index of the core with the largest count
//   hotfreq     : lvalue, running maximum; it is only replaced by strictly
//                 larger counts, so ties keep the lowest core index and a
//                 page with no count above the initial value leaves both
//                 outputs untouched
// The expansion declares the locals local_tbl, i and freq; do not pass
// arguments with those names.  Expands to a single statement, so it is safe
// in an unbraced if/else.
#define CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq) \
  do { \
    unsigned int *local_tbl=&gccachesamplingtbl_r[(page_index)*NUMCORESACTIVE]; \
    for(int i = 0; i < NUMCORESACTIVE; i++) { \
      unsigned int freq = *local_tbl; \
      local_tbl++; \
      if((hotfreq) < freq) { \
        (hotfreq) = freq; \
        (hottestcore) = i; \
      } \
    } \
  } while(0)
// Find the core that accesses page #page_index the most and, in the same
// pass, compute the total access count of the page.
//   page_index  : page whose row of gccachesamplingtbl_r is scanned
//   hottestcore : lvalue, set to the index of the core with the largest count
//   hotfreq     : lvalue, running maximum (only strictly larger counts
//                 replace it, so ties keep the lowest core index)
//   totalfreq   : lvalue, every core's count is added to it
// The expansion declares the locals local_tbl, i and freq; do not pass
// arguments with those names.  Expands to a single statement, so it is safe
// in an unbraced if/else.
#define CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq) \
  do { \
    unsigned int *local_tbl=&gccachesamplingtbl_r[(page_index)*NUMCORESACTIVE]; \
    for(int i = 0; i < NUMCORESACTIVE; i++) { \
      unsigned int freq = *local_tbl; \
      local_tbl++; \
      (totalfreq) += freq; \
      if((hotfreq) < freq) { \
        (hotfreq) = freq; \
        (hottestcore) = i; \
      } \
    } \
  } while(0)
// Set the policy so that the page is hosted by core coren:
// cache_mode becomes BAMBOO_CACHE_MODE_COORDS and lotar_x/lotar_y are taken
// from bamboo_cpu2coords.
// NOTE: the stored coordinates are (x+1, y+1), not (x, y).
// Expands to a single statement, so it is safe in an unbraced if/else.
#define CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren) \
  do { \
    (policy).cache_mode = BAMBOO_CACHE_MODE_COORDS; \
    (policy).lotar_x = bamboo_cpu2coords[2*(coren)]+1; \
    (policy).lotar_y = bamboo_cpu2coords[2*(coren)+1]+1; \
  } while(0)
// Store the new policy word for page #page_index in the policy table
// starting at tmp_p (the table is accessed as an array of int).
// Expands to a single statement, so it is safe in an unbraced if/else.
#define CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy) \
  do { \
    ((int*)(tmp_p))[(page_index)] = (policy).word; \
  } while(0)
163
164 // make all pages hfh
165 void cacheAdapt_policy_h4h(int coren){
166   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
167   unsigned int page_gap=page_num/NUMCORESACTIVE;
168   unsigned int page_index=page_gap*coren;
169   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
170   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
171   unsigned int * tmp_p = gccachepolicytbl;
172   for(; page_index < page_index_end; page_index++) {
173     bamboo_cache_policy_t policy = {0};
174     policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
175     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
176     page_sva += BAMBOO_PAGE_SIZE;
177   }
178
179
180 // make all pages local as non-cache-adaptable gc local mode
181 void cacheAdapt_policy_local(int coren){
182   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
183   unsigned int page_gap=page_num/NUMCORESACTIVE;
184   unsigned int page_index=page_gap*coren;
185   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
186   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
187   unsigned int * tmp_p = gccachepolicytbl;
188   for(; page_index < page_index_end; page_index++) {
189     bamboo_cache_policy_t policy = {0};
190     unsigned int block = 0;
191     BLOCKINDEX(block, (void *) page_sva);
192     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
193     CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
194     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
195     page_sva += BAMBOO_PAGE_SIZE;
196   }
197
198
199 void cacheAdapt_policy_hottest(int coren){
200   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
201   unsigned int page_gap=page_num/NUMCORESACTIVE;
202   unsigned int page_index=page_gap*coren;
203   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
204   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
205   unsigned int * tmp_p = gccachepolicytbl;
206   for(; page_index < page_index_end; page_index++) {
207     bamboo_cache_policy_t policy = {0};
208     unsigned int hottestcore = 0;
209     unsigned int hotfreq = 0;
210     CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq);
211     // TODO
212     // Decide the cache strategy for this page
213     // If decide to adapt a new cache strategy, write into the shared block of
214     // the gcsharedsamplingtbl. The mem recording information that has been 
215     // written is enough to hold the information.
216     // Format: page start va + cache strategy(hfh/(host core+[x,y]))
217     if(hotfreq != 0) {
218       // locally cache the page in the hottest core
219       CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
220     } else {
221       // reset it to be homed by its host core
222       unsigned int block = 0;
223       BLOCKINDEX(block, (void *) page_sva);
224       unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
225       CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
226     }
227     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
228     page_sva += BAMBOO_PAGE_SIZE;
229   }
230
231
232 #define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  2
233 // cache the page on the core that accesses it the most if that core accesses 
234 // it more than (GC_CACHE_ADAPT_DOMINATE_THRESHOLD)% of the total.  Otherwise,
235 // h4h the page.
236 void cacheAdapt_policy_dominate(int coren){
237   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
238   unsigned int page_gap=page_num/NUMCORESACTIVE;
239   unsigned int page_index=page_gap*coren;
240   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
241   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
242   unsigned int * tmp_p = gccachepolicytbl;
243   for(; page_index < page_index_end; page_index++) {
244     bamboo_cache_policy_t policy = {0};
245     unsigned int hottestcore = 0;
246     unsigned int totalfreq = 0;
247     unsigned int hotfreq = 0;
248     CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
249     // Decide the cache strategy for this page
250     // If decide to adapt a new cache strategy, write into the shared block of
251     // the gcpolicytbl 
252     // Format: page start va + cache policy
253     if(hotfreq != 0) {
254       totalfreq=totalfreq>>GC_CACHE_ADAPT_DOMINATE_THRESHOLD;
255       if(hotfreq < totalfreq) {
256         // use hfh
257         policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
258         /*unsigned int block = 0;
259         BLOCKINDEX(block, (void *) page_sva);
260         unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
261         CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);*/
262       } else {
263         // locally cache the page in the hottest core
264         CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
265       }     
266     } else {
267       // reset it to be homed by its host core
268       unsigned int block = 0;
269       BLOCKINDEX(block, (void *) page_sva);
270       unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
271       CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
272     }
273     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
274     page_sva += BAMBOO_PAGE_SIZE;
275   }
276 }
277
// Decide the new cache strategy for the pages handled by core coren, based
// on the collected sampling data.
// Only GC_CACHE_ADAPT_POLICY4 (dominate) currently runs a policy function;
// the calls for policies 1-3 are disabled below.
// Returns 0.  The function is declared to return unsigned int but
// previously fell off the end without returning a value, which is undefined
// behaviour as soon as a caller uses the result.
unsigned int cacheAdapt_decision(int coren) {
  BAMBOO_CACHE_MF();
  // check the statistic data
  // for each page, decide the new cache strategy
#ifdef GC_CACHE_ADAPT_POLICY1
  //  cacheAdapt_policy_h4h(coren);
#elif defined(GC_CACHE_ADAPT_POLICY2)
  //cacheAdapt_policy_local(coren);
#elif defined(GC_CACHE_ADAPT_POLICY3)
  //cacheAdapt_policy_hottest(coren);
#elif defined(GC_CACHE_ADAPT_POLICY4)
  cacheAdapt_policy_dominate(coren);
#endif
  return 0;
}
292
// Apply the cache policies recorded in gccachepolicytbl for the mutator.
// Walks the heap from gcbaseva up to gctopva one page at a time, reading
// one policy word per page; pages whose policy word is 0 are left alone.
// Does nothing unless GC_CACHE_ADAPT_POLICY3 or GC_CACHE_ADAPT_POLICY4 is
// defined.
void cacheAdapt_mutator() {
#if (defined(GC_CACHE_ADAPT_POLICY4)||defined(GC_CACHE_ADAPT_POLICY3))
  BAMBOO_CACHE_MF();
  unsigned int * entry = gccachepolicytbl;
  // NOTE(review): the policy functions keep page addresses in a VA, this
  // one uses unsigned int -- confirm the two are the same width.
  unsigned int va = gcbaseva;
  while(va<gctopva) {
    // read out the policy stored for this page
    bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(entry));
    if(policy.word != 0) {
      // a policy was chosen for this page: apply it
      bamboo_adapt_cache_policy(va,policy,BAMBOO_PAGE_SIZE);
    }
    entry += 1;
    va+=BAMBOO_PAGE_SIZE;
  }
#endif
}
311
312 // Cache adapt phase process for clients
313 void cacheAdapt_phase_client() {
314   WAITFORGCPHASE(CACHEPOLICYPHASE);
315   GC_PRINTF("Start cachepolicy phase\n");
316   cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
317   //send init finish msg to core coordinator
318   send_msg_2(STARTUPCORE, GCFINISHCACHEPOLICY, BAMBOO_NUM_OF_CORE);
319   GC_PRINTF("Finish cachepolicy phase\n");
320
321   WAITFORGCPHASE(PREFINISHPHASE);
322   GC_PRINTF("Start prefinish phase\n");
323   // cache adapt phase
324   cacheAdapt_mutator();
325   cacheAdapt_gc(false);
326   //send init finish msg to core coordinator
327   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE);
328   GC_PRINTF("Finish prefinish phase\n");
329
330 #if (defined(GC_CACHE_ADAPT_POLICY4)||defined(GC_CACHE_ADAPT_POLICY3))
331   CACHEADAPT_SAMPLING_RESET();
332   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
333     // zero out the gccachesamplingtbl
334     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);  
335     BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,size_cachesamplingtbl_local_r);
336   }
337 #endif
338 }
339
340 extern unsigned long long gc_output_cache_policy_time;
341
342 // Cache adpat phase process for the master
343 void cacheAdapt_phase_master() {
344   GCPROFILE_ITEM_MASTER();
345   unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
346   CACHEADAPT_OUTPUT_CACHE_SAMPLING_R();
347   gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
348   // let all cores to parallelly process the revised profile data and decide 
349   // the cache policy for each page
350   gc_status_info.gcphase = CACHEPOLICYPHASE;
351   GC_SEND_MSG_1_TO_CLIENT(GCSTARTCACHEPOLICY);
352   GC_PRINTF("Start cachepolicy phase \n");
353   // cache adapt phase
354   cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
355   GC_CHECK_ALL_CORE_STATUS();
356   BAMBOO_CACHE_MF();
357
358   // let all cores to adopt new policies
359   gc_status_info.gcphase = PREFINISHPHASE;
360   // Note: all cores should flush their runtime data including non-gc cores
361   GC_SEND_MSG_1_TO_CLIENT(GCSTARTPREF);
362   GC_PRINTF("Start prefinish phase \n");
363   // cache adapt phase
364   cacheAdapt_mutator();
365   cacheAdapt_gc(false);
366   GC_CHECK_ALL_CORE_STATUS();
367   
368 #if (defined(GC_CACHE_ADAPT_POLICY4)||defined(GC_CACHE_ADAPT_POLICY3))
369   CACHEADAPT_SAMPLING_RESET();
370   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
371     // zero out the gccachesamplingtbl
372     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
373     BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,size_cachesamplingtbl_local_r);
374     BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
375   }
376 #endif
377 }
378
379 // output original cache sampling data for each page
380 void gc_output_cache_sampling() {
381   extern volatile bool gc_profile_flag;
382   if(!gc_profile_flag) return;
383   unsigned int page_index = 0;
384   VA page_sva = 0;
385   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) >> (BAMBOO_PAGE_SIZE_BITS);
386   for(page_index = 0; page_index < page_num; page_index++) {
387     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
388     unsigned int block = 0;
389     BLOCKINDEX(block, (void *) page_sva);
390     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
391     //printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
392     unsigned int * local_tbl = &gccachesamplingtbl[page_index*NUMCORESACTIVE];
393     int accesscore = 0;
394     for(int i = 0; i < NUMCORESACTIVE; i++) {
395       int freq = *local_tbl;
396       local_tbl++;
397       if(freq != 0) {
398         accesscore++;
399         //printf("%d,  ", freq);
400       }
401     }
402     if(accesscore!=0) {
403       printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
404       unsigned int * local_tbl = &gccachesamplingtbl[page_index*NUMCORESACTIVE];
405       for(int i = 0; i < NUMCORESACTIVE; i++) {
406         unsigned int freq = *local_tbl;
407         local_tbl++;
408         printf("%u,  ", freq);
409       }
410       printf("\n");
411     }
412     //printf("\n");
413   }
414   printf("=================\n");
415
416
417 // output revised cache sampling data for each page after compaction
418 void gc_output_cache_sampling_r() {
419   extern volatile bool gc_profile_flag;
420   if(!gc_profile_flag) return;
421   // TODO summary data
422   unsigned int sumdata[NUMCORESACTIVE][NUMCORESACTIVE];
423   for(int i = 0; i < NUMCORESACTIVE; i++) {
424     for(int j = 0; j < NUMCORESACTIVE; j++) {
425       sumdata[i][j] = 0;
426     }
427   }
428   tprintf("cache sampling_r \n");
429   unsigned int page_index = 0;
430   VA page_sva = 0;
431   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) >> (BAMBOO_PAGE_SIZE_BITS);
432   for(page_index = 0; page_index < page_num; page_index++) {
433     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
434     unsigned int block = 0;
435     BLOCKINDEX(block, (void *)page_sva);
436     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
437     //printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
438     int accesscore = 0; // TODO
439     unsigned int * local_tbl = &gccachesamplingtbl_r[page_index*NUMCORESACTIVE];
440     for(int i = 0; i < NUMCORESACTIVE; i++) {
441       unsigned int freq = *local_tbl; 
442       //printf("%d,  ", freq);
443       if(freq != 0) {
444         accesscore++;// TODO
445       }
446       local_tbl++;
447     }
448     if(accesscore!=0) {
449       printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
450       unsigned int * local_tbl = &gccachesamplingtbl_r[page_index*NUMCORESACTIVE];
451       for(int i = 0; i < NUMCORESACTIVE; i++) {
452         unsigned int freq = *local_tbl;
453         printf("%u,  ", freq);
454         sumdata[accesscore-1][i]+=freq;
455         local_tbl++;
456       }
457       printf("\n");
458     }  
459     //printf("\n");
460   }
461   printf("+++++\n");
462   // TODO printout the summary data
463   for(int i = 0; i < NUMCORESACTIVE; i++) {
464     printf("%d  ", i);
465     for(int j = 0; j < NUMCORESACTIVE; j++) {
466       printf(" %u  ", sumdata[j][i]);
467     }
468     printf("\n");
469   }
470   printf("=================\n");
471
472 #endif // GC_CACHE_ADAPT