377bc698a11271e5b41eecf1dd11a9972d1d9aeb
[IRC.git] / Robust / src / Runtime / bamboo / multicorecache.c
1 #ifdef GC_CACHE_ADAPT
2 #include "multicorecache.h"
3 #include "multicoremsg.h"
4 #include "multicoregcprofile.h"
5
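// Fold the per-core access samples recorded for the source page into the
// revised-table entry of the destination page the data was copied to,
// weighted by the copied size expressed in 1/64ths of a page.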
6 void cacheadapt_finish_src_page(void *srcptr, void *tostart, void *tofinish) {
7   unsigned int srcpage=(srcptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;
8   unsigned int dstpage=(tostart-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;
9   unsigned int numbytes=tofinish-tostart;
10   
11   unsigned int * oldtable=&gccachesamplingtbl[srcpage*NUMCORESACTIVE];
12   unsigned int * newtable=&gccachesamplingtbl_r[dstpage*NUMCORESACTIVE];
13   
14   unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
15
16   for(int core = 0; core < NUMCORESACTIVE; core++) {
17     (*newtable)+=page64th*(*oldtable);
18     newtable++;
19     oldtable++;
20   }  
21 }
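// Illustration of the 1/64-page weighting above (hypothetical page size): with
// BAMBOO_PAGE_SIZE_BITS == 12 (4 KB pages), copying numbytes == 1024 gives
// page64th = 1024 >> (12-6) = 16, so the destination page inherits
// 16/64 = 1/4 of the source page's per-core counts.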
22
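// Walk the destination and source pages spanned by a copy of bytesneeded
// bytes, folding source-page samples into the destination pages' revised
// entries.  The loop below alternates between two cases: when the destination
// page fills up first, its entry is closed out (scaled down by 64); when the
// source page runs out first, its remaining samples are accumulated without
// closing out the destination entry.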
23 void cacheadapt_finish_dst_page(void *origptr, void *tostart, void *toptr, unsigned int bytesneeded) {
24   unsigned int numbytes=toptr-tostart;
25
26   void *tobound=(void *)(((unsigned int)tostart&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
27   void *origbound=(void *)(((unsigned int)origptr&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
28   
29   unsigned int topage=(tostart-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;
30   unsigned int origpage=(origptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;
31
32   unsigned int * totable=&gccachesamplingtbl_r[topage*NUMCORESACTIVE];
33   unsigned int * origtable=&gccachesamplingtbl[origpage*NUMCORESACTIVE];
34
35   unsigned int remaintobytes=tobound-toptr;
36   unsigned int remainorigbytes=origbound-origptr;
37
38   do {
39     // clamp the source bytes to what is still needed; don't close out the orig page unnecessarily
40     remainorigbytes=(remainorigbytes>bytesneeded)?bytesneeded:remainorigbytes;
41
42     if (remaintobytes<=remainorigbytes) {
43       // need to close out the destination (to) page
44
45       numbytes+=remaintobytes;
46       unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
47
48       for(int core = 0; core < NUMCORESACTIVE; core++) {
49         (*totable)=(*totable+page64th*(*origtable))>>6;
50         totable++;
51         origtable++;
52       }
53       toptr+=remaintobytes;
54       origptr+=remaintobytes;
55       bytesneeded-=remaintobytes;
56       topage++;//to page is definitely done
57       tobound+=BAMBOO_PAGE_SIZE;
58       origpage=(origptr-gcbaseva)>>BAMBOO_PAGE_SIZE_BITS;//handle exact match case
59       origbound=(void *)(((unsigned int)origptr&~(BAMBOO_PAGE_SIZE-1))+BAMBOO_PAGE_SIZE);
60     } else {
61       // finishing off the orig (source) page
62
63       numbytes+=remainorigbytes;
64       unsigned int page64th=numbytes>>(BAMBOO_PAGE_SIZE_BITS-6);
65       
66       for(int core = 0; core < NUMCORESACTIVE; core++) {
67         (*totable)+=page64th*(*origtable);
68         totable++;
69         origtable++;
70       }
71       toptr+=remainorigbytes;
72       origptr+=remainorigbytes;
73       bytesneeded-=remainorigbytes;
74       origpage++;//just orig page is done
75       origbound+=BAMBOO_PAGE_SIZE;
76     }
77     totable=&gccachesamplingtbl_r[topage*NUMCORESACTIVE];
78     origtable=&gccachesamplingtbl[origpage*NUMCORESACTIVE];
79     
80     remaintobytes=tobound-toptr;
81     remainorigbytes=origbound-origptr;
82     
83     numbytes=0;
84   } while(bytesneeded!=0);
85 }
86
87 // prepare for cache adaptation:
88 //   -- flush the shared heap
89 //   -- clean dtlb entries
90 //   -- change cache strategy
91 void cacheAdapt_gc(bool isgccachestage) {
92   // flush the shared heap
93   BAMBOO_CACHE_FLUSH_L2();
94
95   // clean the dtlb entries
96   BAMBOO_CLEAN_DTLB();
97
98   if(isgccachestage) {
99     bamboo_install_dtlb_handler_for_gc();
100   } else {
101     bamboo_install_dtlb_handler_for_mutator();
102   }
103 }
104
105 // the master core decides how to adapt the cache strategy for the mutator
106 // according to the collected statistics
107
108 // find the core that accesses the page #page_index the most
109 #define CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq) \
110   { \
111     unsigned int *local_tbl=&gccachesamplingtbl_r[page_index*NUMCORESACTIVE];   \
112     for(int i = 0; i < NUMCORESACTIVE; i++) { \
113       int freq = *local_tbl; \
114       local_tbl++; \
115       if(hotfreq < freq) { \
116         hotfreq = freq; \
117         hottestcore = i; \
118       } \
119     } \
120   }
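// Note: hottestcore and hotfreq are in/out arguments; callers must initialize
// them before invoking the macro (the policy functions below start from 0).
// Minimal usage sketch, kept out of the build on purpose; "some_page_index"
// is a placeholder:
#if 0
{
  unsigned int hottestcore = 0;
  unsigned int hotfreq = 0;
  CACHEADAPT_FIND_HOTTEST_CORE(some_page_index, hottestcore, hotfreq);
  // hotfreq == 0 afterwards means no core sampled an access to that page
}
#endif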
121 // find the core that accesses the page #page_index the most and compute
122 // the total access count of the page at the same time
123 #define CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq) \
124   { \
125     unsigned int *local_tbl=&gccachesamplingtbl_r[page_index*NUMCORESACTIVE];   \
126     for(int i = 0; i < NUMCORESACTIVE; i++) { \
127       int freq = *local_tbl; \
128       local_tbl++; \
129       totalfreq += freq; \
130       if(hotfreq < freq) { \
131         hotfreq = freq; \
132         hottestcore = i; \
133       } \
134     } \
135   }
136 // Set the policy as hosted by coren
137 // NOTE: the (x,y) coordinates must be stored as (x+1, y+1)!
138 #define CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren) \
139   { \
140     (policy).cache_mode = BAMBOO_CACHE_MODE_COORDS; \    
141     (policy).lotar_x = bamboo_cpu2coords[2*(coren)]+1; \
142     (policy).lotar_y = bamboo_cpu2coords[2*(coren)+1]+1; \
143   }
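// Example of the coordinate translation (hypothetical values): if
// bamboo_cpu2coords[2*coren] == 1 and bamboo_cpu2coords[2*coren+1] == 2,
// the page is homed at lotar coordinates (2, 3).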
144 // store the new policy information at tmp_p in gccachepolicytbl
145 #define CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy) \
146   { \
147     ((int*)(tmp_p))[page_index] = (policy).word; \
148   }
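// Because the macro indexes from the table base by the absolute page_index,
// the tmp_p cursor in the policy functions below never needs to be advanced.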
149
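// All cacheAdapt_policy_* functions below split the page range evenly across
// the active cores, with the last core also taking the remainder.  For
// illustration (hypothetical sizes): page_num == 1000 and NUMCORESACTIVE == 62
// give page_gap == 16, so core 0 handles pages 0-15 and core 61 handles
// pages 976-999.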
150 // make all pages hash-for-home (h4h)
151 void cacheAdapt_policy_h4h(int coren){
152   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
153   unsigned int page_gap=page_num/NUMCORESACTIVE;
154   unsigned int page_index=page_gap*coren;
155   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
156   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
157   unsigned int * tmp_p = gccachepolicytbl;
158   for(; page_index < page_index_end; page_index++) {
159     bamboo_cache_policy_t policy = {0};
160     policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
161     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
162     page_sva += BAMBOO_PAGE_SIZE;
163   }
164 }
165
166 // home each page on the core that owns its block (the non-cache-adaptive local mode)
167 void cacheAdapt_policy_local(int coren){
168   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
169   unsigned int page_gap=page_num/NUMCORESACTIVE;
170   unsigned int page_index=page_gap*coren;
171   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
172   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
173   unsigned int * tmp_p = gccachepolicytbl;
174   for(; page_index < page_index_end; page_index++) {
175     bamboo_cache_policy_t policy = {0};
176     unsigned int block = 0;
177     BLOCKINDEX(block, (void *) page_sva);
178     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
179     CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);
180     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
181     page_sva += BAMBOO_PAGE_SIZE;
182   }
183 }
184
185 void cacheAdapt_policy_hottest(int coren){
186   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
187   unsigned int page_gap=page_num/NUMCORESACTIVE;
188   unsigned int page_index=page_gap*coren;
189   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
190   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
191   unsigned int * tmp_p = gccachepolicytbl;
192   for(; page_index < page_index_end; page_index++) {
193     bamboo_cache_policy_t policy = {0};
194     unsigned int hottestcore = 0;
195     unsigned int hotfreq = 0;
196     CACHEADAPT_FIND_HOTTEST_CORE(page_index,hottestcore,hotfreq);
197     // TODO
198     // Decide the cache strategy for this page.
199     // If a new cache strategy is adopted, write it into the shared block of
200     // the gcsharedsamplingtbl; the memory already reserved for the sampling
201     // data is large enough to hold this information.
202     // Format: page start va + cache strategy (hfh / host core [x,y])
203     if(hotfreq != 0) {
204       // locally cache the page in the hottest core
205       CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
206     }
207     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
208     page_sva += BAMBOO_PAGE_SIZE;
209   }
210 }
211
212 #define GC_CACHE_ADAPT_DOMINATE_THRESHOLD  1
213 // cache the page on the core that accesses it the most, provided that core
214 // accounts for at least 1/2^GC_CACHE_ADAPT_DOMINATE_THRESHOLD (here, half) of
215 // the page's total accesses.  Otherwise, h4h the page.
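// With the threshold of 1, totalfreq is halved, so the hottest core must
// account for at least half of the page's accesses to win it.  For example
// (hypothetical counts): totalfreq == 100 with hotfreq == 60 homes the page on
// the hottest core, while hotfreq == 40 falls back to hash-for-home.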
216 void cacheAdapt_policy_dominate(int coren){
217   unsigned int page_num=(BAMBOO_SHARED_MEM_SIZE)>>(BAMBOO_PAGE_SIZE_BITS);
218   unsigned int page_gap=page_num/NUMCORESACTIVE;
219   unsigned int page_index=page_gap*coren;
220   unsigned int page_index_end=(coren==NUMCORESACTIVE-1)?page_num:(page_index+page_gap);
221   VA page_sva = gcbaseva+(BAMBOO_PAGE_SIZE)*page_index;
222   unsigned int * tmp_p = gccachepolicytbl;
223   for(; page_index < page_index_end; page_index++) {
224     bamboo_cache_policy_t policy = {0};
225     unsigned int hottestcore = 0;
226     unsigned int totalfreq = 0;
227     unsigned int hotfreq = 0;
228     CACHEADAPT_FIND_HOTTEST_CORE_W_TOTALFREQ(page_index,hottestcore,hotfreq,totalfreq);
229     // Decide the cache strategy for this page.
230     // If a new policy is chosen, write it into the shared block of
231     // the gccachepolicytbl.
232     // Format: page start va + cache policy
233     if(hotfreq != 0) {
234       totalfreq=totalfreq>>GC_CACHE_ADAPT_DOMINATE_THRESHOLD;
235       if((unsigned int)hotfreq < (unsigned int)totalfreq) {
236         // use hfh
237         policy.cache_mode = BAMBOO_CACHE_MODE_HASH;
238         /*unsigned int block = 0;
239         BLOCKINDEX(block, (void *) page_sva);
240         unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
241         CACHEADAPT_POLICY_SET_HOST_CORE(policy, coren);*/
242       } else {
243         // locally cache the page in the hottest core
244         CACHEADAPT_POLICY_SET_HOST_CORE(policy, hottestcore);
245       }     
246     }
247     CACHEADAPT_CHANGE_POLICY_4_PAGE(tmp_p,page_index,policy);
248     page_sva += BAMBOO_PAGE_SIZE;
249   }
250 }
251
252 void cacheAdapt_decision(int coren) {
253   BAMBOO_CACHE_MF();
254   // check the statistic data
255   // for each page, decide the new cache strategy
256 #ifdef GC_CACHE_ADAPT_POLICY1
257   cacheAdapt_policy_h4h(coren);
258 #elif defined GC_CACHE_ADAPT_POLICY2
259   cacheAdapt_policy_local(coren);
260 #elif defined GC_CACHE_ADAPT_POLICY3
261   cacheAdapt_policy_hottest(coren);
262 #elif defined GC_CACHE_ADAPT_POLICY4
263   cacheAdapt_policy_dominate(coren);
264 #endif
265 }
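// Which policy runs is selected at compile time by defining exactly one of
// GC_CACHE_ADAPT_POLICY1..4 (presumably passed as -D flags by the build).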
266
267 // adapt the cache strategy for the mutator
268 void cacheAdapt_mutator() {
269   BAMBOO_CACHE_MF();
270   // check the changes and adapt them
271   unsigned int * tmp_p = gccachepolicytbl;
272   unsigned int page_sva = gcbaseva;
273   for(; page_sva<gctopva; page_sva+=BAMBOO_PAGE_SIZE) {
274     // read out the policy
275     bamboo_cache_policy_t policy = (bamboo_cache_policy_t)(*(tmp_p));
276     // adapt the policy
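    // a zero policy word means no new policy was recorded for this page (see
    // the {0} initializers in the cacheAdapt_policy_* functions), so the
    // current mapping is left untouched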
277     if(policy.word != 0) {
278       bamboo_adapt_cache_policy(page_sva,policy,BAMBOO_PAGE_SIZE);
279     }
280     tmp_p += 1;
281   }
282 }
283
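// Cache-policy handshake implemented below: the master raises CACHEPOLICYPHASE
// and every core runs cacheAdapt_decision() on its share of pages, clients
// acknowledging with GCFINISHCACHEPOLICY; the master then raises
// PREFINISHPHASE and every core applies the new per-page policies
// (cacheAdapt_mutator) and switches caches/DTLB back to mutator mode
// (cacheAdapt_gc(false)), clients acknowledging with GCFINISHPREF.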
284 // Cache adapt phase process for clients
285 void cacheAdapt_phase_client() {
286   WAITFORGCPHASE(CACHEPOLICYPHASE);
287   GC_PRINTF("Start cachepolicy phase\n");
288   cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
289   // notify the coordinator core that the cachepolicy phase is finished
290   send_msg_2(STARTUPCORE, GCFINISHCACHEPOLICY, BAMBOO_NUM_OF_CORE);
291   GC_PRINTF("Finish cachepolicy phase\n");
292
293   WAITFORGCPHASE(PREFINISHPHASE);
294   GC_PRINTF("Start prefinish phase\n");
295   // cache adapt phase
296   cacheAdapt_mutator();
297   cacheAdapt_gc(false);
298   // notify the coordinator core that the prefinish phase is finished
299   send_msg_2(STARTUPCORE, GCFINISHPREF, BAMBOO_NUM_OF_CORE);
300   GC_PRINTF("Finish prefinish phase\n");
301   CACHEADAPT_SAMPLING_RESET();
302   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
303     // zero out this core's local sampling tables
304     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);  
305     BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,size_cachesamplingtbl_local_r);
306   }
307 }
308
309 extern unsigned long long gc_output_cache_policy_time;
310
311 // Cache adapt phase process for the master
312 void cacheAdapt_phase_master() {
313   GCPROFILE_ITEM();
314   unsigned long long tmpt = BAMBOO_GET_EXE_TIME();
315   CACHEADAPT_OUTPUT_CACHE_SAMPLING_R();
316   gc_output_cache_policy_time += (BAMBOO_GET_EXE_TIME()-tmpt);
317   // let all cores process the revised profile data in parallel and decide
318   // the cache policy for each page
319   gc_status_info.gcphase = CACHEPOLICYPHASE;
320   GC_SEND_MSG_1_TO_CLIENT(GCSTARTCACHEPOLICY);
321   GC_PRINTF("Start cachepolicy phase \n");
322   // cache adapt phase
323   cacheAdapt_decision(BAMBOO_NUM_OF_CORE);
324   GC_CHECK_ALL_CORE_STATUS();
325   BAMBOO_CACHE_MF();
326
327   // let all cores adopt the new policies
328   gc_status_info.gcphase = PREFINISHPHASE;
329   // Note: all cores, including non-gc cores, should flush their runtime data
330   GC_SEND_MSG_1_TO_CLIENT(GCSTARTPREF);
331   GC_PRINTF("Start prefinish phase \n");
332   // cache adapt phase
333   cacheAdapt_mutator();
334   cacheAdapt_gc(false);
335   GC_CHECK_ALL_CORE_STATUS();
336   
337   CACHEADAPT_SAMPLING_RESET();
338   if(BAMBOO_NUM_OF_CORE < NUMCORESACTIVE) {
339     // zero out this core's local sampling tables and the shared policy table
340     BAMBOO_MEMSET_WH(gccachesamplingtbl_local,0,size_cachesamplingtbl_local);
341     BAMBOO_MEMSET_WH(gccachesamplingtbl_local_r,0,size_cachesamplingtbl_local_r);
342     BAMBOO_MEMSET_WH(gccachepolicytbl,0,size_cachepolicytbl);
343   }
344 }
345
346 // output original cache sampling data for each page
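// Each printed line is: page start va (hex), page index, the page's home core,
// followed by one access count per active core; pages nobody accessed are
// skipped.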
347 void gc_output_cache_sampling() {
348   extern volatile bool gc_profile_flag;
349   if(!gc_profile_flag) return;
350   unsigned int page_index = 0;
351   VA page_sva = 0;
352   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) >> (BAMBOO_PAGE_SIZE_BITS);
353   for(page_index = 0; page_index < page_num; page_index++) {
354     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
355     unsigned int block = 0;
356     BLOCKINDEX(block, (void *) page_sva);
357     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
358     //printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
359     unsigned int * local_tbl = &gccachesamplingtbl[page_index*NUMCORESACTIVE];
360     int accesscore = 0;
361     for(int i = 0; i < NUMCORESACTIVE; i++) {
362       int freq = *local_tbl;
363       local_tbl++;
364       if(freq != 0) {
365         accesscore++;
366         //printf("%d,  ", freq);
367       }
368     }
369     if(accesscore!=0) {
370       printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
371       unsigned int * local_tbl = &gccachesamplingtbl[page_index*NUMCORESACTIVE];
372       for(int i = 0; i < NUMCORESACTIVE; i++) {
373         int freq = *local_tbl;
374         local_tbl++;
375         printf("%d,  ", freq);
376       }
377       printf("\n");
378     }
379     //printf("\n");
380   }
381   printf("=================\n");
382 }
383
384 // output revised cache sampling data for each page after compaction
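// Per-page lines use the same format as gc_output_cache_sampling() above; a
// summary matrix is printed at the end (one row per core, one column per
// page-sharing degree).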
385 void gc_output_cache_sampling_r() {
386   extern volatile bool gc_profile_flag;
387   if(!gc_profile_flag) return;
388   // summary data: sumdata[k][i] sums core i's access counts over pages touched by exactly k+1 cores
389   unsigned int sumdata[NUMCORESACTIVE][NUMCORESACTIVE];
390   for(int i = 0; i < NUMCORESACTIVE; i++) {
391     for(int j = 0; j < NUMCORESACTIVE; j++) {
392       sumdata[i][j] = 0;
393     }
394   }
395   tprintf("cache sampling_r \n");
396   unsigned int page_index = 0;
397   VA page_sva = 0;
398   unsigned int page_num = (BAMBOO_SHARED_MEM_SIZE) >> (BAMBOO_PAGE_SIZE_BITS);
399   for(page_index = 0; page_index < page_num; page_index++) {
400     page_sva = gcbaseva + (BAMBOO_PAGE_SIZE) * page_index;
401     unsigned int block = 0;
402     BLOCKINDEX(block, (void *)page_sva);
403     unsigned int coren = gc_block2core[block%(NUMCORES4GC*2)];
404     //printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
405     int accesscore = 0; // number of cores that accessed this page
406     unsigned int * local_tbl = &gccachesamplingtbl_r[page_index*NUMCORESACTIVE];
407     for(int i = 0; i < NUMCORESACTIVE; i++) {
408       int freq = *local_tbl; 
409       //printf("%d,  ", freq);
410       if(freq != 0) {
411         accesscore++;
412       }
413       local_tbl++;
414     }
415     if(accesscore!=0) {
416       printf("%x,  %d,  %d,  ",(int)page_sva,page_index,coren);
417       unsigned int * local_tbl = &gccachesamplingtbl_r[page_index*NUMCORESACTIVE];
418       for(int i = 0; i < NUMCORESACTIVE; i++) {
419         int freq = *local_tbl;
420         printf("%d,  ", freq);
421         sumdata[accesscore-1][i]+=freq;
422         local_tbl++;
423       }
424       printf("\n");
425     }  
426     //printf("\n");
427   }
428   printf("+++++\n");
429   // print the summary data: one row per core, one column per page-sharing degree
430   for(int i = 0; i < NUMCORESACTIVE; i++) {
431     printf("%d  ", i);
432     for(int j = 0; j < NUMCORESACTIVE; j++) {
433       printf(" %d  ", sumdata[j][i]);
434     }
435     printf("\n");
436   }
437   printf("=================\n");
438 }
439 #endif // GC_CACHE_ADAPT