improvements to PAPI support
[ghc.git] / rts / Papi.c
1 /* -----------------------------------------------------------------------------
2 * (c) The GHC Team 2006
3 *
4 * Initialization and use of the PAPI performance monitoring library
5 *
6 *
7 * To add events or processor-specific counters, modify
8 *
9 * init_countable_events
10 * papi_report
11 *
12 * ---------------------------------------------------------------------------*/
13
14
15 #ifdef USE_PAPI /* ugly */
16
17 #include <papi.h>
18
19 #include "Papi.h"
20 #include "Rts.h"
21 #include "RtsUtils.h"
22 #include "Stats.h"
23 #include "RtsFlags.h"
24 #include "OSThreads.h"
25
/* Lock taken around every update of the aggregated counters
 * (only declared in the threaded RTS). */
#ifdef THREADED_RTS
static Mutex papi_counter_mutex;
#endif

/* One countable event: a PAPI event code paired with its printable name. */
struct _papi_events {
    int         event_code;   /* code handed to PAPI_add_event */
    const char *event_name;   /* label printed in the report */
};
35
/* Native event codes.
 *
 * Beware: these values are Opteron specific.  They were obtained with the
 * papi_avail and papi_native_avail utilities, which is certainly not the
 * official PAPI way of doing things.
 */
#define FR_BR                               0x40000040
#define FR_BR_MIS                           0x40000041
#define FR_BR_MISCOMPARE                    0x40000048
#define DC_ACCESS                           0x40000019
#define DC_MISS                             0x4000001a
#define FR_DISPATCH_STALLS                  0x40000054
#define FR_DISPATCH_STALLS_BR               0x40000055
#define FR_DISPATCH_STALLS_FULL_REORDER     0x40000058
#define FR_DISPATCH_STALLS_FULL_RESERVATION 0x40000059
#define FR_DISPATCH_STALLS_FULL_LS          0x4000005b
#define DC_L2_REFILL_MOES                   0x40001e1b
#define DC_SYS_REFILL_MOES                  0x40001e1c

/* Size of the scratch buffer used when formatting counter values.
 * This is bad: it should really live in a header. */
#define BIG_STRING_LEN 512
57
58
/* Run a PAPI call, remember its result in the global papi_error and, when the
 * result is not PAPI_OK, print a debug message giving file, line and error
 * code.  Execution carries on after a failure.
 *
 * NOTE: this expands to a bare `if` statement rather than do { } while (0),
 * so take care when using it as the body of an unbraced if/else.
 */
#define PAPI_CHECK(CALL)                                                      \
    if (PAPI_OK != (papi_error = (CALL))) {                                   \
        debugBelch("PAPI function failed in module %s at line %d with error code %d\n", \
                   __FILE__, __LINE__, papi_error);                           \
    }
64
65 /* While PAPI reporting is going on this flag is on */
66 int papi_is_reporting;
67
68 /* Event sets and counter arrays for GC and mutator */
69
70 int MutatorEvents = PAPI_NULL;
71 int GCEvents = PAPI_NULL;
72
73 int papi_error;
74
75 /* Arbitrary, to avoid using malloc */
76 #define MAX_PAPI_EVENTS 10
77
78 static nat n_papi_events = 0;
79
80
81 /* Events counted during GC and Mutator execution */
82 /* There's a trailing comma, do all C compilers accept that? */
83 static struct _papi_events papi_events[MAX_PAPI_EVENTS];
84 long_long MutatorCounters[MAX_PAPI_EVENTS];
85 long_long GC0Counters[MAX_PAPI_EVENTS];
86 long_long GC1Counters[MAX_PAPI_EVENTS];
87
88 long_long start_mutator_cycles;
89 long_long mutator_cycles;
90 long_long start_gc_cycles;
91 long_long gc0_cycles;
92 long_long gc1_cycles;
93
94
95
96 static long_long papi_counter(long_long values[],int event);
97 static void papi_add_events(int EventSet);
98
99 static nat max_hardware_counters = 2;
100
101 /* If you want to add events to count, extend the
102 * init_countable_events and the papi_report function.
103 * Be aware that your processor can count a limited number
104 * of events simultaneously, you can turn on multiplexing
105 * to increase that number, though.
106 */
107 static void papi_add_event(const char *name, int code)
108 {
109 if (n_papi_events >= max_hardware_counters) {
110 errorBelch("too many PAPI events for this CPU (max: %d)",
111 max_hardware_counters);
112 stg_exit(EXIT_FAILURE);
113 }
114 papi_events[n_papi_events].event_code = code;
115 papi_events[n_papi_events].event_name = name;
116 n_papi_events++;
117 }
118
119 static void
120 init_countable_events(void)
121 {
122 max_hardware_counters = PAPI_num_counters();
123
124 #define PAPI_ADD_EVENT(EVENT) papi_add_event(#EVENT,EVENT)
125
126 if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
127 PAPI_ADD_EVENT(FR_BR);
128 PAPI_ADD_EVENT(FR_BR_MIS);
129 /* Docs are wrong? Opteron does not count indirect branch misses exclusively */
130 PAPI_ADD_EVENT(FR_BR_MISCOMPARE);
131 } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_STALLS) {
132 PAPI_ADD_EVENT(FR_DISPATCH_STALLS);
133 PAPI_ADD_EVENT(FR_DISPATCH_STALLS_BR);
134 PAPI_ADD_EVENT(FR_DISPATCH_STALLS_FULL_LS);
135 } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
136 PAPI_ADD_EVENT(PAPI_L1_DCA);
137 PAPI_ADD_EVENT(PAPI_L1_DCM);
138 } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
139 PAPI_ADD_EVENT(PAPI_L2_DCA);
140 PAPI_ADD_EVENT(PAPI_L2_DCM);
141 } else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CB_EVENTS) {
142 PAPI_ADD_EVENT(DC_L2_REFILL_MOES);
143 PAPI_ADD_EVENT(DC_SYS_REFILL_MOES);
144 PAPI_ADD_EVENT(FR_BR_MIS);
145 } else if (RtsFlags.PapiFlags.eventType==PAPI_USER_EVENTS) {
146 nat i;
147 char *name;
148 int code;
149 for (i = 0; i < RtsFlags.PapiFlags.numUserEvents; i++) {
150 name = RtsFlags.PapiFlags.userEvents[i];
151 PAPI_CHECK(PAPI_event_name_to_code(name, &code))
152 papi_add_event(name, code);
153 }
154 } else {
155 // PAPI_ADD_EVENT(PAPI_L1_DCA); // L1 data cache accesses
156 // PAPI_ADD_EVENT(PAPI_L1_ICR); // L1 instruction cache reads
157 // PAPI_ADD_EVENT(PAPI_L1_ICM); // L1 instruction cache misses
158 // PAPI_ADD_EVENT(PAPI_L1_STM); // L1 store misses
159 // PAPI_ADD_EVENT(PAPI_L1_DCM); // L1 data cache misses
160 // PAPI_ADD_EVENT(PAPI_L1_LDM); // L1 load misses
161 // PAPI_ADD_EVENT(PAPI_L2_TCM); // L2 cache misses
162 // PAPI_ADD_EVENT(PAPI_L2_STM); // L2 store misses
163 // PAPI_ADD_EVENT(PAPI_L2_DCW); // L2 data cache writes
164 // PAPI_ADD_EVENT(PAPI_L2_DCR); // L2 data cache reads
165 // PAPI_ADD_EVENT(PAPI_L2_TCW); // L2 cache writes
166 // PAPI_ADD_EVENT(PAPI_L2_TCR); // L2 cache reads
167 // PAPI_ADD_EVENT(PAPI_CA_CLN); // exclusive access to clean cache line
168 // PAPI_ADD_EVENT(PAPI_TLB_DM); // TLB misses
169 PAPI_ADD_EVENT(PAPI_TOT_INS); // Total instructions
170 PAPI_ADD_EVENT(PAPI_TOT_CYC); // Total instructions
171 // PAPI_ADD_EVENT(PAPI_CA_SHR); // exclusive access to shared cache line
172 // PAPI_ADD_EVENT(PAPI_RES_STL); // Cycles stalled on any resource
173
174 }
175
176 // We might also consider:
177 // PAPI_BR_MSP Conditional branch instructions mispredicted
178 // PAPI_RES_STL Cycles stalled on any resource
179 };
180
181
182 static void
183 papi_report_event(const char *name, ullong value)
184 {
185 static char temp[BIG_STRING_LEN];
186 ullong_format_string(value,temp,rtsTrue/*commas*/);
187 statsPrintf(" %15s %15s\n", name, temp);
188 }
189
190 /* This function reports counters for GC and mutator */
191 static void
192 papi_report(long_long counters[])
193 {
194 nat i;
195
196 /* Report the value of a counter as a percentage of another counter */
197 #define PAPI_REPORT_PCT(EVENTSET,EVENT,EVENTTOT) \
198 statsPrintf(" " #EVENT " %% of " #EVENTTOT " : %.1f%%\n", \
199 papi_counter(EVENTSET,EVENT)*100.0/papi_counter(EVENTSET,EVENTTOT))
200
201 for (i = 0; i < n_papi_events; i++)
202 {
203 papi_report_event(papi_events[i].event_name, counters[i]);
204 }
205
206 if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_BRANCH) {
207 PAPI_REPORT_PCT(counters,FR_BR_MIS,FR_BR);
208 PAPI_REPORT_PCT(counters,FR_BR_MISCOMPARE,FR_BR);
209 }
210
211 else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L1) {
212 PAPI_REPORT_PCT(counters,PAPI_L1_DCM,PAPI_L1_DCA);
213 }
214
215 else if (RtsFlags.PapiFlags.eventType==PAPI_FLAG_CACHE_L2) {
216 PAPI_REPORT_PCT(counters,PAPI_L2_DCM,PAPI_L2_DCA);
217 }
218 }
219
220 void
221 papi_stats_report (void)
222 {
223 statsPrintf(" Mutator CPU counters\n");
224 papi_report_event("CYCLES", mutator_cycles);
225 papi_report(MutatorCounters);
226
227 statsPrintf("\n GC(0) CPU counters\n");
228 papi_report_event("CYCLES", gc0_cycles);
229 papi_report(GC0Counters);
230
231 statsPrintf("\n GC(1) CPU counters\n");
232 papi_report_event("CYCLES", gc1_cycles);
233 papi_report(GC1Counters);
234 }
235
/* Register the calling thread with PAPI, create a new event set in
 * *event_set and add all registered events to it.
 *
 *   event_set - receives the handle of the newly created event set
 */
void
papi_init_eventset (int *event_set)
{
    PAPI_register_thread();

    PAPI_CHECK(PAPI_create_eventset(event_set));

    papi_add_events(*event_set);
}
243
244 void
245 papi_init (void)
246 {
247 /* Initialise the performance tracking library */
248 int ver;
249 if ((ver = PAPI_library_init(PAPI_VER_CURRENT)) != PAPI_VER_CURRENT) {
250 if (ver > 0) {
251 errorBelch("PAPI_library_init: wrong version: %x", ver);
252 stg_exit(EXIT_FAILURE);
253 } else {
254 sysErrorBelch("PAPI_library_init");
255 stg_exit(EXIT_FAILURE);
256 }
257 }
258
259 #ifdef THREADED_RTS
260 {
261 int err;
262 if ((err = PAPI_thread_init(osThreadId)) < 0) {
263 barf("PAPI_thread_init: %d",err);
264 }
265
266 initMutex(&papi_counter_mutex);
267 }
268 #endif
269
270 init_countable_events();
271
272 papi_init_eventset(&MutatorEvents);
273 papi_init_eventset(&GCEvents);
274 }
275
276 /* Extract the value corresponding to an event */
277 static long_long
278 papi_counter(long_long values[],int event)
279 {
280 nat i;
281 for(i=0;i<n_papi_events;i++) {
282 if(papi_events[i].event_code==event) {
283 return values[i];
284 }
285 }
286 /* Passed a wrong event? */
287 debugBelch("Event %d is not part of event set\n",event);
288 return 0;
289 }
290
291 /* Add the events of papi_events into an event set */
292 static void
293 papi_add_events(int EventSet)
294 {
295 nat i;
296 for(i=0;i<n_papi_events;i++) {
297 if((papi_error=PAPI_add_event(EventSet,
298 papi_events[i].event_code))
299 != PAPI_OK)
300 debugBelch("Failed adding %s to event set with error code %d\n",
301 papi_events[i].event_name,papi_error);
302 }
303 }
304
/* Source of the cycle counts used for the *_cycles totals.
 *
 * We should be using elapsed cycles, to be consistent with the time metric
 * chosen in Stats.c (elapsed time); what is used here is PAPI's virtual
 * cycle count, an approximation of the cycles the program spends.
 * Note that the event counters, in contrast, are virtual and user space.
 */
#define PAPI_cycles PAPI_get_virt_cyc
311
312 void
313 papi_start_mutator_count(void)
314 {
315 ACQUIRE_LOCK(&papi_counter_mutex);
316 PAPI_CHECK( PAPI_start(MutatorEvents));
317 start_mutator_cycles = PAPI_cycles();
318 RELEASE_LOCK(&papi_counter_mutex);
319 }
320
321 void
322 papi_stop_mutator_count(void)
323 {
324 ACQUIRE_LOCK(&papi_counter_mutex);
325 mutator_cycles += PAPI_cycles() - start_mutator_cycles;
326 PAPI_CHECK( PAPI_accum(MutatorEvents,MutatorCounters));
327 PAPI_CHECK( PAPI_stop(MutatorEvents,NULL));
328 RELEASE_LOCK(&papi_counter_mutex);
329 }
330
331 void
332 papi_start_gc_count(void)
333 {
334 ACQUIRE_LOCK(&papi_counter_mutex);
335 PAPI_CHECK( PAPI_start(GCEvents));
336 start_gc_cycles = PAPI_cycles();
337 RELEASE_LOCK(&papi_counter_mutex);
338 }
339
340 void
341 papi_stop_gc0_count(void)
342 {
343 ACQUIRE_LOCK(&papi_counter_mutex);
344 PAPI_CHECK( PAPI_accum(GCEvents,GC0Counters));
345 PAPI_CHECK( PAPI_stop(GCEvents,NULL));
346 gc0_cycles += PAPI_cycles() - start_gc_cycles;
347 RELEASE_LOCK(&papi_counter_mutex);
348 }
349
350
351 void
352 papi_stop_gc1_count(void)
353 {
354 ACQUIRE_LOCK(&papi_counter_mutex);
355 PAPI_CHECK( PAPI_accum(GCEvents,GC1Counters));
356 PAPI_CHECK( PAPI_stop(GCEvents,NULL));
357 gc1_cycles += PAPI_cycles() - start_gc_cycles;
358 RELEASE_LOCK(&papi_counter_mutex);
359 }
360
361
362 void
363 papi_thread_start_gc1_count(int event_set)
364 {
365 ACQUIRE_LOCK(&papi_counter_mutex);
366 PAPI_CHECK( PAPI_start(event_set));
367 RELEASE_LOCK(&papi_counter_mutex);
368 }
369
370 void
371 papi_thread_stop_gc1_count(int event_set)
372 {
373 ACQUIRE_LOCK(&papi_counter_mutex);
374 PAPI_CHECK( PAPI_accum(event_set,GC1Counters));
375 PAPI_CHECK( PAPI_stop(event_set,NULL));
376 RELEASE_LOCK(&papi_counter_mutex);
377 }
378
379 #endif /* USE_PAPI */