Merge tag 'scsi-misc' of git://git.kernel.org/pub/scm/linux/kernel/git/jejb/scsi
[firefly-linux-kernel-4.4.55.git] / tools / perf / builtin-trace.c
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16
17 #include <libaudit.h>
18 #include <stdlib.h>
19 #include <sys/mman.h>
20 #include <linux/futex.h>
21
/*
 * For older distros: fallback values for constants their headers may lack.
 * Each one is defined only when the system headers did not already do so.
 * NOTE(review): the numbers look like the x86/asm-generic values; confirm
 * before relying on the fallbacks on other architectures.
 */

/* mmap(2) / madvise(2) */
#if !defined(MAP_STACK)
# define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
# define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
# define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
# define MADV_UNMERGEABLE	13
#endif

/* eventfd2(2) */
#if !defined(EFD_SEMAPHORE)
# define EFD_SEMAPHORE		1
#endif

#if !defined(EFD_NONBLOCK)
# define EFD_NONBLOCK		00004000
#endif

#if !defined(EFD_CLOEXEC)
# define EFD_CLOEXEC		02000000
#endif

/* open(2) */
#if !defined(O_CLOEXEC)
# define O_CLOEXEC		02000000
#endif

/* socket(2) / recvmsg(2) */
#if !defined(SOCK_DCCP)
# define SOCK_DCCP		6
#endif

#if !defined(SOCK_CLOEXEC)
# define SOCK_CLOEXEC		02000000
#endif

#if !defined(SOCK_NONBLOCK)
# define SOCK_NONBLOCK		00004000
#endif

#if !defined(MSG_CMSG_CLOEXEC)
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* perf_event_open(2) */
#if !defined(PERF_FLAG_FD_NO_GROUP)
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#if !defined(PERF_FLAG_FD_OUTPUT)
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#if !defined(PERF_FLAG_PID_CGROUP)
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#if !defined(PERF_FLAG_FD_CLOEXEC)
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif

86
87
88 struct tp_field {
89         int offset;
90         union {
91                 u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
92                 void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
93         };
94 };
95
96 #define TP_UINT_FIELD(bits) \
97 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
98 { \
99         u##bits value; \
100         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
101         return value;  \
102 }
103
104 TP_UINT_FIELD(8);
105 TP_UINT_FIELD(16);
106 TP_UINT_FIELD(32);
107 TP_UINT_FIELD(64);
108
109 #define TP_UINT_FIELD__SWAPPED(bits) \
110 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
111 { \
112         u##bits value; \
113         memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
114         return bswap_##bits(value);\
115 }
116
117 TP_UINT_FIELD__SWAPPED(16);
118 TP_UINT_FIELD__SWAPPED(32);
119 TP_UINT_FIELD__SWAPPED(64);
120
121 static int tp_field__init_uint(struct tp_field *field,
122                                struct format_field *format_field,
123                                bool needs_swap)
124 {
125         field->offset = format_field->offset;
126
127         switch (format_field->size) {
128         case 1:
129                 field->integer = tp_field__u8;
130                 break;
131         case 2:
132                 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
133                 break;
134         case 4:
135                 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
136                 break;
137         case 8:
138                 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
139                 break;
140         default:
141                 return -1;
142         }
143
144         return 0;
145 }
146
147 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
148 {
149         return sample->raw_data + field->offset;
150 }
151
152 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
153 {
154         field->offset = format_field->offset;
155         field->pointer = tp_field__ptr;
156         return 0;
157 }
158
159 struct syscall_tp {
160         struct tp_field id;
161         union {
162                 struct tp_field args, ret;
163         };
164 };
165
166 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
167                                           struct tp_field *field,
168                                           const char *name)
169 {
170         struct format_field *format_field = perf_evsel__field(evsel, name);
171
172         if (format_field == NULL)
173                 return -1;
174
175         return tp_field__init_uint(field, format_field, evsel->needs_swap);
176 }
177
/*
 * Initialise the syscall_tp member @name (stored in evsel->priv) as an
 * unsigned integer accessor for the tracepoint field of the same name.
 * Evaluates to the return value of perf_evsel__init_tp_uint_field().
 * @evsel is parenthesised so that any pointer expression can be passed; note
 * that it is still evaluated twice.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
181
/*
 * Look up tracepoint field @name in @evsel and initialise @field as a
 * pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (!fmt)
		return -1;

	return tp_field__init_ptr(field, fmt);
}
193
/*
 * Initialise the syscall_tp member @name (stored in evsel->priv) as a pointer
 * accessor for the tracepoint field of the same name.
 * Evaluates to the return value of perf_evsel__init_tp_ptr_field().
 * @evsel is parenthesised so that any pointer expression can be passed; note
 * that it is still evaluated twice.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
197
198 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
199 {
200         zfree(&evsel->priv);
201         perf_evsel__delete(evsel);
202 }
203
204 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
205 {
206         evsel->priv = malloc(sizeof(struct syscall_tp));
207         if (evsel->priv != NULL) {
208                 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
209                         goto out_delete;
210
211                 evsel->handler = handler;
212                 return 0;
213         }
214
215         return -ENOMEM;
216
217 out_delete:
218         zfree(&evsel->priv);
219         return -ENOENT;
220 }
221
/*
 * Create the evsel for the syscall tracepoint @direction ("sys_enter" or
 * "sys_exit" at the visible call sites) and initialise its syscall_tp data.
 *
 * Returns the new evsel, or NULL when neither tracepoint system provides the
 * event or its initialisation fails (in which case the evsel is deleted).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (!evsel)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (!evsel)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
241
/*
 * Read the syscall_tp integer member @name of @evsel from @sample.
 * @evsel is parenthesised so that any pointer expression can be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, sample); })
245
/*
 * Get a pointer to the syscall_tp member @name of @evsel inside @sample.
 * @evsel is parenthesised so that any pointer expression can be passed.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, sample); })
249
250 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
251                                           void *sys_enter_handler,
252                                           void *sys_exit_handler)
253 {
254         int ret = -1;
255         struct perf_evsel *sys_enter, *sys_exit;
256
257         sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
258         if (sys_enter == NULL)
259                 goto out;
260
261         if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
262                 goto out_delete_sys_enter;
263
264         sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
265         if (sys_exit == NULL)
266                 goto out_delete_sys_enter;
267
268         if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
269                 goto out_delete_sys_exit;
270
271         perf_evlist__add(evlist, sys_enter);
272         perf_evlist__add(evlist, sys_exit);
273
274         ret = 0;
275 out:
276         return ret;
277
278 out_delete_sys_exit:
279         perf_evsel__delete_priv(sys_exit);
280 out_delete_sys_enter:
281         perf_evsel__delete_priv(sys_enter);
282         goto out;
283 }
284
285
286 struct syscall_arg {
287         unsigned long val;
288         struct thread *thread;
289         struct trace  *trace;
290         void          *parm;
291         u8            idx;
292         u8            mask;
293 };
294
/*
 * A table mapping consecutive integer values to names.
 *
 * @offset:     value that maps to entries[0]
 * @nr_entries: number of slots in @entries
 * @entries:    the names, indexed by (value - offset)
 */
struct strarray {
	int	     offset;
	int	     nr_entries;
	const char **entries;
};
300
/*
 * Define strarray__<array> describing the string table <array>; the offset
 * is left at its zero default, so value N maps to array[N].
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
}
305
/*
 * Define strarray__<array> describing the string table <array>, where the
 * value <off> maps to array[0].
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
	.offset     = off, \
}
311
312 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
313                                                 const char *intfmt,
314                                                 struct syscall_arg *arg)
315 {
316         struct strarray *sa = arg->parm;
317         int idx = arg->val - sa->offset;
318
319         if (idx < 0 || idx >= sa->nr_entries)
320                 return scnprintf(bf, size, intfmt, arg->val);
321
322         return scnprintf(bf, size, "%s", sa->entries[idx]);
323 }
324
/* strarray formatter that prints values without a name in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
332
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 *        gets rewritten to support all arches.
 */
/* strarray formatter that prints values without a name in hexadecimal. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
346
347 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
348                                         struct syscall_arg *arg);
349
350 #define SCA_FD syscall_arg__scnprintf_fd
351
352 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
353                                            struct syscall_arg *arg)
354 {
355         int fd = arg->val;
356
357         if (fd == AT_FDCWD)
358                 return scnprintf(bf, size, "CWD");
359
360         return syscall_arg__scnprintf_fd(bf, size, arg);
361 }
362
363 #define SCA_FDAT syscall_arg__scnprintf_fd_at
364
365 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
366                                               struct syscall_arg *arg);
367
368 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
369
370 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
371                                          struct syscall_arg *arg)
372 {
373         return scnprintf(bf, size, "%#lx", arg->val);
374 }
375
376 #define SCA_HEX syscall_arg__scnprintf_hex
377
378 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
379                                          struct syscall_arg *arg)
380 {
381         return scnprintf(bf, size, "%d", arg->val);
382 }
383
384 #define SCA_INT syscall_arg__scnprintf_int
385
386 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
387                                                struct syscall_arg *arg)
388 {
389         int printed = 0, prot = arg->val;
390
391         if (prot == PROT_NONE)
392                 return scnprintf(bf, size, "NONE");
393 #define P_MMAP_PROT(n) \
394         if (prot & PROT_##n) { \
395                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
396                 prot &= ~PROT_##n; \
397         }
398
399         P_MMAP_PROT(EXEC);
400         P_MMAP_PROT(READ);
401         P_MMAP_PROT(WRITE);
402 #ifdef PROT_SEM
403         P_MMAP_PROT(SEM);
404 #endif
405         P_MMAP_PROT(GROWSDOWN);
406         P_MMAP_PROT(GROWSUP);
407 #undef P_MMAP_PROT
408
409         if (prot)
410                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
411
412         return printed;
413 }
414
415 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
416
417 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
418                                                 struct syscall_arg *arg)
419 {
420         int printed = 0, flags = arg->val;
421
422 #define P_MMAP_FLAG(n) \
423         if (flags & MAP_##n) { \
424                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
425                 flags &= ~MAP_##n; \
426         }
427
428         P_MMAP_FLAG(SHARED);
429         P_MMAP_FLAG(PRIVATE);
430 #ifdef MAP_32BIT
431         P_MMAP_FLAG(32BIT);
432 #endif
433         P_MMAP_FLAG(ANONYMOUS);
434         P_MMAP_FLAG(DENYWRITE);
435         P_MMAP_FLAG(EXECUTABLE);
436         P_MMAP_FLAG(FILE);
437         P_MMAP_FLAG(FIXED);
438         P_MMAP_FLAG(GROWSDOWN);
439 #ifdef MAP_HUGETLB
440         P_MMAP_FLAG(HUGETLB);
441 #endif
442         P_MMAP_FLAG(LOCKED);
443         P_MMAP_FLAG(NONBLOCK);
444         P_MMAP_FLAG(NORESERVE);
445         P_MMAP_FLAG(POPULATE);
446         P_MMAP_FLAG(STACK);
447 #ifdef MAP_UNINITIALIZED
448         P_MMAP_FLAG(UNINITIALIZED);
449 #endif
450 #undef P_MMAP_FLAG
451
452         if (flags)
453                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
454
455         return printed;
456 }
457
458 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
459
460 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
461                                                   struct syscall_arg *arg)
462 {
463         int printed = 0, flags = arg->val;
464
465 #define P_MREMAP_FLAG(n) \
466         if (flags & MREMAP_##n) { \
467                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
468                 flags &= ~MREMAP_##n; \
469         }
470
471         P_MREMAP_FLAG(MAYMOVE);
472 #ifdef MREMAP_FIXED
473         P_MREMAP_FLAG(FIXED);
474 #endif
475 #undef P_MREMAP_FLAG
476
477         if (flags)
478                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
479
480         return printed;
481 }
482
483 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
484
485 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
486                                                       struct syscall_arg *arg)
487 {
488         int behavior = arg->val;
489
490         switch (behavior) {
491 #define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
492         P_MADV_BHV(NORMAL);
493         P_MADV_BHV(RANDOM);
494         P_MADV_BHV(SEQUENTIAL);
495         P_MADV_BHV(WILLNEED);
496         P_MADV_BHV(DONTNEED);
497         P_MADV_BHV(REMOVE);
498         P_MADV_BHV(DONTFORK);
499         P_MADV_BHV(DOFORK);
500         P_MADV_BHV(HWPOISON);
501 #ifdef MADV_SOFT_OFFLINE
502         P_MADV_BHV(SOFT_OFFLINE);
503 #endif
504         P_MADV_BHV(MERGEABLE);
505         P_MADV_BHV(UNMERGEABLE);
506 #ifdef MADV_HUGEPAGE
507         P_MADV_BHV(HUGEPAGE);
508 #endif
509 #ifdef MADV_NOHUGEPAGE
510         P_MADV_BHV(NOHUGEPAGE);
511 #endif
512 #ifdef MADV_DONTDUMP
513         P_MADV_BHV(DONTDUMP);
514 #endif
515 #ifdef MADV_DODUMP
516         P_MADV_BHV(DODUMP);
517 #endif
518 #undef P_MADV_PHV
519         default: break;
520         }
521
522         return scnprintf(bf, size, "%#x", behavior);
523 }
524
525 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
526
527 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
528                                            struct syscall_arg *arg)
529 {
530         int printed = 0, op = arg->val;
531
532         if (op == 0)
533                 return scnprintf(bf, size, "NONE");
534 #define P_CMD(cmd) \
535         if ((op & LOCK_##cmd) == LOCK_##cmd) { \
536                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
537                 op &= ~LOCK_##cmd; \
538         }
539
540         P_CMD(SH);
541         P_CMD(EX);
542         P_CMD(NB);
543         P_CMD(UN);
544         P_CMD(MAND);
545         P_CMD(RW);
546         P_CMD(READ);
547         P_CMD(WRITE);
548 #undef P_OP
549
550         if (op)
551                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
552
553         return printed;
554 }
555
556 #define SCA_FLOCK syscall_arg__scnprintf_flock
557
558 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
559 {
560         enum syscall_futex_args {
561                 SCF_UADDR   = (1 << 0),
562                 SCF_OP      = (1 << 1),
563                 SCF_VAL     = (1 << 2),
564                 SCF_TIMEOUT = (1 << 3),
565                 SCF_UADDR2  = (1 << 4),
566                 SCF_VAL3    = (1 << 5),
567         };
568         int op = arg->val;
569         int cmd = op & FUTEX_CMD_MASK;
570         size_t printed = 0;
571
572         switch (cmd) {
573 #define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
574         P_FUTEX_OP(WAIT);           arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
575         P_FUTEX_OP(WAKE);           arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
576         P_FUTEX_OP(FD);             arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
577         P_FUTEX_OP(REQUEUE);        arg->mask |= SCF_VAL3|SCF_TIMEOUT;            break;
578         P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;                     break;
579         P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;                     break;
580         P_FUTEX_OP(WAKE_OP);                                                      break;
581         P_FUTEX_OP(LOCK_PI);        arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
582         P_FUTEX_OP(UNLOCK_PI);      arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
583         P_FUTEX_OP(TRYLOCK_PI);     arg->mask |= SCF_VAL3|SCF_UADDR2;             break;
584         P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;                      break;
585         P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;                      break;
586         P_FUTEX_OP(WAIT_REQUEUE_PI);                                              break;
587         default: printed = scnprintf(bf, size, "%#x", cmd);                       break;
588         }
589
590         if (op & FUTEX_PRIVATE_FLAG)
591                 printed += scnprintf(bf + printed, size - printed, "|PRIV");
592
593         if (op & FUTEX_CLOCK_REALTIME)
594                 printed += scnprintf(bf + printed, size - printed, "|CLKRT");
595
596         return printed;
597 }
598
599 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
600
601 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
602 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
603
604 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
605 static DEFINE_STRARRAY(itimers);
606
607 static const char *whences[] = { "SET", "CUR", "END",
608 #ifdef SEEK_DATA
609 "DATA",
610 #endif
611 #ifdef SEEK_HOLE
612 "HOLE",
613 #endif
614 };
615 static DEFINE_STRARRAY(whences);
616
617 static const char *fcntl_cmds[] = {
618         "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
619         "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
620         "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
621         "F_GETOWNER_UIDS",
622 };
623 static DEFINE_STRARRAY(fcntl_cmds);
624
625 static const char *rlimit_resources[] = {
626         "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
627         "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
628         "RTTIME",
629 };
630 static DEFINE_STRARRAY(rlimit_resources);
631
632 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
633 static DEFINE_STRARRAY(sighow);
634
635 static const char *clockid[] = {
636         "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
637         "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
638 };
639 static DEFINE_STRARRAY(clockid);
640
/*
 * Socket address family names, indexed by the AF_* value.
 *
 * Slot 28 (AF_MPLS) must be present: without it every family from CAN (29)
 * onwards is reported under the name of its successor.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
650
651 #ifndef SOCK_TYPE_MASK
652 #define SOCK_TYPE_MASK 0xf
653 #endif
654
655 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
656                                                       struct syscall_arg *arg)
657 {
658         size_t printed;
659         int type = arg->val,
660             flags = type & ~SOCK_TYPE_MASK;
661
662         type &= SOCK_TYPE_MASK;
663         /*
664          * Can't use a strarray, MIPS may override for ABI reasons.
665          */
666         switch (type) {
667 #define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
668         P_SK_TYPE(STREAM);
669         P_SK_TYPE(DGRAM);
670         P_SK_TYPE(RAW);
671         P_SK_TYPE(RDM);
672         P_SK_TYPE(SEQPACKET);
673         P_SK_TYPE(DCCP);
674         P_SK_TYPE(PACKET);
675 #undef P_SK_TYPE
676         default:
677                 printed = scnprintf(bf, size, "%#x", type);
678         }
679
680 #define P_SK_FLAG(n) \
681         if (flags & SOCK_##n) { \
682                 printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
683                 flags &= ~SOCK_##n; \
684         }
685
686         P_SK_FLAG(CLOEXEC);
687         P_SK_FLAG(NONBLOCK);
688 #undef P_SK_FLAG
689
690         if (flags)
691                 printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
692
693         return printed;
694 }
695
696 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
697
698 #ifndef MSG_PROBE
699 #define MSG_PROBE            0x10
700 #endif
701 #ifndef MSG_WAITFORONE
702 #define MSG_WAITFORONE  0x10000
703 #endif
704 #ifndef MSG_SENDPAGE_NOTLAST
705 #define MSG_SENDPAGE_NOTLAST 0x20000
706 #endif
707 #ifndef MSG_FASTOPEN
708 #define MSG_FASTOPEN         0x20000000
709 #endif
710
711 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
712                                                struct syscall_arg *arg)
713 {
714         int printed = 0, flags = arg->val;
715
716         if (flags == 0)
717                 return scnprintf(bf, size, "NONE");
718 #define P_MSG_FLAG(n) \
719         if (flags & MSG_##n) { \
720                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
721                 flags &= ~MSG_##n; \
722         }
723
724         P_MSG_FLAG(OOB);
725         P_MSG_FLAG(PEEK);
726         P_MSG_FLAG(DONTROUTE);
727         P_MSG_FLAG(TRYHARD);
728         P_MSG_FLAG(CTRUNC);
729         P_MSG_FLAG(PROBE);
730         P_MSG_FLAG(TRUNC);
731         P_MSG_FLAG(DONTWAIT);
732         P_MSG_FLAG(EOR);
733         P_MSG_FLAG(WAITALL);
734         P_MSG_FLAG(FIN);
735         P_MSG_FLAG(SYN);
736         P_MSG_FLAG(CONFIRM);
737         P_MSG_FLAG(RST);
738         P_MSG_FLAG(ERRQUEUE);
739         P_MSG_FLAG(NOSIGNAL);
740         P_MSG_FLAG(MORE);
741         P_MSG_FLAG(WAITFORONE);
742         P_MSG_FLAG(SENDPAGE_NOTLAST);
743         P_MSG_FLAG(FASTOPEN);
744         P_MSG_FLAG(CMSG_CLOEXEC);
745 #undef P_MSG_FLAG
746
747         if (flags)
748                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
749
750         return printed;
751 }
752
753 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
754
755 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
756                                                  struct syscall_arg *arg)
757 {
758         size_t printed = 0;
759         int mode = arg->val;
760
761         if (mode == F_OK) /* 0 */
762                 return scnprintf(bf, size, "F");
763 #define P_MODE(n) \
764         if (mode & n##_OK) { \
765                 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
766                 mode &= ~n##_OK; \
767         }
768
769         P_MODE(R);
770         P_MODE(W);
771         P_MODE(X);
772 #undef P_MODE
773
774         if (mode)
775                 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
776
777         return printed;
778 }
779
780 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
781
782 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
783                                                struct syscall_arg *arg)
784 {
785         int printed = 0, flags = arg->val;
786
787         if (!(flags & O_CREAT))
788                 arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
789
790         if (flags == 0)
791                 return scnprintf(bf, size, "RDONLY");
792 #define P_FLAG(n) \
793         if (flags & O_##n) { \
794                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
795                 flags &= ~O_##n; \
796         }
797
798         P_FLAG(APPEND);
799         P_FLAG(ASYNC);
800         P_FLAG(CLOEXEC);
801         P_FLAG(CREAT);
802         P_FLAG(DIRECT);
803         P_FLAG(DIRECTORY);
804         P_FLAG(EXCL);
805         P_FLAG(LARGEFILE);
806         P_FLAG(NOATIME);
807         P_FLAG(NOCTTY);
808 #ifdef O_NONBLOCK
809         P_FLAG(NONBLOCK);
810 #elif O_NDELAY
811         P_FLAG(NDELAY);
812 #endif
813 #ifdef O_PATH
814         P_FLAG(PATH);
815 #endif
816         P_FLAG(RDWR);
817 #ifdef O_DSYNC
818         if ((flags & O_SYNC) == O_SYNC)
819                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
820         else {
821                 P_FLAG(DSYNC);
822         }
823 #else
824         P_FLAG(SYNC);
825 #endif
826         P_FLAG(TRUNC);
827         P_FLAG(WRONLY);
828 #undef P_FLAG
829
830         if (flags)
831                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
832
833         return printed;
834 }
835
836 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
837
838 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
839                                                 struct syscall_arg *arg)
840 {
841         int printed = 0, flags = arg->val;
842
843         if (flags == 0)
844                 return 0;
845
846 #define P_FLAG(n) \
847         if (flags & PERF_FLAG_##n) { \
848                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
849                 flags &= ~PERF_FLAG_##n; \
850         }
851
852         P_FLAG(FD_NO_GROUP);
853         P_FLAG(FD_OUTPUT);
854         P_FLAG(PID_CGROUP);
855         P_FLAG(FD_CLOEXEC);
856 #undef P_FLAG
857
858         if (flags)
859                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
860
861         return printed;
862 }
863
864 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
865
866 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
867                                                    struct syscall_arg *arg)
868 {
869         int printed = 0, flags = arg->val;
870
871         if (flags == 0)
872                 return scnprintf(bf, size, "NONE");
873 #define P_FLAG(n) \
874         if (flags & EFD_##n) { \
875                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
876                 flags &= ~EFD_##n; \
877         }
878
879         P_FLAG(SEMAPHORE);
880         P_FLAG(CLOEXEC);
881         P_FLAG(NONBLOCK);
882 #undef P_FLAG
883
884         if (flags)
885                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
886
887         return printed;
888 }
889
890 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
891
892 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
893                                                 struct syscall_arg *arg)
894 {
895         int printed = 0, flags = arg->val;
896
897 #define P_FLAG(n) \
898         if (flags & O_##n) { \
899                 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
900                 flags &= ~O_##n; \
901         }
902
903         P_FLAG(CLOEXEC);
904         P_FLAG(NONBLOCK);
905 #undef P_FLAG
906
907         if (flags)
908                 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
909
910         return printed;
911 }
912
913 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
914
915 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
916 {
917         int sig = arg->val;
918
919         switch (sig) {
920 #define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
921         P_SIGNUM(HUP);
922         P_SIGNUM(INT);
923         P_SIGNUM(QUIT);
924         P_SIGNUM(ILL);
925         P_SIGNUM(TRAP);
926         P_SIGNUM(ABRT);
927         P_SIGNUM(BUS);
928         P_SIGNUM(FPE);
929         P_SIGNUM(KILL);
930         P_SIGNUM(USR1);
931         P_SIGNUM(SEGV);
932         P_SIGNUM(USR2);
933         P_SIGNUM(PIPE);
934         P_SIGNUM(ALRM);
935         P_SIGNUM(TERM);
936         P_SIGNUM(CHLD);
937         P_SIGNUM(CONT);
938         P_SIGNUM(STOP);
939         P_SIGNUM(TSTP);
940         P_SIGNUM(TTIN);
941         P_SIGNUM(TTOU);
942         P_SIGNUM(URG);
943         P_SIGNUM(XCPU);
944         P_SIGNUM(XFSZ);
945         P_SIGNUM(VTALRM);
946         P_SIGNUM(PROF);
947         P_SIGNUM(WINCH);
948         P_SIGNUM(IO);
949         P_SIGNUM(PWR);
950         P_SIGNUM(SYS);
951 #ifdef SIGEMT
952         P_SIGNUM(EMT);
953 #endif
954 #ifdef SIGSTKFLT
955         P_SIGNUM(STKFLT);
956 #endif
957 #ifdef SIGSWI
958         P_SIGNUM(SWI);
959 #endif
960         default: break;
961         }
962
963         return scnprintf(bf, size, "%#x", sig);
964 }
965
966 #define SCA_SIGNUM syscall_arg__scnprintf_signum
967
968 #if defined(__i386__) || defined(__x86_64__)
969 /*
970  * FIXME: Make this available to all arches.
971  */
972 #define TCGETS          0x5401
973
974 static const char *tioctls[] = {
975         "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
976         "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
977         "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
978         "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
979         "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
980         "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
981         "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
982         "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
983         "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
984         "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
985         "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
986         [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
987         "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
988         "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
989         "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
990 };
991
992 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
993 #endif /* defined(__i386__) || defined(__x86_64__) */
994
/*
 * Initializer fragment for a syscall_fmts[] entry whose argument number
 * 'arg' is printed through a string table: it installs SCA_STRARRAY as the
 * formatter for that argument and &strarray__<array> as its parameter.
 * 'name' is not expanded anywhere; it only labels the argument at the use
 * site.  Because the fragment sets .arg_scnprintf and .arg_parm as a whole,
 * entries that also format other arguments spell the initializers out by
 * hand instead of using this macro.
 */
995 #define STRARRAY(arg, name, array) \
996           .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
997           .arg_parm      = { [arg] = &strarray__##array, }
998
/*
 * Per-syscall formatting hints.
 *
 * syscall_fmt__find() looks entries up with bsearch() using strcmp() on
 * .name, so the table MUST stay sorted by .name in strcmp() order; an entry
 * added out of order will silently never be found.
 */
999 static struct syscall_fmt {
1000         const char *name;   /* syscall name; the bsearch() key */
1001         const char *alias;  /* other name tried for the sys_enter_<name> tracepoint when <name> has none */
1002         size_t     (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); /* per-argument formatter, NULL for the default */
1003         void       *arg_parm[6]; /* per-argument parameter handed to the formatter (e.g. a strarray) */
1004         bool       errmsg;  /* NOTE(review): the three flags below are consumed outside this chunk; */
1005         bool       timeout; /* presumably they control how the return value is printed - confirm */
1006         bool       hexret;
1007 } syscall_fmts[] = {
1008         { .name     = "access",     .errmsg = true,
1009           .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
1010         { .name     = "arch_prctl", .errmsg = true, .alias = "prctl", },
1011         { .name     = "brk",        .hexret = true,
1012           .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1013         { .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1014         { .name     = "close",      .errmsg = true,
1015           .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1016         { .name     = "connect",    .errmsg = true, },
1017         { .name     = "dup",        .errmsg = true,
1018           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1019         { .name     = "dup2",       .errmsg = true,
1020           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1021         { .name     = "dup3",       .errmsg = true,
1022           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1023         { .name     = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1024         { .name     = "eventfd2",   .errmsg = true,
1025           .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1026         { .name     = "faccessat",  .errmsg = true,
1027           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1028         { .name     = "fadvise64",  .errmsg = true,
1029           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1030         { .name     = "fallocate",  .errmsg = true,
1031           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1032         { .name     = "fchdir",     .errmsg = true,
1033           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1034         { .name     = "fchmod",     .errmsg = true,
1035           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1036         { .name     = "fchmodat",   .errmsg = true,
1037           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1038         { .name     = "fchown",     .errmsg = true,
1039           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040         { .name     = "fchownat",   .errmsg = true,
1041           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1042         { .name     = "fcntl",      .errmsg = true,
1043           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1044                              [1] = SCA_STRARRAY, /* cmd */ },
1045           .arg_parm      = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1046         { .name     = "fdatasync",  .errmsg = true,
1047           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1048         { .name     = "flock",      .errmsg = true,
1049           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1050                              [1] = SCA_FLOCK, /* cmd */ }, },
1051         { .name     = "fsetxattr",  .errmsg = true,
1052           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1053         { .name     = "fstat",      .errmsg = true, .alias = "newfstat",
1054           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1055         { .name     = "fstatat",    .errmsg = true, .alias = "newfstatat",
1056           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1057         { .name     = "fstatfs",    .errmsg = true,
1058           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1059         { .name     = "fsync",    .errmsg = true,
1060           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061         { .name     = "ftruncate", .errmsg = true,
1062           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1063         { .name     = "futex",      .errmsg = true,
1064           .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1065         { .name     = "futimesat", .errmsg = true,
1066           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1067         { .name     = "getdents",   .errmsg = true,
1068           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069         { .name     = "getdents64", .errmsg = true,
1070           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1071         { .name     = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1072         { .name     = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1073         { .name     = "ioctl",      .errmsg = true,
1074           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1075 #if defined(__i386__) || defined(__x86_64__)
1076 /*
1077  * FIXME: Make this available to all arches.
1078  */
1079                              [1] = SCA_STRHEXARRAY, /* cmd */
1080                              [2] = SCA_HEX, /* arg */ },
1081           .arg_parm      = { [1] = &strarray__tioctls, /* cmd */ }, },
1082 #else
1083                              [2] = SCA_HEX, /* arg */ }, },
1084 #endif
1085         { .name     = "kill",       .errmsg = true,
1086           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1087         { .name     = "linkat",     .errmsg = true,
1088           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1089         { .name     = "lseek",      .errmsg = true,
1090           .arg_scnprintf = { [0] = SCA_FD, /* fd */
1091                              [2] = SCA_STRARRAY, /* whence */ },
1092           .arg_parm      = { [2] = &strarray__whences, /* whence */ }, },
1093         { .name     = "lstat",      .errmsg = true, .alias = "newlstat", },
1094         { .name     = "madvise",    .errmsg = true,
1095           .arg_scnprintf = { [0] = SCA_HEX,      /* start */
1096                              [2] = SCA_MADV_BHV, /* behavior */ }, },
1097         { .name     = "mkdirat",    .errmsg = true,
1098           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1099         { .name     = "mknodat",    .errmsg = true,
1100           .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1101         { .name     = "mlock",      .errmsg = true,
1102           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1103         { .name     = "mlockall",   .errmsg = true,
1104           .arg_scnprintf = { [0] = SCA_HEX, /* flags */ }, },
1105         { .name     = "mmap",       .hexret = true,
1106           .arg_scnprintf = { [0] = SCA_HEX,       /* addr */
1107                              [2] = SCA_MMAP_PROT, /* prot */
1108                              [3] = SCA_MMAP_FLAGS, /* flags */
1109                              [4] = SCA_FD,        /* fd */ }, },
1110         { .name     = "mprotect",   .errmsg = true,
1111           .arg_scnprintf = { [0] = SCA_HEX, /* start */
1112                              [2] = SCA_MMAP_PROT, /* prot */ }, },
1113         { .name     = "mremap",     .hexret = true,
1114           .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1115                              [3] = SCA_MREMAP_FLAGS, /* flags */
1116                              [4] = SCA_HEX, /* new_addr */ }, },
1117         { .name     = "munlock",    .errmsg = true,
1118           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1119         { .name     = "munmap",     .errmsg = true,
1120           .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1121         { .name     = "name_to_handle_at", .errmsg = true,
1122           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1123         { .name     = "newfstatat", .errmsg = true,
1124           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1125         { .name     = "open",       .errmsg = true,
1126           .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1127         { .name     = "open_by_handle_at", .errmsg = true,
1128           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1129                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1130         { .name     = "openat",     .errmsg = true,
1131           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1132                              [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1133         { .name     = "perf_event_open", .errmsg = true,
1134           .arg_scnprintf = { [1] = SCA_INT, /* pid */
1135                              [2] = SCA_INT, /* cpu */
1136                              [3] = SCA_FD,  /* group_fd */
1137                              [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1138         { .name     = "pipe2",      .errmsg = true,
1139           .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1140         { .name     = "poll",       .errmsg = true, .timeout = true, },
1141         { .name     = "ppoll",      .errmsg = true, .timeout = true, },
1142         { .name     = "pread",      .errmsg = true, .alias = "pread64",
1143           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
/* NOTE(review): the "pread" alias below looks copied from the entry above - confirm it is intended */
1144         { .name     = "preadv",     .errmsg = true, .alias = "pread",
1145           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1146         { .name     = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1147         { .name     = "pwrite",     .errmsg = true, .alias = "pwrite64",
1148           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1149         { .name     = "pwritev",    .errmsg = true,
1150           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1151         { .name     = "read",       .errmsg = true,
1152           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1153         { .name     = "readlinkat", .errmsg = true,
1154           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1155         { .name     = "readv",      .errmsg = true,
1156           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1157         { .name     = "recvfrom",   .errmsg = true,
1158           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1159         { .name     = "recvmmsg",   .errmsg = true,
1160           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1161         { .name     = "recvmsg",    .errmsg = true,
1162           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1163         { .name     = "renameat",   .errmsg = true,
1164           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1165         { .name     = "rt_sigaction", .errmsg = true,
1166           .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1167         { .name     = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1168         { .name     = "rt_sigqueueinfo", .errmsg = true,
1169           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1170         { .name     = "rt_tgsigqueueinfo", .errmsg = true,
1171           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1172         { .name     = "select",     .errmsg = true, .timeout = true, },
1173         { .name     = "sendmmsg",    .errmsg = true,
1174           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1175         { .name     = "sendmsg",    .errmsg = true,
1176           .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1177         { .name     = "sendto",     .errmsg = true,
1178           .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1179         { .name     = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1180         { .name     = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1181         { .name     = "shutdown",   .errmsg = true,
1182           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1183         { .name     = "socket",     .errmsg = true,
1184           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1185                              [1] = SCA_SK_TYPE, /* type */ },
1186           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1187         { .name     = "socketpair", .errmsg = true,
1188           .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1189                              [1] = SCA_SK_TYPE, /* type */ },
1190           .arg_parm      = { [0] = &strarray__socket_families, /* family */ }, },
1191         { .name     = "stat",       .errmsg = true, .alias = "newstat", },
1192         { .name     = "symlinkat",  .errmsg = true,
1193           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1194         { .name     = "tgkill",     .errmsg = true,
1195           .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1196         { .name     = "tkill",      .errmsg = true,
1197           .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1198         { .name     = "uname",      .errmsg = true, .alias = "newuname", },
1199         { .name     = "unlinkat",   .errmsg = true,
1200           .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1201         { .name     = "utimensat",  .errmsg = true,
1202           .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1203         { .name     = "write",      .errmsg = true,
1204           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1205         { .name     = "writev",     .errmsg = true,
1206           .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1207 };
1208
1209 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1210 {
1211         const struct syscall_fmt *fmt = fmtp;
1212         return strcmp(name, fmt->name);
1213 }
1214
1215 static struct syscall_fmt *syscall_fmt__find(const char *name)
1216 {
1217         const int nmemb = ARRAY_SIZE(syscall_fmts);
1218         return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1219 }
1220
/*
 * Cached description of one syscall, stored in trace->syscalls.table at the
 * index given by the syscall id and filled in by trace__read_syscall_info().
 */
1221 struct syscall {
1222         struct event_format *tp_format; /* format of the syscalls:sys_enter_<name> tracepoint */
1223         int                 nr_args;    /* number of argument fields, not counting a leading "nr" field */
1224         struct format_field *args;      /* first argument field of tp_format (after "nr", if present) */
1225         const char          *name;      /* name returned by audit_syscall_to_name() */
1226         bool                filtered;   /* set when the event qualifier excludes this syscall */
1227         bool                is_exit;    /* true for "exit" and "exit_group" */
1228         struct syscall_fmt  *fmt;       /* entry in syscall_fmts[], or NULL when there is none */
1229         size_t              (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); /* nr_args formatters, allocated by syscall__set_arg_fmts() */
1230         void                **arg_parm; /* points at fmt->arg_parm when fmt is set */
1231 };
1232
1233 static size_t fprintf_duration(unsigned long t, FILE *fp)
1234 {
1235         double duration = (double)t / NSEC_PER_MSEC;
1236         size_t printed = fprintf(fp, "(");
1237
1238         if (duration >= 1.0)
1239                 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1240         else if (duration >= 0.01)
1241                 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1242         else
1243                 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1244         return printed + fprintf(fp, "): ");
1245 }
1246
/*
 * Per-thread tracing state, created by thread_trace__new() and attached to a
 * struct thread as its private data (see thread__trace()).
 */
1247 struct thread_trace {
1248         u64               entry_time;
1249         u64               exit_time;
1250         bool              entry_pending;
1251         unsigned long     nr_events;    /* bumped on every thread__trace() call for this thread */
1252         unsigned long     pfmaj, pfmin;
1253         char              *entry_str;
1254         double            runtime_ms;
/* fd -> pathname cache: table[fd] is a strdup()ed path or NULL */
1255         struct {
1256                 int       max;          /* highest valid index in table, -1 while the table is empty */
1257                 char      **table;
1258         } paths;
1259
1260         struct intlist *syscall_stats;  /* created with intlist__new(NULL) in thread_trace__new() */
1261 };
1262
1263 static struct thread_trace *thread_trace__new(void)
1264 {
1265         struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1266
1267         if (ttrace)
1268                 ttrace->paths.max = -1;
1269
1270         ttrace->syscall_stats = intlist__new(NULL);
1271
1272         return ttrace;
1273 }
1274
1275 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1276 {
1277         struct thread_trace *ttrace;
1278
1279         if (thread == NULL)
1280                 goto fail;
1281
1282         if (thread__priv(thread) == NULL)
1283                 thread__set_priv(thread, thread_trace__new());
1284
1285         if (thread__priv(thread) == NULL)
1286                 goto fail;
1287
1288         ttrace = thread__priv(thread);
1289         ++ttrace->nr_events;
1290
1291         return ttrace;
1292 fail:
1293         color_fprintf(fp, PERF_COLOR_RED,
1294                       "WARNING: not enough memory, dropping samples!\n");
1295         return NULL;
1296 }
1297
/* Bit flags; NOTE(review): presumably the values stored in trace->trace_pgfaults - confirm at the users */
1298 #define TRACE_PFMAJ             (1 << 0)
1299 #define TRACE_PFMIN             (1 << 1)
1300
/*
 * Global state of one 'perf trace' run.  Only the members whose use is
 * visible next to this definition are annotated.
 */
1301 struct trace {
1302         struct perf_tool        tool;   /* embedded so callbacks can recover the trace via container_of() */
1303         struct {
1304                 int             machine; /* passed to audit_syscall_to_name() */
1305                 int             open_id;
1306         }                       audit;
/* syscall descriptions indexed by syscall id, grown on demand */
1307         struct {
1308                 int             max;    /* highest valid index in table, -1 while the table is empty */
1309                 struct syscall  *table;
1310         } syscalls;
1311         struct record_opts      opts;
1312         struct perf_evlist      *evlist;
1313         struct machine          *host;  /* created by trace__symbols_init() */
1314         struct thread           *current;
1315         u64                     base_time; /* subtracted from timestamps before they are printed */
1316         FILE                    *output;
1317         unsigned long           nr_events;
1318         struct strlist          *ev_qualifier; /* syscall names to select, see not_ev_qualifier */
1319         const char              *last_vfs_getname;
1320         struct intlist          *tid_list;
1321         struct intlist          *pid_list;
1322         struct {
1323                 size_t          nr;
1324                 pid_t           *entries;
1325         }                       filter_pids;
1326         double                  duration_filter; /* in milliseconds, see trace__filter_duration() */
1327         double                  runtime_ms;
1328         struct {
1329                 u64             vfs_getname,
1330                                 proc_getname; /* number of fd paths looked up through /proc */
1331         } stats;
1332         bool                    not_ev_qualifier; /* when set, ev_qualifier lists the syscalls to skip */
1333         bool                    live;   /* fd paths are only read from /proc when this is set */
1334         bool                    full_time;
1335         bool                    sched;
1336         bool                    multiple_threads; /* prefix each line with the tid */
1337         bool                    summary;
1338         bool                    summary_only;
1339         bool                    show_comm; /* with multiple_threads: also print the comm before the tid */
1340         bool                    show_tool_stats;
1341         bool                    trace_syscalls;
1342         bool                    force;
1343         int                     trace_pgfaults;
1344 };
1345
1346 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1347 {
1348         struct thread_trace *ttrace = thread__priv(thread);
1349
1350         if (fd > ttrace->paths.max) {
1351                 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1352
1353                 if (npath == NULL)
1354                         return -1;
1355
1356                 if (ttrace->paths.max != -1) {
1357                         memset(npath + ttrace->paths.max + 1, 0,
1358                                (fd - ttrace->paths.max) * sizeof(char *));
1359                 } else {
1360                         memset(npath, 0, (fd + 1) * sizeof(char *));
1361                 }
1362
1363                 ttrace->paths.table = npath;
1364                 ttrace->paths.max   = fd;
1365         }
1366
1367         ttrace->paths.table[fd] = strdup(pathname);
1368
1369         return ttrace->paths.table[fd] != NULL ? 0 : -1;
1370 }
1371
1372 static int thread__read_fd_path(struct thread *thread, int fd)
1373 {
1374         char linkname[PATH_MAX], pathname[PATH_MAX];
1375         struct stat st;
1376         int ret;
1377
1378         if (thread->pid_ == thread->tid) {
1379                 scnprintf(linkname, sizeof(linkname),
1380                           "/proc/%d/fd/%d", thread->pid_, fd);
1381         } else {
1382                 scnprintf(linkname, sizeof(linkname),
1383                           "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1384         }
1385
1386         if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1387                 return -1;
1388
1389         ret = readlink(linkname, pathname, sizeof(pathname));
1390
1391         if (ret < 0 || ret > st.st_size)
1392                 return -1;
1393
1394         pathname[ret] = '\0';
1395         return trace__set_fd_pathname(thread, fd, pathname);
1396 }
1397
1398 static const char *thread__fd_path(struct thread *thread, int fd,
1399                                    struct trace *trace)
1400 {
1401         struct thread_trace *ttrace = thread__priv(thread);
1402
1403         if (ttrace == NULL)
1404                 return NULL;
1405
1406         if (fd < 0)
1407                 return NULL;
1408
1409         if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1410                 if (!trace->live)
1411                         return NULL;
1412                 ++trace->stats.proc_getname;
1413                 if (thread__read_fd_path(thread, fd))
1414                         return NULL;
1415         }
1416
1417         return ttrace->paths.table[fd];
1418 }
1419
1420 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1421                                         struct syscall_arg *arg)
1422 {
1423         int fd = arg->val;
1424         size_t printed = scnprintf(bf, size, "%d", fd);
1425         const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1426
1427         if (path)
1428                 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1429
1430         return printed;
1431 }
1432
1433 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1434                                               struct syscall_arg *arg)
1435 {
1436         int fd = arg->val;
1437         size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1438         struct thread_trace *ttrace = thread__priv(arg->thread);
1439
1440         if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1441                 zfree(&ttrace->paths.table[fd]);
1442
1443         return printed;
1444 }
1445
1446 static bool trace__filter_duration(struct trace *trace, double t)
1447 {
1448         return t < (trace->duration_filter * NSEC_PER_MSEC);
1449 }
1450
1451 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1452 {
1453         double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1454
1455         return fprintf(fp, "%10.3f ", ts);
1456 }
1457
/*
 * Flags set from the signal handler below and read by the rest of the file.
 * They are written asynchronously, so they are volatile sig_atomic_t: that
 * is the only object type the C standard lets a handler store to, and
 * volatile keeps the compiler from caching the value across a polling loop.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: ask the tool to stop, and record whether the request came
 * from SIGINT (as opposed to any other signal routed here).
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1466
1467 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1468                                         u64 duration, u64 tstamp, FILE *fp)
1469 {
1470         size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1471         printed += fprintf_duration(duration, fp);
1472
1473         if (trace->multiple_threads) {
1474                 if (trace->show_comm)
1475                         printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1476                 printed += fprintf(fp, "%d ", thread->tid);
1477         }
1478
1479         return printed;
1480 }
1481
1482 static int trace__process_event(struct trace *trace, struct machine *machine,
1483                                 union perf_event *event, struct perf_sample *sample)
1484 {
1485         int ret = 0;
1486
1487         switch (event->header.type) {
1488         case PERF_RECORD_LOST:
1489                 color_fprintf(trace->output, PERF_COLOR_RED,
1490                               "LOST %" PRIu64 " events!\n", event->lost.lost);
1491                 ret = machine__process_lost_event(machine, event, sample);
1492         default:
1493                 ret = machine__process_event(machine, event, sample);
1494                 break;
1495         }
1496
1497         return ret;
1498 }
1499
1500 static int trace__tool_process(struct perf_tool *tool,
1501                                union perf_event *event,
1502                                struct perf_sample *sample,
1503                                struct machine *machine)
1504 {
1505         struct trace *trace = container_of(tool, struct trace, tool);
1506         return trace__process_event(trace, machine, event, sample);
1507 }
1508
1509 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1510 {
1511         int err = symbol__init(NULL);
1512
1513         if (err)
1514                 return err;
1515
1516         trace->host = machine__new_host();
1517         if (trace->host == NULL)
1518                 return -ENOMEM;
1519
1520         err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1521                                             evlist->threads, trace__tool_process, false,
1522                                             trace->opts.proc_map_timeout);
1523         if (err)
1524                 symbol__exit();
1525
1526         return err;
1527 }
1528
1529 static int syscall__set_arg_fmts(struct syscall *sc)
1530 {
1531         struct format_field *field;
1532         int idx = 0;
1533
1534         sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1535         if (sc->arg_scnprintf == NULL)
1536                 return -1;
1537
1538         if (sc->fmt)
1539                 sc->arg_parm = sc->fmt->arg_parm;
1540
1541         for (field = sc->args; field; field = field->next) {
1542                 if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1543                         sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1544                 else if (field->flags & FIELD_IS_POINTER)
1545                         sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1546                 ++idx;
1547         }
1548
1549         return 0;
1550 }
1551
1552 static int trace__read_syscall_info(struct trace *trace, int id)
1553 {
1554         char tp_name[128];
1555         struct syscall *sc;
1556         const char *name = audit_syscall_to_name(id, trace->audit.machine);
1557
1558         if (name == NULL)
1559                 return -1;
1560
1561         if (id > trace->syscalls.max) {
1562                 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1563
1564                 if (nsyscalls == NULL)
1565                         return -1;
1566
1567                 if (trace->syscalls.max != -1) {
1568                         memset(nsyscalls + trace->syscalls.max + 1, 0,
1569                                (id - trace->syscalls.max) * sizeof(*sc));
1570                 } else {
1571                         memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1572                 }
1573
1574                 trace->syscalls.table = nsyscalls;
1575                 trace->syscalls.max   = id;
1576         }
1577
1578         sc = trace->syscalls.table + id;
1579         sc->name = name;
1580
1581         if (trace->ev_qualifier) {
1582                 bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1583
1584                 if (!(in ^ trace->not_ev_qualifier)) {
1585                         sc->filtered = true;
1586                         /*
1587                          * No need to do read tracepoint information since this will be
1588                          * filtered out.
1589                          */
1590                         return 0;
1591                 }
1592         }
1593
1594         sc->fmt  = syscall_fmt__find(sc->name);
1595
1596         snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1597         sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1598
1599         if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1600                 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1601                 sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1602         }
1603
1604         if (sc->tp_format == NULL)
1605                 return -1;
1606
1607         sc->args = sc->tp_format->format.fields;
1608         sc->nr_args = sc->tp_format->format.nr_fields;
1609         /* drop nr field - not relevant here; does not exist on older kernels */
1610         if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1611                 sc->args = sc->args->next;
1612                 --sc->nr_args;
1613         }
1614
1615         sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1616
1617         return syscall__set_arg_fmts(sc);
1618 }
1619
1620 /*
1621  * args is to be interpreted as a series of longs but we need to handle
1622  * 8-byte unaligned accesses. args points to raw_data within the event
1623  * and raw_data is guaranteed to be 8-byte unaligned because it is
1624  * preceded by raw_size which is a u32. So we need to copy args to a temp
1625  * variable to read it. Most notably this avoids extended load instructions
1626  * on unaligned addresses
1627  */
1628
1629 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1630                                       unsigned char *args, struct trace *trace,
1631                                       struct thread *thread)
1632 {
1633         size_t printed = 0;
1634         unsigned char *p;
1635         unsigned long val;
1636
1637         if (sc->args != NULL) {
1638                 struct format_field *field;
1639                 u8 bit = 1;
1640                 struct syscall_arg arg = {
1641                         .idx    = 0,
1642                         .mask   = 0,
1643                         .trace  = trace,
1644                         .thread = thread,
1645                 };
1646
1647                 for (field = sc->args; field;
1648                      field = field->next, ++arg.idx, bit <<= 1) {
1649                         if (arg.mask & bit)
1650                                 continue;
1651
1652                         /* special care for unaligned accesses */
1653                         p = args + sizeof(unsigned long) * arg.idx;
1654                         memcpy(&val, p, sizeof(val));
1655
1656                         /*
1657                          * Suppress this argument if its value is zero and
1658                          * and we don't have a string associated in an
1659                          * strarray for it.
1660                          */
1661                         if (val == 0 &&
1662                             !(sc->arg_scnprintf &&
1663                               sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1664                               sc->arg_parm[arg.idx]))
1665                                 continue;
1666
1667                         printed += scnprintf(bf + printed, size - printed,
1668                                              "%s%s: ", printed ? ", " : "", field->name);
1669                         if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1670                                 arg.val = val;
1671                                 if (sc->arg_parm)
1672                                         arg.parm = sc->arg_parm[arg.idx];
1673                                 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1674                                                                       size - printed, &arg);
1675                         } else {
1676                                 printed += scnprintf(bf + printed, size - printed,
1677                                                      "%ld", val);
1678                         }
1679                 }
1680         } else {
1681                 int i = 0;
1682
1683                 while (i < 6) {
1684                         /* special care for unaligned accesses */
1685                         p = args + sizeof(unsigned long) * i;
1686                         memcpy(&val, p, sizeof(val));
1687                         printed += scnprintf(bf + printed, size - printed,
1688                                              "%sarg%d: %ld",
1689                                              printed ? ", " : "", i, val);
1690                         ++i;
1691                 }
1692         }
1693
1694         return printed;
1695 }
1696
/*
 * Signature shared by the per-event callbacks stored in evsel->handler
 * (trace__sys_enter, trace__sys_exit, trace__pgfault, ...).  The callbacks
 * receive the owning trace, the evsel that produced the sample, the raw
 * perf event and the parsed sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
                                  union perf_event *event,
                                  struct perf_sample *sample);
1700
1701 static struct syscall *trace__syscall_info(struct trace *trace,
1702                                            struct perf_evsel *evsel, int id)
1703 {
1704
1705         if (id < 0) {
1706
1707                 /*
1708                  * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1709                  * before that, leaving at a higher verbosity level till that is
1710                  * explained. Reproduced with plain ftrace with:
1711                  *
1712                  * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1713                  * grep "NR -1 " /t/trace_pipe
1714                  *
1715                  * After generating some load on the machine.
1716                  */
1717                 if (verbose > 1) {
1718                         static u64 n;
1719                         fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1720                                 id, perf_evsel__name(evsel), ++n);
1721                 }
1722                 return NULL;
1723         }
1724
1725         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1726             trace__read_syscall_info(trace, id))
1727                 goto out_cant_read;
1728
1729         if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1730                 goto out_cant_read;
1731
1732         return &trace->syscalls.table[id];
1733
1734 out_cant_read:
1735         if (verbose) {
1736                 fprintf(trace->output, "Problems reading syscall %d", id);
1737                 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1738                         fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1739                 fputs(" information\n", trace->output);
1740         }
1741         return NULL;
1742 }
1743
1744 static void thread__update_stats(struct thread_trace *ttrace,
1745                                  int id, struct perf_sample *sample)
1746 {
1747         struct int_node *inode;
1748         struct stats *stats;
1749         u64 duration = 0;
1750
1751         inode = intlist__findnew(ttrace->syscall_stats, id);
1752         if (inode == NULL)
1753                 return;
1754
1755         stats = inode->priv;
1756         if (stats == NULL) {
1757                 stats = malloc(sizeof(struct stats));
1758                 if (stats == NULL)
1759                         return;
1760                 init_stats(stats);
1761                 inode->priv = stats;
1762         }
1763
1764         if (ttrace->entry_time && sample->time > ttrace->entry_time)
1765                 duration = sample->time - ttrace->entry_time;
1766
1767         update_stats(stats, duration);
1768 }
1769
1770 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1771 {
1772         struct thread_trace *ttrace;
1773         u64 duration;
1774         size_t printed;
1775
1776         if (trace->current == NULL)
1777                 return 0;
1778
1779         ttrace = thread__priv(trace->current);
1780
1781         if (!ttrace->entry_pending)
1782                 return 0;
1783
1784         duration = sample->time - ttrace->entry_time;
1785
1786         printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1787         printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1788         ttrace->entry_pending = false;
1789
1790         return printed;
1791 }
1792
1793 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1794                             union perf_event *event __maybe_unused,
1795                             struct perf_sample *sample)
1796 {
1797         char *msg;
1798         void *args;
1799         size_t printed = 0;
1800         struct thread *thread;
1801         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1802         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1803         struct thread_trace *ttrace;
1804
1805         if (sc == NULL)
1806                 return -1;
1807
1808         if (sc->filtered)
1809                 return 0;
1810
1811         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1812         ttrace = thread__trace(thread, trace->output);
1813         if (ttrace == NULL)
1814                 goto out_put;
1815
1816         args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1817
1818         if (ttrace->entry_str == NULL) {
1819                 ttrace->entry_str = malloc(1024);
1820                 if (!ttrace->entry_str)
1821                         goto out_put;
1822         }
1823
1824         if (!trace->summary_only)
1825                 trace__printf_interrupted_entry(trace, sample);
1826
1827         ttrace->entry_time = sample->time;
1828         msg = ttrace->entry_str;
1829         printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1830
1831         printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1832                                            args, trace, thread);
1833
1834         if (sc->is_exit) {
1835                 if (!trace->duration_filter && !trace->summary_only) {
1836                         trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1837                         fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1838                 }
1839         } else
1840                 ttrace->entry_pending = true;
1841
1842         if (trace->current != thread) {
1843                 thread__put(trace->current);
1844                 trace->current = thread__get(thread);
1845         }
1846         err = 0;
1847 out_put:
1848         thread__put(thread);
1849         return err;
1850 }
1851
1852 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1853                            union perf_event *event __maybe_unused,
1854                            struct perf_sample *sample)
1855 {
1856         long ret;
1857         u64 duration = 0;
1858         struct thread *thread;
1859         int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1860         struct syscall *sc = trace__syscall_info(trace, evsel, id);
1861         struct thread_trace *ttrace;
1862
1863         if (sc == NULL)
1864                 return -1;
1865
1866         if (sc->filtered)
1867                 return 0;
1868
1869         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1870         ttrace = thread__trace(thread, trace->output);
1871         if (ttrace == NULL)
1872                 goto out_put;
1873
1874         if (trace->summary)
1875                 thread__update_stats(ttrace, id, sample);
1876
1877         ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1878
1879         if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1880                 trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1881                 trace->last_vfs_getname = NULL;
1882                 ++trace->stats.vfs_getname;
1883         }
1884
1885         ttrace->exit_time = sample->time;
1886
1887         if (ttrace->entry_time) {
1888                 duration = sample->time - ttrace->entry_time;
1889                 if (trace__filter_duration(trace, duration))
1890                         goto out;
1891         } else if (trace->duration_filter)
1892                 goto out;
1893
1894         if (trace->summary_only)
1895                 goto out;
1896
1897         trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1898
1899         if (ttrace->entry_pending) {
1900                 fprintf(trace->output, "%-70s", ttrace->entry_str);
1901         } else {
1902                 fprintf(trace->output, " ... [");
1903                 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1904                 fprintf(trace->output, "]: %s()", sc->name);
1905         }
1906
1907         if (sc->fmt == NULL) {
1908 signed_print:
1909                 fprintf(trace->output, ") = %ld", ret);
1910         } else if (ret < 0 && sc->fmt->errmsg) {
1911                 char bf[STRERR_BUFSIZE];
1912                 const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1913                            *e = audit_errno_to_name(-ret);
1914
1915                 fprintf(trace->output, ") = -1 %s %s", e, emsg);
1916         } else if (ret == 0 && sc->fmt->timeout)
1917                 fprintf(trace->output, ") = 0 Timeout");
1918         else if (sc->fmt->hexret)
1919                 fprintf(trace->output, ") = %#lx", ret);
1920         else
1921                 goto signed_print;
1922
1923         fputc('\n', trace->output);
1924 out:
1925         ttrace->entry_pending = false;
1926         err = 0;
1927 out_put:
1928         thread__put(thread);
1929         return err;
1930 }
1931
1932 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1933                               union perf_event *event __maybe_unused,
1934                               struct perf_sample *sample)
1935 {
1936         trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1937         return 0;
1938 }
1939
1940 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1941                                      union perf_event *event __maybe_unused,
1942                                      struct perf_sample *sample)
1943 {
1944         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1945         double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1946         struct thread *thread = machine__findnew_thread(trace->host,
1947                                                         sample->pid,
1948                                                         sample->tid);
1949         struct thread_trace *ttrace = thread__trace(thread, trace->output);
1950
1951         if (ttrace == NULL)
1952                 goto out_dump;
1953
1954         ttrace->runtime_ms += runtime_ms;
1955         trace->runtime_ms += runtime_ms;
1956         thread__put(thread);
1957         return 0;
1958
1959 out_dump:
1960         fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1961                evsel->name,
1962                perf_evsel__strval(evsel, sample, "comm"),
1963                (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1964                runtime,
1965                perf_evsel__intval(evsel, sample, "vruntime"));
1966         thread__put(thread);
1967         return 0;
1968 }
1969
1970 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1971                                 union perf_event *event __maybe_unused,
1972                                 struct perf_sample *sample)
1973 {
1974         trace__printf_interrupted_entry(trace, sample);
1975         trace__fprintf_tstamp(trace, sample->time, trace->output);
1976
1977         if (trace->trace_syscalls)
1978                 fprintf(trace->output, "(         ): ");
1979
1980         fprintf(trace->output, "%s:", evsel->name);
1981
1982         if (evsel->tp_format) {
1983                 event_format__fprintf(evsel->tp_format, sample->cpu,
1984                                       sample->raw_data, sample->raw_size,
1985                                       trace->output);
1986         }
1987
1988         fprintf(trace->output, ")\n");
1989         return 0;
1990 }
1991
1992 static void print_location(FILE *f, struct perf_sample *sample,
1993                            struct addr_location *al,
1994                            bool print_dso, bool print_sym)
1995 {
1996
1997         if ((verbose || print_dso) && al->map)
1998                 fprintf(f, "%s@", al->map->dso->long_name);
1999
2000         if ((verbose || print_sym) && al->sym)
2001                 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2002                         al->addr - al->sym->start);
2003         else if (al->map)
2004                 fprintf(f, "0x%" PRIx64, al->addr);
2005         else
2006                 fprintf(f, "0x%" PRIx64, sample->addr);
2007 }
2008
2009 static int trace__pgfault(struct trace *trace,
2010                           struct perf_evsel *evsel,
2011                           union perf_event *event,
2012                           struct perf_sample *sample)
2013 {
2014         struct thread *thread;
2015         u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2016         struct addr_location al;
2017         char map_type = 'd';
2018         struct thread_trace *ttrace;
2019         int err = -1;
2020
2021         thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2022         ttrace = thread__trace(thread, trace->output);
2023         if (ttrace == NULL)
2024                 goto out_put;
2025
2026         if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2027                 ttrace->pfmaj++;
2028         else
2029                 ttrace->pfmin++;
2030
2031         if (trace->summary_only)
2032                 goto out;
2033
2034         thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2035                               sample->ip, &al);
2036
2037         trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2038
2039         fprintf(trace->output, "%sfault [",
2040                 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2041                 "maj" : "min");
2042
2043         print_location(trace->output, sample, &al, false, true);
2044
2045         fprintf(trace->output, "] => ");
2046
2047         thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2048                                    sample->addr, &al);
2049
2050         if (!al.map) {
2051                 thread__find_addr_location(thread, cpumode,
2052                                            MAP__FUNCTION, sample->addr, &al);
2053
2054                 if (al.map)
2055                         map_type = 'x';
2056                 else
2057                         map_type = '?';
2058         }
2059
2060         print_location(trace->output, sample, &al, true, false);
2061
2062         fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2063 out:
2064         err = 0;
2065 out_put:
2066         thread__put(thread);
2067         return err;
2068 }
2069
2070 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2071 {
2072         if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2073             (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2074                 return false;
2075
2076         if (trace->pid_list || trace->tid_list)
2077                 return true;
2078
2079         return false;
2080 }
2081
2082 static int trace__process_sample(struct perf_tool *tool,
2083                                  union perf_event *event,
2084                                  struct perf_sample *sample,
2085                                  struct perf_evsel *evsel,
2086                                  struct machine *machine __maybe_unused)
2087 {
2088         struct trace *trace = container_of(tool, struct trace, tool);
2089         int err = 0;
2090
2091         tracepoint_handler handler = evsel->handler;
2092
2093         if (skip_sample(trace, sample))
2094                 return 0;
2095
2096         if (!trace->full_time && trace->base_time == 0)
2097                 trace->base_time = sample->time;
2098
2099         if (handler) {
2100                 ++trace->nr_events;
2101                 handler(trace, evsel, event, sample);
2102         }
2103
2104         return err;
2105 }
2106
2107 static int parse_target_str(struct trace *trace)
2108 {
2109         if (trace->opts.target.pid) {
2110                 trace->pid_list = intlist__new(trace->opts.target.pid);
2111                 if (trace->pid_list == NULL) {
2112                         pr_err("Error parsing process id string\n");
2113                         return -EINVAL;
2114                 }
2115         }
2116
2117         if (trace->opts.target.tid) {
2118                 trace->tid_list = intlist__new(trace->opts.target.tid);
2119                 if (trace->tid_list == NULL) {
2120                         pr_err("Error parsing thread id string\n");
2121                         return -EINVAL;
2122                 }
2123         }
2124
2125         return 0;
2126 }
2127
2128 static int trace__record(struct trace *trace, int argc, const char **argv)
2129 {
2130         unsigned int rec_argc, i, j;
2131         const char **rec_argv;
2132         const char * const record_args[] = {
2133                 "record",
2134                 "-R",
2135                 "-m", "1024",
2136                 "-c", "1",
2137         };
2138
2139         const char * const sc_args[] = { "-e", };
2140         unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2141         const char * const majpf_args[] = { "-e", "major-faults" };
2142         unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2143         const char * const minpf_args[] = { "-e", "minor-faults" };
2144         unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2145
2146         /* +1 is for the event string below */
2147         rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2148                 majpf_args_nr + minpf_args_nr + argc;
2149         rec_argv = calloc(rec_argc + 1, sizeof(char *));
2150
2151         if (rec_argv == NULL)
2152                 return -ENOMEM;
2153
2154         j = 0;
2155         for (i = 0; i < ARRAY_SIZE(record_args); i++)
2156                 rec_argv[j++] = record_args[i];
2157
2158         if (trace->trace_syscalls) {
2159                 for (i = 0; i < sc_args_nr; i++)
2160                         rec_argv[j++] = sc_args[i];
2161
2162                 /* event string may be different for older kernels - e.g., RHEL6 */
2163                 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2164                         rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2165                 else if (is_valid_tracepoint("syscalls:sys_enter"))
2166                         rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2167                 else {
2168                         pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2169                         return -1;
2170                 }
2171         }
2172
2173         if (trace->trace_pgfaults & TRACE_PFMAJ)
2174                 for (i = 0; i < majpf_args_nr; i++)
2175                         rec_argv[j++] = majpf_args[i];
2176
2177         if (trace->trace_pgfaults & TRACE_PFMIN)
2178                 for (i = 0; i < minpf_args_nr; i++)
2179                         rec_argv[j++] = minpf_args[i];
2180
2181         for (i = 0; i < (unsigned int)argc; i++)
2182                 rec_argv[j++] = argv[i];
2183
2184         return cmd_record(j, rec_argv, NULL);
2185 }
2186
/* Forward declaration: called from trace__run() and trace__replay() below. */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2188
2189 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2190 {
2191         struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2192         if (evsel == NULL)
2193                 return;
2194
2195         if (perf_evsel__field(evsel, "pathname") == NULL) {
2196                 perf_evsel__delete(evsel);
2197                 return;
2198         }
2199
2200         evsel->handler = trace__vfs_getname;
2201         perf_evlist__add(evlist, evsel);
2202 }
2203
2204 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2205                                     u64 config)
2206 {
2207         struct perf_evsel *evsel;
2208         struct perf_event_attr attr = {
2209                 .type = PERF_TYPE_SOFTWARE,
2210                 .mmap_data = 1,
2211         };
2212
2213         attr.config = config;
2214         attr.sample_period = 1;
2215
2216         event_attr_init(&attr);
2217
2218         evsel = perf_evsel__new(&attr);
2219         if (!evsel)
2220                 return -ENOMEM;
2221
2222         evsel->handler = trace__pgfault;
2223         perf_evlist__add(evlist, evsel);
2224
2225         return 0;
2226 }
2227
2228 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2229 {
2230         const u32 type = event->header.type;
2231         struct perf_evsel *evsel;
2232
2233         if (!trace->full_time && trace->base_time == 0)
2234                 trace->base_time = sample->time;
2235
2236         if (type != PERF_RECORD_SAMPLE) {
2237                 trace__process_event(trace, trace->host, event, sample);
2238                 return;
2239         }
2240
2241         evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2242         if (evsel == NULL) {
2243                 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2244                 return;
2245         }
2246
2247         if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2248             sample->raw_data == NULL) {
2249                 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2250                        perf_evsel__name(evsel), sample->tid,
2251                        sample->cpu, sample->raw_size);
2252         } else {
2253                 tracepoint_handler handler = evsel->handler;
2254                 handler(trace, evsel, event, sample);
2255         }
2256 }
2257
/*
 * Live tracing entry point.
 *
 * Adds the requested events to trace->evlist (syscall tracepoints plus the
 * vfs_getname probe, major/minor page faults, sched_stat_runtime), creates
 * the target maps, optionally prepares and starts the workload given in
 * @argv, mmaps the ring buffers and then consumes events until interrupted
 * or until a pass over the buffers yields nothing and polling says there is
 * nothing more to wait for.
 *
 * On every return path the evlist is deleted and trace->evlist is reset to
 * NULL.  Returns the value of err at that point: negative after a setup
 * failure (a message is written to trace->output), otherwise the result of
 * the last step that assigned it.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
        struct perf_evlist *evlist = trace->evlist;
        int err = -1, i;
        unsigned long before;
        /* Any remaining arguments are taken to be a workload to fork. */
        const bool forks = argc > 0;
        bool draining = false;

        trace->live = true;

        if (trace->trace_syscalls &&
            perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
                                           trace__sys_exit))
                goto out_error_raw_syscalls;

        /* Best effort: the helper returns void, a missing probe is not an error. */
        if (trace->trace_syscalls)
                perf_evlist__add_vfs_getname(evlist);

        if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
            perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
                goto out_error_mem;
        }

        if ((trace->trace_pgfaults & TRACE_PFMIN) &&
            perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
                goto out_error_mem;

        if (trace->sched &&
            perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
                                   trace__sched_stat_runtime))
                goto out_error_sched_stat_runtime;

        err = perf_evlist__create_maps(evlist, &trace->opts.target);
        if (err < 0) {
                fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
                goto out_delete_evlist;
        }

        err = trace__symbols_init(trace, evlist);
        if (err < 0) {
                fprintf(trace->output, "Problems initializing symbol libraries!\n");
                goto out_delete_evlist;
        }

        perf_evlist__config(evlist, &trace->opts);

        /* presumably sig_handler sets the 'done'/'interrupted' flags tested below -- TODO confirm */
        signal(SIGCHLD, sig_handler);
        signal(SIGINT, sig_handler);

        if (forks) {
                err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
                                                    argv, false, NULL);
                if (err < 0) {
                        fprintf(trace->output, "Couldn't run the workload!\n");
                        goto out_delete_evlist;
                }
        }

        err = perf_evlist__open(evlist);
        if (err < 0)
                goto out_error_open;

        /*
         * Better not use !target__has_task() here because we need to cover the
         * case where no threads were specified in the command line, but a
         * workload was, and in that case we will fill in the thread_map when
         * we fork the workload in perf_evlist__prepare_workload.
         */
        if (trace->filter_pids.nr > 0)
                err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
        else if (evlist->threads->map[0] == -1)
                err = perf_evlist__set_filter_pid(evlist, getpid());

        /*
         * NOTE(review): unlike every other failure in this function, this one
         * reports on stdout rather than trace->output and exits the process
         * without going through out_delete_evlist.
         */
        if (err < 0) {
                printf("err=%d,%s\n", -err, strerror(-err));
                exit(1);
        }

        err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
        if (err < 0)
                goto out_error_mmap;

        if (!target__none(&trace->opts.target))
                perf_evlist__enable(evlist);

        if (forks)
                perf_evlist__start_workload(evlist);

        trace->multiple_threads = evlist->threads->map[0] == -1 ||
                                  evlist->threads->nr > 1 ||
                                  perf_evlist__first(evlist)->attr.inherit;
again:
        /* Snapshot the counter so a pass that consumed nothing can be detected. */
        before = trace->nr_events;

        for (i = 0; i < evlist->nr_mmaps; i++) {
                union perf_event *event;

                while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
                        struct perf_sample sample;

                        ++trace->nr_events;

                        err = perf_evlist__parse_sample(evlist, event, &sample);
                        if (err) {
                                fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
                                goto next_event;
                        }

                        trace__handle_event(trace, event, &sample);
next_event:
                        /* The event is consumed whether or not it could be parsed. */
                        perf_evlist__mmap_consume(evlist, i);

                        if (interrupted)
                                goto out_disable;

                        /*
                         * Once 'done' is set, stop the events but keep reading
                         * what is still in the buffers.
                         */
                        if (done && !draining) {
                                perf_evlist__disable(evlist);
                                draining = true;
                        }
                }
        }

        if (trace->nr_events == before) {
                /* Nothing was read: wait for more, without a timeout unless 'done'. */
                int timeout = done ? 100 : -1;

                if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
                        /* No pollable fds left after filtering out errored/hung-up ones. */
                        if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
                                draining = true;

                        goto again;
                }
        } else {
                goto again;
        }

out_disable:
        thread__zput(trace->current);

        perf_evlist__disable(evlist);

        if (!err) {
                if (trace->summary)
                        trace__fprintf_thread_summary(trace, trace->output);

                if (trace->show_tool_stats) {
                        fprintf(trace->output, "Stats:\n "
                                               " vfs_getname : %" PRIu64 "\n"
                                               " proc_getname: %" PRIu64 "\n",
                                trace->stats.vfs_getname,
                                trace->stats.proc_getname);
                }
        }

out_delete_evlist:
        perf_evlist__delete(evlist);
        trace->evlist = NULL;
        trace->live = false;
        return err;
/*
 * The brace block below is reachable only through the out_error_* gotos
 * above; it exists to give errbuf a scope of its own.  Each label formats a
 * message for its failure into errbuf and falls into, or jumps to, out_error,
 * which prints it and joins the common cleanup.
 */
{
        char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
        debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
        goto out_error;

out_error_raw_syscalls:
        debugfs__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
        goto out_error;

out_error_mmap:
        perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
        goto out_error;

out_error_open:
        perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
        fprintf(trace->output, "%s\n", errbuf);
        goto out_delete_evlist;
}
out_error_mem:
        fprintf(trace->output, "Not enough memory to run!\n");
        goto out_delete_evlist;
}
2442
2443 static int trace__replay(struct trace *trace)
2444 {
2445         const struct perf_evsel_str_handler handlers[] = {
2446                 { "probe:vfs_getname",       trace__vfs_getname, },
2447         };
2448         struct perf_data_file file = {
2449                 .path  = input_name,
2450                 .mode  = PERF_DATA_MODE_READ,
2451                 .force = trace->force,
2452         };
2453         struct perf_session *session;
2454         struct perf_evsel *evsel;
2455         int err = -1;
2456
2457         trace->tool.sample        = trace__process_sample;
2458         trace->tool.mmap          = perf_event__process_mmap;
2459         trace->tool.mmap2         = perf_event__process_mmap2;
2460         trace->tool.comm          = perf_event__process_comm;
2461         trace->tool.exit          = perf_event__process_exit;
2462         trace->tool.fork          = perf_event__process_fork;
2463         trace->tool.attr          = perf_event__process_attr;
2464         trace->tool.tracing_data = perf_event__process_tracing_data;
2465         trace->tool.build_id      = perf_event__process_build_id;
2466
2467         trace->tool.ordered_events = true;
2468         trace->tool.ordering_requires_timestamps = true;
2469
2470         /* add tid to output */
2471         trace->multiple_threads = true;
2472
2473         session = perf_session__new(&file, false, &trace->tool);
2474         if (session == NULL)
2475                 return -1;
2476
2477         if (symbol__init(&session->header.env) < 0)
2478                 goto out;
2479
2480         trace->host = &session->machines.host;
2481
2482         err = perf_session__set_tracepoints_handlers(session, handlers);
2483         if (err)
2484                 goto out;
2485
2486         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2487                                                      "raw_syscalls:sys_enter");
2488         /* older kernels have syscalls tp versus raw_syscalls */
2489         if (evsel == NULL)
2490                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2491                                                              "syscalls:sys_enter");
2492
2493         if (evsel &&
2494             (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2495             perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2496                 pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2497                 goto out;
2498         }
2499
2500         evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2501                                                      "raw_syscalls:sys_exit");
2502         if (evsel == NULL)
2503                 evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2504                                                              "syscalls:sys_exit");
2505         if (evsel &&
2506             (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2507             perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2508                 pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2509                 goto out;
2510         }
2511
2512         evlist__for_each(session->evlist, evsel) {
2513                 if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2514                     (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2515                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2516                      evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2517                         evsel->handler = trace__pgfault;
2518         }
2519
2520         err = parse_target_str(trace);
2521         if (err != 0)
2522                 goto out;
2523
2524         setup_pager();
2525
2526         err = perf_session__process_events(session);
2527         if (err)
2528                 pr_err("Failed to process events, error %d", err);
2529
2530         else if (trace->summary)
2531                 trace__fprintf_thread_summary(trace, trace->output);
2532
2533 out:
2534         perf_session__delete(session);
2535
2536         return err;
2537 }
2538
/*
 * Emit the banner that precedes the per-thread summary.
 * Returns what fprintf() reported for the banner.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2547
2548 static size_t thread__dump_stats(struct thread_trace *ttrace,
2549                                  struct trace *trace, FILE *fp)
2550 {
2551         struct stats *stats;
2552         size_t printed = 0;
2553         struct syscall *sc;
2554         struct int_node *inode = intlist__first(ttrace->syscall_stats);
2555
2556         if (inode == NULL)
2557                 return 0;
2558
2559         printed += fprintf(fp, "\n");
2560
2561         printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2562         printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2563         printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2564
2565         /* each int_node is a syscall */
2566         while (inode) {
2567                 stats = inode->priv;
2568                 if (stats) {
2569                         double min = (double)(stats->min) / NSEC_PER_MSEC;
2570                         double max = (double)(stats->max) / NSEC_PER_MSEC;
2571                         double avg = avg_stats(stats);
2572                         double pct;
2573                         u64 n = (u64) stats->n;
2574
2575                         pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2576                         avg /= NSEC_PER_MSEC;
2577
2578                         sc = &trace->syscalls.table[inode->i];
2579                         printed += fprintf(fp, "   %-15s", sc->name);
2580                         printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2581                                            n, min, avg);
2582                         printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2583                 }
2584
2585                 inode = intlist__next(inode);
2586         }
2587
2588         printed += fprintf(fp, "\n\n");
2589
2590         return printed;
2591 }
2592
/*
 * Context handed to the per-thread summary callback through its opaque
 * priv pointer.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* trace instance being summarized */
	size_t printed;		/* running total of fprintf() return values */
};
2599
2600 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2601 {
2602         struct summary_data *data = priv;
2603         FILE *fp = data->fp;
2604         size_t printed = data->printed;
2605         struct trace *trace = data->trace;
2606         struct thread_trace *ttrace = thread__priv(thread);
2607         double ratio;
2608
2609         if (ttrace == NULL)
2610                 return 0;
2611
2612         ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2613
2614         printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2615         printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2616         printed += fprintf(fp, "%.1f%%", ratio);
2617         if (ttrace->pfmaj)
2618                 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2619         if (ttrace->pfmin)
2620                 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2621         printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2622         printed += thread__dump_stats(ttrace, trace, fp);
2623
2624         data->printed += printed;
2625
2626         return 0;
2627 }
2628
2629 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2630 {
2631         struct summary_data data = {
2632                 .fp = fp,
2633                 .trace = trace
2634         };
2635         data.printed = trace__fprintf_threads_header(fp);
2636
2637         machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2638
2639         return data.printed;
2640 }
2641
2642 static int trace__set_duration(const struct option *opt, const char *str,
2643                                int unset __maybe_unused)
2644 {
2645         struct trace *trace = opt->value;
2646
2647         trace->duration_filter = atof(str);
2648         return 0;
2649 }
2650
2651 static int trace__set_filter_pids(const struct option *opt, const char *str,
2652                                   int unset __maybe_unused)
2653 {
2654         int ret = -1;
2655         size_t i;
2656         struct trace *trace = opt->value;
2657         /*
2658          * FIXME: introduce a intarray class, plain parse csv and create a
2659          * { int nr, int entries[] } struct...
2660          */
2661         struct intlist *list = intlist__new(str);
2662
2663         if (list == NULL)
2664                 return -1;
2665
2666         i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2667         trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2668
2669         if (trace->filter_pids.entries == NULL)
2670                 goto out;
2671
2672         trace->filter_pids.entries[0] = getpid();
2673
2674         for (i = 1; i < trace->filter_pids.nr; ++i)
2675                 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2676
2677         intlist__delete(list);
2678         ret = 0;
2679 out:
2680         return ret;
2681 }
2682
2683 static int trace__open_output(struct trace *trace, const char *filename)
2684 {
2685         struct stat st;
2686
2687         if (!stat(filename, &st) && st.st_size) {
2688                 char oldname[PATH_MAX];
2689
2690                 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2691                 unlink(oldname);
2692                 rename(filename, oldname);
2693         }
2694
2695         trace->output = fopen(filename, "w");
2696
2697         return trace->output == NULL ? -errno : 0;
2698 }
2699
2700 static int parse_pagefaults(const struct option *opt, const char *str,
2701                             int unset __maybe_unused)
2702 {
2703         int *trace_pgfaults = opt->value;
2704
2705         if (strcmp(str, "all") == 0)
2706                 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2707         else if (strcmp(str, "maj") == 0)
2708                 *trace_pgfaults |= TRACE_PFMAJ;
2709         else if (strcmp(str, "min") == 0)
2710                 *trace_pgfaults |= TRACE_PFMIN;
2711         else
2712                 return -1;
2713
2714         return 0;
2715 }
2716
2717 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2718 {
2719         struct perf_evsel *evsel;
2720
2721         evlist__for_each(evlist, evsel)
2722                 evsel->handler = handler;
2723 }
2724
2725 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2726 {
2727         const char *trace_usage[] = {
2728                 "perf trace [<options>] [<command>]",
2729                 "perf trace [<options>] -- <command> [<options>]",
2730                 "perf trace record [<options>] [<command>]",
2731                 "perf trace record [<options>] -- <command> [<options>]",
2732                 NULL
2733         };
2734         struct trace trace = {
2735                 .audit = {
2736                         .machine = audit_detect_machine(),
2737                         .open_id = audit_name_to_syscall("open", trace.audit.machine),
2738                 },
2739                 .syscalls = {
2740                         . max = -1,
2741                 },
2742                 .opts = {
2743                         .target = {
2744                                 .uid       = UINT_MAX,
2745                                 .uses_mmap = true,
2746                         },
2747                         .user_freq     = UINT_MAX,
2748                         .user_interval = ULLONG_MAX,
2749                         .no_buffering  = true,
2750                         .mmap_pages    = UINT_MAX,
2751                         .proc_map_timeout  = 500,
2752                 },
2753                 .output = stdout,
2754                 .show_comm = true,
2755                 .trace_syscalls = true,
2756         };
2757         const char *output_name = NULL;
2758         const char *ev_qualifier_str = NULL;
2759         const struct option trace_options[] = {
2760         OPT_CALLBACK(0, "event", &trace.evlist, "event",
2761                      "event selector. use 'perf list' to list available events",
2762                      parse_events_option),
2763         OPT_BOOLEAN(0, "comm", &trace.show_comm,
2764                     "show the thread COMM next to its id"),
2765         OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2766         OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2767         OPT_STRING('o', "output", &output_name, "file", "output file name"),
2768         OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2769         OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2770                     "trace events on existing process id"),
2771         OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2772                     "trace events on existing thread id"),
2773         OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2774                      "pids to filter (by the kernel)", trace__set_filter_pids),
2775         OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2776                     "system-wide collection from all CPUs"),
2777         OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2778                     "list of cpus to monitor"),
2779         OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2780                     "child tasks do not inherit counters"),
2781         OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2782                      "number of mmap data pages",
2783                      perf_evlist__parse_mmap_pages),
2784         OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2785                    "user to profile"),
2786         OPT_CALLBACK(0, "duration", &trace, "float",
2787                      "show only events with duration > N.M ms",
2788                      trace__set_duration),
2789         OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2790         OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2791         OPT_BOOLEAN('T', "time", &trace.full_time,
2792                     "Show full timestamp, not time relative to first start"),
2793         OPT_BOOLEAN('s', "summary", &trace.summary_only,
2794                     "Show only syscall summary with statistics"),
2795         OPT_BOOLEAN('S', "with-summary", &trace.summary,
2796                     "Show all syscalls and summary with statistics"),
2797         OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2798                      "Trace pagefaults", parse_pagefaults, "maj"),
2799         OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2800         OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2801         OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2802                         "per thread proc mmap processing timeout in ms"),
2803         OPT_END()
2804         };
2805         const char * const trace_subcommands[] = { "record", NULL };
2806         int err;
2807         char bf[BUFSIZ];
2808
2809         signal(SIGSEGV, sighandler_dump_stack);
2810         signal(SIGFPE, sighandler_dump_stack);
2811
2812         trace.evlist = perf_evlist__new();
2813
2814         if (trace.evlist == NULL) {
2815                 pr_err("Not enough memory to run!\n");
2816                 err = -ENOMEM;
2817                 goto out;
2818         }
2819
2820         argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2821                                  trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2822
2823         if (trace.trace_pgfaults) {
2824                 trace.opts.sample_address = true;
2825                 trace.opts.sample_time = true;
2826         }
2827
2828         if (trace.evlist->nr_entries > 0)
2829                 evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2830
2831         if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2832                 return trace__record(&trace, argc-1, &argv[1]);
2833
2834         /* summary_only implies summary option, but don't overwrite summary if set */
2835         if (trace.summary_only)
2836                 trace.summary = trace.summary_only;
2837
2838         if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2839             trace.evlist->nr_entries == 0 /* Was --events used? */) {
2840                 pr_err("Please specify something to trace.\n");
2841                 return -1;
2842         }
2843
2844         if (output_name != NULL) {
2845                 err = trace__open_output(&trace, output_name);
2846                 if (err < 0) {
2847                         perror("failed to create output file");
2848                         goto out;
2849                 }
2850         }
2851
2852         if (ev_qualifier_str != NULL) {
2853                 const char *s = ev_qualifier_str;
2854
2855                 trace.not_ev_qualifier = *s == '!';
2856                 if (trace.not_ev_qualifier)
2857                         ++s;
2858                 trace.ev_qualifier = strlist__new(true, s);
2859                 if (trace.ev_qualifier == NULL) {
2860                         fputs("Not enough memory to parse event qualifier",
2861                               trace.output);
2862                         err = -ENOMEM;
2863                         goto out_close;
2864                 }
2865         }
2866
2867         err = target__validate(&trace.opts.target);
2868         if (err) {
2869                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2870                 fprintf(trace.output, "%s", bf);
2871                 goto out_close;
2872         }
2873
2874         err = target__parse_uid(&trace.opts.target);
2875         if (err) {
2876                 target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2877                 fprintf(trace.output, "%s", bf);
2878                 goto out_close;
2879         }
2880
2881         if (!argc && target__none(&trace.opts.target))
2882                 trace.opts.target.system_wide = true;
2883
2884         if (input_name)
2885                 err = trace__replay(&trace);
2886         else
2887                 err = trace__run(&trace, argc, argv);
2888
2889 out_close:
2890         if (output_name != NULL)
2891                 fclose(trace.output);
2892 out:
2893         return err;
2894 }