silo/masstree/perfstat.cc

   1 /* Masstree
   2  * Eddie Kohler, Yandong Mao, Robert Morris
   3  * Copyright (c) 2012-2013 President and Fellows of Harvard College
   4  * Copyright (c) 2012-2013 Massachusetts Institute of Technology
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, subject to the conditions
   9  * listed in the Masstree LICENSE file. These conditions include: you must
  10  * preserve this copyright notice, and you cannot mention the copyright
  11  * holders in advertising related to the Software without their permission.
  12  * The Software is provided WITHOUT ANY WARRANTY, EXPRESS OR IMPLIED. This
  13  * notice is a summary of the Masstree LICENSE file; the license in that file
  14  * is legally binding.
  15  */
  16 #include "perfstat.hh"
  17 #include "compiler.hh"
  18 #include "kvstats.hh"
  19 #if HAVE_NUMA_H
  20 #include <numa.h>
  21 #endif
  22
  23 enum { MaxCores = 48 };   // Maximum number of cores kvdb statistics support
  24 enum { MaxNumaNode = 8 }; // Maximum number of Numa node kvdb statistics support
  25 enum { CoresPerChip = MaxCores / MaxNumaNode };
  26
  27 namespace Perf {
  28
  29 #if MEMSTATS && HAVE_NUMA_H && HAVE_LIBNUMA
  30 static struct {
  31   long long free;
  32   long long size;
  33 } numa[MaxNumaNode];
  34 #endif
  35
  36 void
  37 stat::initmain(bool pinthreads) {
  38     (void) pinthreads;
  39 #if PMC_ENABLED
  40     always_assert(pinthreads && "Using performance counter requires pinning threads to cores!");
  41 #endif
  42 #if MEMSTATS && HAVE_NUMA_H && HAVE_LIBNUMA
  43     if (numa_available() != -1) {
  44         always_assert(numa_max_node() <= MaxNumaNode);
  45         for (int i = 0; i <= numa_max_node(); i++)
  46             numa[i].size = numa_node_size64(i, &numa[i].free);
  47     }
  48 #endif
  49 }
  50
  51 template <typename T>
  52 kvstats
  53 sum_all_cores(const stat **s, int n, const int offset) {
  54     kvstats sum;
  55     for (int i = 0; i < n; i++) {
  56         if (!s[i])
  57             continue;
  58         T v = *reinterpret_cast<const T *>(reinterpret_cast<const char *>(s[i]) + offset);
  59         sum.add(v);
  60     }
  61     return sum;
  62 }
  63
  64 template <typename T>
  65 kvstats
  66 sum_one_chip(const stat **s, int n, const int offset, const int chipidx) {
  67     kvstats sum;
  68     for (int i = 0; i < n; i++) {
  69         if (!s[i] || s[i]->cid / (MaxCores / MaxNumaNode) != chipidx)
  70             continue;
  71         T v = *reinterpret_cast<const T *>(reinterpret_cast<const char *>(s[i]) + offset);
  72         sum.add(v);
  73     }
  74     return sum;
  75 }
  76
  77 template <typename T>
  78 kvstats
  79 sum_all_per_chip(const stat **s, int n, const int offset) {
  80     kvstats per_chip[MaxNumaNode];
  81     for (int i  = 0; i < n; i++) {
  82         if (!s[i])
  83             continue;
  84         T v = *reinterpret_cast<const T *>(reinterpret_cast<const char *>(s[i]) + offset);
  85         per_chip[i / CoresPerChip].add(v);
  86     }
  87     kvstats sum;
  88     for (int i = 0; i < MaxNumaNode; i++)
  89         if (per_chip[i].count)
  90             sum.add(per_chip[i].avg());
  91     return sum;
  92 }
  93
  94 void
  95 stat::print(const stat **s, int n) {
  96     (void)n;
  97     (void)s;
  98 #define sum_all_cores_of(field) \
  99     sum_all_cores<typeof(s[0]->field)>(s, n, offsetof(Perf::stat, field))
 100 #define sum_one_chip_of(field, c) \
 101     sum_one_chip<typeof(s[0]->field)>(s, n, offsetof(Perf::stat, field), c)
 102 #define sum_all_per_chip_of(field) \
 103     sum_all_per_chip<typeof(s[0]->field)>(s, n, offsetof(Perf::stat, field))
 104
 105 #define sum_all_cores_of_array(field, oa) \
 106     sum_all_cores<typeof(s[0]->field[0])>(s, n, offsetof(Perf::stat, field) + \
 107                                           sizeof(s[0]->field[0]) * oa)
 108 #define sum_one_chip_of_array(field, oa, c) \
 109     sum_one_chip<typeof(s[0]->field[0])>(s, n, offsetof(Perf::stat, field) + \
 110                                          sizeof(s[0]->field[0]) * oa, c)
 111 #define sum_all_per_chip_of_array(field, oa) \
 112     sum_all_per_chip<typeof(s[0]->field[0])>(s, n, offsetof(Perf::stat, field) + \
 113                                              sizeof(s[0]->field[0]) * oa)
 114
 115 #if GETSTATS && 0
 116     for (int i = 0; i < n; i++)
 117         if (s[i]->ngets < 1000) {
 118             s[i] = NULL;
 119             continue;
 120         }
 121     kvstats ngets = sum_all_cores_of(ngets);
 122     kvstats ntsc = sum_all_cores_of(ntsc);
 123     kvstats np = sum_all_cores_of(nprobe);
 124     if (np.sum >= 1)
 125         fprintf(stderr, "Total probe %.0f, probe/get %.2f\n", np.sum, np.sum / ngets.sum);
 126 #if PMC_ENABLED
 127     fprintf(stderr, "(Inaccurate because PMC is Enabled!)");
 128 #endif
 129     fprintf(stderr, "Cycles/get (between mark_get_begin and mark_get_end): %.0f\n",
 130             ntsc.sum / ngets.sum);
 131 #if PMC_ENABLED
 132     for (int i = 0; i < n; i++) {
 133         if (!s[i])
 134             continue;
 135         fprintf(stderr, "Core %d:\n", i);
 136         for (int pi = 0; pi < 4; pi++) {
 137             fprintf(stderr, "\tpmc[%d]: %016" PRIx64 "->%016" PRIx64 "\n",
 138                     pi, s[i]->pmc_firstget[pi], s[i]->pmc_start[pi]);
 139             always_assert(s[i]->pmc_start[pi] >= s[i]->pmc_firstget[pi]);
 140             always_assert(s[i]->t1_lastget >= s[i]->t0_firstget);
 141         }
 142     }
 143     // Compute the start and end time of get phase
 144     kvstats getstart = sum_all_cores_of(t0_firstget);
 145     kvstats getend = sum_all_cores_of(t1_lastget);
 146     getstart.print_report("time of first get");
 147     getend.print_report("time of last get");
 148
 149     // Compute per-chip pmc during the whole get phase
 150     double pcpmc_phase[MaxNumaNode][4];
 151     for (int i = 0; i < MaxNumaNode; i++)
 152         for (int pi = 0; pi < 4; pi++)
 153             pcpmc_phase[i][pi] = sum_one_chip_of_array(pmc_start, pi, i).avg() -
 154                                  sum_one_chip_of_array(pmc_firstget, pi, i).avg();
 155
 156     // Compute cputime and realtime during get phase
 157     kvstats t_firstget = sum_all_cores_of(t0_firstget);
 158     kvstats t_lastget = sum_all_cores_of(t1_lastget);
 159     double realtime = t_lastget.avg() - t_firstget.avg();
 160
 161     for (int pi = 0; pi < 4; pi++) {
 162         fprintf(stderr, "DRAM access to node (pmc %d)\n", pi);
 163         double sum = 0;
 164         for (int i = 0; i < MaxNumaNode; i++) {
 165             fprintf(stderr, "\tFrom chip %2d: %8.1f GB/s\n", i,
 166                     pcpmc_phase[i][pi] * 64 / (realtime * (1 << 30)));
 167             sum += pcpmc_phase[i][pi];
 168         }
 169         fprintf(stderr, "\tSum: %8.1f GB/s\n",
 170                 sum * 64 / (realtime * (1 << 30)));
 171     }
 172     // Print per-get pmc_lookup
 173     fprintf(stderr, "Per get statistics (counted between mark_get_begin and mark_get_end):\n");
 174     for (int pi = 0; (ngets.sum > 0) && pi < 4; pi ++) {
 175         kvstats pmc_lookup = sum_all_cores_of_array(pmc_lookup, pi);
 176         kvstats pcpmc_lookup = sum_all_per_chip_of_array(pmc_lookup, pi);
 177         fprintf(stderr, "\tpmc%d/get: %6.1f, per_chip_pmc%d/get: %6.1f\n",
 178                 pi, (double) pmc_lookup.sum / ngets.sum, pi,
 179                (double) pcpmc_lookup.sum / ngets.sum);
 180     }
 181 #endif
 182 #endif
 183
 184 #if MEMSTATS && HAVE_NUMA_H && HAVE_LIBNUMA && 0
 185     // collect tree memory
 186     kvstats tree_mem = sum_all_cores_of(tree_mem);
 187     kvstats tree_keys = sum_all_cores_of(tree_keys);
 188     fprintf(stderr, "Memory statistics\n");
 189     fprintf(stderr, "\tAllocated per key: %.0f bytes, %.0f\n", tree_mem.sum / tree_keys.sum, tree_keys.sum);
 190     if (numa_available() != -1) {
 191         unsigned long total_alloc = 0;
 192         for (int i = 0; i <= numa_max_node(); i++) {
 193             kvstats chip = sum_one_chip_of(tree_mem, i);
 194             long long nowfree;
 195             long long size = numa_node_size64(i, &nowfree);
 196             total_alloc += numa[i].free - nowfree;
 197             fprintf(stderr, "\tNode %d (MB): size %6lld, allocated = %6lld - "
 198                     "%6lld = %6lld, tree_mem %6.0f\n",
 199                     i, size >> 20, numa[i].free >> 20, nowfree >> 20,
 200                     (numa[i].free - nowfree) / (1 << 20),
 201                     chip.sum / (1 << 20));
 202         }
 203         fprintf(stderr, "Total allocated memory %ld MB\n", total_alloc >> 20);
 204     }
 205 #endif
 206
 207 #if GCSTATS
 208     // collect memory used by epoch based garbage collector
 209     kvstats gc_nfree = sum_all_cores_of(gc_nfree);
 210     kvstats gc_nalloc = sum_all_cores_of(gc_nalloc);
 211     fprintf(stderr, "reuse per gc slot: %.0f, freed: %.0f, allocated: %.0f\n",
 212             gc_nfree.sum / gc_nalloc.sum, gc_nfree.sum, gc_nalloc.sum);
 213 #endif
 214 }
 215
 216 }