net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26 #include <linux/rbtree.h>
  27
  28 #include <net/netlink.h>
  29 #include <net/pkt_sched.h>
  30 #include <net/inet_ecn.h>
  31
  32 #define VERSION "1.3"
  33
  34 /*      Network Emulation Queuing algorithm.
  35         ====================================
  36
  37         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
   38                  Network Emulation Tool"
  39                  [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41          ----------------------------------------------------------------
  42
  43          This started out as a simple way to delay outgoing packets to
  44          test TCP but has grown to include most of the functionality
  45          of a full blown network emulator like NISTnet. It can delay
  46          packets and add random jitter (and correlation). The random
  47          distribution can be loaded from a table as well to provide
  48          normal, Pareto, or experimental curves. Packet loss,
  49          duplication, and reordering can also be emulated.
  50
  51          This qdisc does not do classification that can be handled in
  52          layering other disciplines.  It does not need to do bandwidth
  53          control either since that can be handled by using token
  54          bucket or other rate control.
  55
  56      Correlated Loss Generator models
  57
  58         Added generation of correlated loss according to the
  59         "Gilbert-Elliot" model, a 4-state markov model.
  60
  61         References:
  62         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64         and intuitive loss model for packet networks and its implementation
  65         in the Netem module in the Linux kernel", available in [1]
  66
   67         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  68                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  69 */
  70
  71 struct netem_sched_data {
  72         /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  73         struct rb_root t_root;
  74
  75         /* optional qdisc for classful handling (NULL at netem init) */
  76         struct Qdisc    *qdisc;
  77
  78         struct qdisc_watchdog watchdog;
  79
  80         psched_tdiff_t latency;
  81         psched_tdiff_t jitter;
  82
  83         u32 loss;
  84         u32 ecn;
  85         u32 limit;
  86         u32 counter;
  87         u32 gap;
  88         u32 duplicate;
  89         u32 reorder;
  90         u32 corrupt;
  91         u32 rate;
  92         s32 packet_overhead;
  93         u32 cell_size;
  94         u32 cell_size_reciprocal;
  95         s32 cell_overhead;
  96
  97         struct crndstate {
  98                 u32 last;
  99                 u32 rho;
 100         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 101
 102         struct disttable {
 103                 u32  size;
 104                 s16 table[0];
 105         } *delay_dist;
 106
 107         enum  {
 108                 CLG_RANDOM,
 109                 CLG_4_STATES,
 110                 CLG_GILB_ELL,
 111         } loss_model;
 112
 113         /* Correlated Loss Generation models */
 114         struct clgstate {
 115                 /* state of the Markov chain */
 116                 u8 state;
 117
 118                 /* 4-states and Gilbert-Elliot models */
 119                 u32 a1; /* p13 for 4-states or p for GE */
 120                 u32 a2; /* p31 for 4-states or r for GE */
 121                 u32 a3; /* p32 for 4-states or h for GE */
 122                 u32 a4; /* p14 for 4-states or 1-k for GE */
 123                 u32 a5; /* p23 used only in 4-states */
 124         } clg;
 125
 126 };
 127
 128 /* Time stamp put into socket buffer control block
 129  * Only valid when skbs are in our internal t(ime)fifo queue.
 130  */
 131 struct netem_skb_cb {
 132         psched_time_t   time_to_send;
 133         ktime_t         tstamp_save;
 134 };
 135
 136 /* Because space in skb->cb[] is tight, netem overloads skb->next/prev/tstamp
 137  * to hold a rb_node structure.
 138  *
 139  * If struct sk_buff layout is changed, the following checks will complain.
 140  */
 141 static struct rb_node *netem_rb_node(struct sk_buff *skb)
 142 {
 143         BUILD_BUG_ON(offsetof(struct sk_buff, next) != 0);
 144         BUILD_BUG_ON(offsetof(struct sk_buff, prev) !=
 145                      offsetof(struct sk_buff, next) + sizeof(skb->next));
 146         BUILD_BUG_ON(offsetof(struct sk_buff, tstamp) !=
 147                      offsetof(struct sk_buff, prev) + sizeof(skb->prev));
 148         BUILD_BUG_ON(sizeof(struct rb_node) > sizeof(skb->next) +
 149                                               sizeof(skb->prev) +
 150                                               sizeof(skb->tstamp));
 151         return (struct rb_node *)&skb->next;
 152 }
 153
 154 static struct sk_buff *netem_rb_to_skb(struct rb_node *rb)
 155 {
 156         return (struct sk_buff *)rb;
 157 }
 158
 159 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 160 {
 161         /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 162         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 163         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 164 }
 165
 166 /* init_crandom - initialize correlated random number generator
 167  * Use entropy source for initial seed.
 168  */
 169 static void init_crandom(struct crndstate *state, unsigned long rho)
 170 {
 171         state->rho = rho;
 172         state->last = net_random();
 173 }
 174
 175 /* get_crandom - correlated random number generator
 176  * Next number depends on last value.
 177  * rho is scaled to avoid floating point.
 178  */
 179 static u32 get_crandom(struct crndstate *state)
 180 {
 181         u64 value, rho;
 182         unsigned long answer;
 183
 184         if (state->rho == 0)    /* no correlation */
 185                 return net_random();
 186
 187         value = net_random();
 188         rho = (u64)state->rho + 1;
 189         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 190         state->last = answer;
 191         return answer;
 192 }
 193
 194 /* loss_4state - 4-state model loss generator
 195  * Generates losses according to the 4-state Markov chain adopted in
 196  * the GI (General and Intuitive) loss model.
 197  */
 198 static bool loss_4state(struct netem_sched_data *q)
 199 {
 200         struct clgstate *clg = &q->clg;
 201         u32 rnd = net_random();
 202
 203         /*
 204          * Makes a comparison between rnd and the transition
 205          * probabilities outgoing from the current state, then decides the
 206          * next state and if the next packet has to be transmitted or lost.
 207          * The four states correspond to:
 208          *   1 => successfully transmitted packets within a gap period
 209          *   4 => isolated losses within a gap period
 210          *   3 => lost packets within a burst period
 211          *   2 => successfully transmitted packets within a burst period
 212          */
 213         switch (clg->state) {
 214         case 1:
 215                 if (rnd < clg->a4) {
 216                         clg->state = 4;
 217                         return true;
 218                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 219                         clg->state = 3;
 220                         return true;
 221                 } else if (clg->a1 < rnd)
 222                         clg->state = 1;
 223
 224                 break;
 225         case 2:
 226                 if (rnd < clg->a5) {
 227                         clg->state = 3;
 228                         return true;
 229                 } else
 230                         clg->state = 2;
 231
 232                 break;
 233         case 3:
 234                 if (rnd < clg->a3)
 235                         clg->state = 2;
 236                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 237                         clg->state = 1;
 238                 } else if (clg->a2 + clg->a3 < rnd) {
 239                         clg->state = 3;
 240                         return true;
 241                 }
 242                 break;
 243         case 4:
 244                 clg->state = 1;
 245                 break;
 246         }
 247
 248         return false;
 249 }
 250
 251 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 252  * Generates losses according to the Gilbert-Elliot loss model or
 253  * its special cases  (Gilbert or Simple Gilbert)
 254  *
 255  * Makes a comparison between random number and the transition
 256  * probabilities outgoing from the current state, then decides the
 257  * next state. A second random number is extracted and the comparison
 258  * with the loss probability of the current state decides if the next
 259  * packet will be transmitted or lost.
 260  */
 261 static bool loss_gilb_ell(struct netem_sched_data *q)
 262 {
 263         struct clgstate *clg = &q->clg;
 264
 265         switch (clg->state) {
 266         case 1:
 267                 if (net_random() < clg->a1)
 268                         clg->state = 2;
 269                 if (net_random() < clg->a4)
 270                         return true;
 271         case 2:
 272                 if (net_random() < clg->a2)
 273                         clg->state = 1;
 274                 if (clg->a3 > net_random())
 275                         return true;
 276         }
 277
 278         return false;
 279 }
 280
 281 static bool loss_event(struct netem_sched_data *q)
 282 {
 283         switch (q->loss_model) {
 284         case CLG_RANDOM:
 285                 /* Random packet drop 0 => none, ~0 => all */
 286                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 287
 288         case CLG_4_STATES:
 289                 /* 4state loss model algorithm (used also for GI model)
 290                 * Extracts a value from the markov 4 state loss generator,
 291                 * if it is 1 drops a packet and if needed writes the event in
 292                 * the kernel logs
 293                 */
 294                 return loss_4state(q);
 295
 296         case CLG_GILB_ELL:
 297                 /* Gilbert-Elliot loss model algorithm
 298                 * Extracts a value from the Gilbert-Elliot loss generator,
 299                 * if it is 1 drops a packet and if needed writes the event in
 300                 * the kernel logs
 301                 */
 302                 return loss_gilb_ell(q);
 303         }
 304
 305         return false;   /* not reached */
 306 }
 307
 308
 309 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 310  * std deviation sigma.  Uses table lookup to approximate the desired
 311  * distribution, and a uniformly-distributed pseudo-random source.
 312  */
 313 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 314                                 struct crndstate *state,
 315                                 const struct disttable *dist)
 316 {
 317         psched_tdiff_t x;
 318         long t;
 319         u32 rnd;
 320
 321         if (sigma == 0)
 322                 return mu;
 323
 324         rnd = get_crandom(state);
 325
 326         /* default uniform distribution */
 327         if (dist == NULL)
 328                 return (rnd % (2*sigma)) - sigma + mu;
 329
 330         t = dist->table[rnd % dist->size];
 331         x = (sigma % NETEM_DIST_SCALE) * t;
 332         if (x >= 0)
 333                 x += NETEM_DIST_SCALE/2;
 334         else
 335                 x -= NETEM_DIST_SCALE/2;
 336
 337         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 338 }
 339
 340 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 341 {
 342         u64 ticks;
 343
 344         len += q->packet_overhead;
 345
 346         if (q->cell_size) {
 347                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 348
 349                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 350                         cells++;
 351                 len = cells * (q->cell_size + q->cell_overhead);
 352         }
 353
 354         ticks = (u64)len * NSEC_PER_SEC;
 355
 356         do_div(ticks, q->rate);
 357         return PSCHED_NS2TICKS(ticks);
 358 }
 359
 360 static void tfifo_reset(struct Qdisc *sch)
 361 {
 362         struct netem_sched_data *q = qdisc_priv(sch);
 363         struct rb_node *p;
 364
 365         while ((p = rb_first(&q->t_root))) {
 366                 struct sk_buff *skb = netem_rb_to_skb(p);
 367
 368                 rb_erase(p, &q->t_root);
 369                 skb->next = NULL;
 370                 skb->prev = NULL;
 371                 kfree_skb(skb);
 372         }
 373 }
 374
 375 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 376 {
 377         struct netem_sched_data *q = qdisc_priv(sch);
 378         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 379         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 380
 381         while (*p) {
 382                 struct sk_buff *skb;
 383
 384                 parent = *p;
 385                 skb = netem_rb_to_skb(parent);
 386                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 387                         p = &parent->rb_right;
 388                 else
 389                         p = &parent->rb_left;
 390         }
 391         rb_link_node(netem_rb_node(nskb), parent, p);
 392         rb_insert_color(netem_rb_node(nskb), &q->t_root);
 393         sch->q.qlen++;
 394 }
 395
 396 /*
 397  * Insert one skb into qdisc.
 398  * Note: parent depends on return value to account for queue length.
 399  *      NET_XMIT_DROP: queue length didn't change.
 400  *      NET_XMIT_SUCCESS: one skb was queued.
 401  */
 402 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 403 {
 404         struct netem_sched_data *q = qdisc_priv(sch);
 405         /* We don't fill cb now as skb_unshare() may invalidate it */
 406         struct netem_skb_cb *cb;
 407         struct sk_buff *skb2;
 408         int count = 1;
 409
 410         /* Random duplication */
 411         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 412                 ++count;
 413
 414         /* Drop packet? */
 415         if (loss_event(q)) {
 416                 if (q->ecn && INET_ECN_set_ce(skb))
 417                         sch->qstats.drops++; /* mark packet */
 418                 else
 419                         --count;
 420         }
 421         if (count == 0) {
 422                 sch->qstats.drops++;
 423                 kfree_skb(skb);
 424                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 425         }
 426
 427         /* If a delay is expected, orphan the skb. (orphaning usually takes
 428          * place at TX completion time, so _before_ the link transit delay)
 429          */
 430         if (q->latency || q->jitter)
 431                 skb_orphan_partial(skb);
 432
 433         /*
 434          * If we need to duplicate packet, then re-insert at top of the
 435          * qdisc tree, since parent queuer expects that only one
 436          * skb will be queued.
 437          */
 438         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 439                 struct Qdisc *rootq = qdisc_root(sch);
 440                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 441                 q->duplicate = 0;
 442
 443                 qdisc_enqueue_root(skb2, rootq);
 444                 q->duplicate = dupsave;
 445         }
 446
 447         /*
 448          * Randomized packet corruption.
 449          * Make copy if needed since we are modifying
 450          * If packet is going to be hardware checksummed, then
 451          * do it now in software before we mangle it.
 452          */
 453         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 454                 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 455                     (skb->ip_summed == CHECKSUM_PARTIAL &&
 456                      skb_checksum_help(skb)))
 457                         return qdisc_drop(skb, sch);
 458
 459                 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 460         }
 461
 462         if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
 463                 return qdisc_reshape_fail(skb, sch);
 464
 465         sch->qstats.backlog += qdisc_pkt_len(skb);
 466
 467         cb = netem_skb_cb(skb);
 468         if (q->gap == 0 ||              /* not doing reordering */
 469             q->counter < q->gap - 1 ||  /* inside last reordering gap */
 470             q->reorder < get_crandom(&q->reorder_cor)) {
 471                 psched_time_t now;
 472                 psched_tdiff_t delay;
 473
 474                 delay = tabledist(q->latency, q->jitter,
 475                                   &q->delay_cor, q->delay_dist);
 476
 477                 now = psched_get_time();
 478
 479                 if (q->rate) {
 480                         struct sk_buff *last;
 481
 482                         if (!skb_queue_empty(&sch->q))
 483                                 last = skb_peek_tail(&sch->q);
 484                         else
 485                                 last = netem_rb_to_skb(rb_last(&q->t_root));
 486                         if (last) {
 487                                 /*
 488                                  * Last packet in queue is reference point (now),
 489                                  * calculate this time bonus and subtract
 490                                  * from delay.
 491                                  */
 492                                 delay -= netem_skb_cb(last)->time_to_send - now;
 493                                 delay = max_t(psched_tdiff_t, 0, delay);
 494                                 now = netem_skb_cb(last)->time_to_send;
 495                         }
 496
 497                         delay += packet_len_2_sched_time(skb->len, q);
 498                 }
 499
 500                 cb->time_to_send = now + delay;
 501                 cb->tstamp_save = skb->tstamp;
 502                 ++q->counter;
 503                 tfifo_enqueue(skb, sch);
 504         } else {
 505                 /*
 506                  * Do re-ordering by putting one out of N packets at the front
 507                  * of the queue.
 508                  */
 509                 cb->time_to_send = psched_get_time();
 510                 q->counter = 0;
 511
 512                 __skb_queue_head(&sch->q, skb);
 513                 sch->qstats.requeues++;
 514         }
 515
 516         return NET_XMIT_SUCCESS;
 517 }
 518
 519 static unsigned int netem_drop(struct Qdisc *sch)
 520 {
 521         struct netem_sched_data *q = qdisc_priv(sch);
 522         unsigned int len;
 523
 524         len = qdisc_queue_drop(sch);
 525
 526         if (!len) {
 527                 struct rb_node *p = rb_first(&q->t_root);
 528
 529                 if (p) {
 530                         struct sk_buff *skb = netem_rb_to_skb(p);
 531
 532                         rb_erase(p, &q->t_root);
 533                         sch->q.qlen--;
 534                         skb->next = NULL;
 535                         skb->prev = NULL;
 536                         len = qdisc_pkt_len(skb);
 537                         sch->qstats.backlog -= len;
 538                         kfree_skb(skb);
 539                 }
 540         }
 541         if (!len && q->qdisc && q->qdisc->ops->drop)
 542             len = q->qdisc->ops->drop(q->qdisc);
 543         if (len)
 544                 sch->qstats.drops++;
 545
 546         return len;
 547 }
 548
 549 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 550 {
 551         struct netem_sched_data *q = qdisc_priv(sch);
 552         struct sk_buff *skb;
 553         struct rb_node *p;
 554
 555         if (qdisc_is_throttled(sch))
 556                 return NULL;
 557
 558 tfifo_dequeue:
 559         skb = __skb_dequeue(&sch->q);
 560         if (skb) {
 561 deliver:
 562                 sch->qstats.backlog -= qdisc_pkt_len(skb);
 563                 qdisc_unthrottled(sch);
 564                 qdisc_bstats_update(sch, skb);
 565                 return skb;
 566         }
 567         p = rb_first(&q->t_root);
 568         if (p) {
 569                 psched_time_t time_to_send;
 570
 571                 skb = netem_rb_to_skb(p);
 572
 573                 /* if more time remaining? */
 574                 time_to_send = netem_skb_cb(skb)->time_to_send;
 575                 if (time_to_send <= psched_get_time()) {
 576                         rb_erase(p, &q->t_root);
 577
 578                         sch->q.qlen--;
 579                         skb->next = NULL;
 580                         skb->prev = NULL;
 581                         skb->tstamp = netem_skb_cb(skb)->tstamp_save;
 582
 583 #ifdef CONFIG_NET_CLS_ACT
 584                         /*
 585                          * If it's at ingress let's pretend the delay is
 586                          * from the network (tstamp will be updated).
 587                          */
 588                         if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
 589                                 skb->tstamp.tv64 = 0;
 590 #endif
 591
 592                         if (q->qdisc) {
 593                                 int err = qdisc_enqueue(skb, q->qdisc);
 594
 595                                 if (unlikely(err != NET_XMIT_SUCCESS)) {
 596                                         if (net_xmit_drop_count(err)) {
 597                                                 sch->qstats.drops++;
 598                                                 qdisc_tree_decrease_qlen(sch, 1);
 599                                         }
 600                                 }
 601                                 goto tfifo_dequeue;
 602                         }
 603                         goto deliver;
 604                 }
 605
 606                 if (q->qdisc) {
 607                         skb = q->qdisc->ops->dequeue(q->qdisc);
 608                         if (skb)
 609                                 goto deliver;
 610                 }
 611                 qdisc_watchdog_schedule(&q->watchdog, time_to_send);
 612         }
 613
 614         if (q->qdisc) {
 615                 skb = q->qdisc->ops->dequeue(q->qdisc);
 616                 if (skb)
 617                         goto deliver;
 618         }
 619         return NULL;
 620 }
 621
 622 static void netem_reset(struct Qdisc *sch)
 623 {
 624         struct netem_sched_data *q = qdisc_priv(sch);
 625
 626         qdisc_reset_queue(sch);
 627         tfifo_reset(sch);
 628         if (q->qdisc)
 629                 qdisc_reset(q->qdisc);
 630         qdisc_watchdog_cancel(&q->watchdog);
 631 }
 632
 633 static void dist_free(struct disttable *d)
 634 {
 635         if (d) {
 636                 if (is_vmalloc_addr(d))
 637                         vfree(d);
 638                 else
 639                         kfree(d);
 640         }
 641 }
 642
 643 /*
 644  * Distribution data is a variable size payload containing
 645  * signed 16 bit values.
 646  */
 647 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 648 {
 649         struct netem_sched_data *q = qdisc_priv(sch);
 650         size_t n = nla_len(attr)/sizeof(__s16);
 651         const __s16 *data = nla_data(attr);
 652         spinlock_t *root_lock;
 653         struct disttable *d;
 654         int i;
 655         size_t s;
 656
 657         if (n > NETEM_DIST_MAX)
 658                 return -EINVAL;
 659
 660         s = sizeof(struct disttable) + n * sizeof(s16);
 661         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 662         if (!d)
 663                 d = vmalloc(s);
 664         if (!d)
 665                 return -ENOMEM;
 666
 667         d->size = n;
 668         for (i = 0; i < n; i++)
 669                 d->table[i] = data[i];
 670
 671         root_lock = qdisc_root_sleeping_lock(sch);
 672
 673         spin_lock_bh(root_lock);
 674         swap(q->delay_dist, d);
 675         spin_unlock_bh(root_lock);
 676
 677         dist_free(d);
 678         return 0;
 679 }
 680
 681 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 682 {
 683         struct netem_sched_data *q = qdisc_priv(sch);
 684         const struct tc_netem_corr *c = nla_data(attr);
 685
 686         init_crandom(&q->delay_cor, c->delay_corr);
 687         init_crandom(&q->loss_cor, c->loss_corr);
 688         init_crandom(&q->dup_cor, c->dup_corr);
 689 }
 690
 691 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 692 {
 693         struct netem_sched_data *q = qdisc_priv(sch);
 694         const struct tc_netem_reorder *r = nla_data(attr);
 695
 696         q->reorder = r->probability;
 697         init_crandom(&q->reorder_cor, r->correlation);
 698 }
 699
 700 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 701 {
 702         struct netem_sched_data *q = qdisc_priv(sch);
 703         const struct tc_netem_corrupt *r = nla_data(attr);
 704
 705         q->corrupt = r->probability;
 706         init_crandom(&q->corrupt_cor, r->correlation);
 707 }
 708
 709 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 710 {
 711         struct netem_sched_data *q = qdisc_priv(sch);
 712         const struct tc_netem_rate *r = nla_data(attr);
 713
 714         q->rate = r->rate;
 715         q->packet_overhead = r->packet_overhead;
 716         q->cell_size = r->cell_size;
 717         if (q->cell_size)
 718                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 719         q->cell_overhead = r->cell_overhead;
 720 }
 721
 722 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 723 {
 724         struct netem_sched_data *q = qdisc_priv(sch);
 725         const struct nlattr *la;
 726         int rem;
 727
 728         nla_for_each_nested(la, attr, rem) {
 729                 u16 type = nla_type(la);
 730
 731                 switch(type) {
 732                 case NETEM_LOSS_GI: {
 733                         const struct tc_netem_gimodel *gi = nla_data(la);
 734
 735                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 736                                 pr_info("netem: incorrect gi model size\n");
 737                                 return -EINVAL;
 738                         }
 739
 740                         q->loss_model = CLG_4_STATES;
 741
 742                         q->clg.state = 1;
 743                         q->clg.a1 = gi->p13;
 744                         q->clg.a2 = gi->p31;
 745                         q->clg.a3 = gi->p32;
 746                         q->clg.a4 = gi->p14;
 747                         q->clg.a5 = gi->p23;
 748                         break;
 749                 }
 750
 751                 case NETEM_LOSS_GE: {
 752                         const struct tc_netem_gemodel *ge = nla_data(la);
 753
 754                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 755                                 pr_info("netem: incorrect ge model size\n");
 756                                 return -EINVAL;
 757                         }
 758
 759                         q->loss_model = CLG_GILB_ELL;
 760                         q->clg.state = 1;
 761                         q->clg.a1 = ge->p;
 762                         q->clg.a2 = ge->r;
 763                         q->clg.a3 = ge->h;
 764                         q->clg.a4 = ge->k1;
 765                         break;
 766                 }
 767
 768                 default:
 769                         pr_info("netem: unknown loss type %u\n", type);
 770                         return -EINVAL;
 771                 }
 772         }
 773
 774         return 0;
 775 }
 776
 777 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 778         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 779         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 780         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 781         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 782         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 783         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 784 };
 785
 786 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 787                       const struct nla_policy *policy, int len)
 788 {
 789         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 790
 791         if (nested_len < 0) {
 792                 pr_info("netem: invalid attributes len %d\n", nested_len);
 793                 return -EINVAL;
 794         }
 795
 796         if (nested_len >= nla_attr_size(0))
 797                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 798                                  nested_len, policy);
 799
 800         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 801         return 0;
 802 }
 803
 804 /* Parse netlink message to set options */
 805 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 806 {
 807         struct netem_sched_data *q = qdisc_priv(sch);
 808         struct nlattr *tb[TCA_NETEM_MAX + 1];
 809         struct tc_netem_qopt *qopt;
 810         int ret;
 811
 812         if (opt == NULL)
 813                 return -EINVAL;
 814
 815         qopt = nla_data(opt);
 816         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 817         if (ret < 0)
 818                 return ret;
 819
 820         sch->limit = qopt->limit;
 821
 822         q->latency = qopt->latency;
 823         q->jitter = qopt->jitter;
 824         q->limit = qopt->limit;
 825         q->gap = qopt->gap;
 826         q->counter = 0;
 827         q->loss = qopt->loss;
 828         q->duplicate = qopt->duplicate;
 829
 830         /* for compatibility with earlier versions.
 831          * if gap is set, need to assume 100% probability
 832          */
 833         if (q->gap)
 834                 q->reorder = ~0;
 835
 836         if (tb[TCA_NETEM_CORR])
 837                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 838
 839         if (tb[TCA_NETEM_DELAY_DIST]) {
 840                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 841                 if (ret)
 842                         return ret;
 843         }
 844
 845         if (tb[TCA_NETEM_REORDER])
 846                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 847
 848         if (tb[TCA_NETEM_CORRUPT])
 849                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 850
 851         if (tb[TCA_NETEM_RATE])
 852                 get_rate(sch, tb[TCA_NETEM_RATE]);
 853
 854         if (tb[TCA_NETEM_ECN])
 855                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 856
 857         q->loss_model = CLG_RANDOM;
 858         if (tb[TCA_NETEM_LOSS])
 859                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 860
 861         return ret;
 862 }
 863
 864 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 865 {
 866         struct netem_sched_data *q = qdisc_priv(sch);
 867         int ret;
 868
 869         if (!opt)
 870                 return -EINVAL;
 871
 872         qdisc_watchdog_init(&q->watchdog, sch);
 873
 874         q->loss_model = CLG_RANDOM;
 875         ret = netem_change(sch, opt);
 876         if (ret)
 877                 pr_info("netem: change failed\n");
 878         return ret;
 879 }
 880
 881 static void netem_destroy(struct Qdisc *sch)
 882 {
 883         struct netem_sched_data *q = qdisc_priv(sch);
 884
 885         qdisc_watchdog_cancel(&q->watchdog);
 886         if (q->qdisc)
 887                 qdisc_destroy(q->qdisc);
 888         dist_free(q->delay_dist);
 889 }
 890
 891 static int dump_loss_model(const struct netem_sched_data *q,
 892                            struct sk_buff *skb)
 893 {
 894         struct nlattr *nest;
 895
 896         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 897         if (nest == NULL)
 898                 goto nla_put_failure;
 899
 900         switch (q->loss_model) {
 901         case CLG_RANDOM:
 902                 /* legacy loss model */
 903                 nla_nest_cancel(skb, nest);
 904                 return 0;       /* no data */
 905
 906         case CLG_4_STATES: {
 907                 struct tc_netem_gimodel gi = {
 908                         .p13 = q->clg.a1,
 909                         .p31 = q->clg.a2,
 910                         .p32 = q->clg.a3,
 911                         .p14 = q->clg.a4,
 912                         .p23 = q->clg.a5,
 913                 };
 914
 915                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
 916                         goto nla_put_failure;
 917                 break;
 918         }
 919         case CLG_GILB_ELL: {
 920                 struct tc_netem_gemodel ge = {
 921                         .p = q->clg.a1,
 922                         .r = q->clg.a2,
 923                         .h = q->clg.a3,
 924                         .k1 = q->clg.a4,
 925                 };
 926
 927                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
 928                         goto nla_put_failure;
 929                 break;
 930         }
 931         }
 932
 933         nla_nest_end(skb, nest);
 934         return 0;
 935
 936 nla_put_failure:
 937         nla_nest_cancel(skb, nest);
 938         return -1;
 939 }
 940
 941 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 942 {
 943         const struct netem_sched_data *q = qdisc_priv(sch);
 944         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 945         struct tc_netem_qopt qopt;
 946         struct tc_netem_corr cor;
 947         struct tc_netem_reorder reorder;
 948         struct tc_netem_corrupt corrupt;
 949         struct tc_netem_rate rate;
 950
 951         qopt.latency = q->latency;
 952         qopt.jitter = q->jitter;
 953         qopt.limit = q->limit;
 954         qopt.loss = q->loss;
 955         qopt.gap = q->gap;
 956         qopt.duplicate = q->duplicate;
 957         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 958                 goto nla_put_failure;
 959
 960         cor.delay_corr = q->delay_cor.rho;
 961         cor.loss_corr = q->loss_cor.rho;
 962         cor.dup_corr = q->dup_cor.rho;
 963         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
 964                 goto nla_put_failure;
 965
 966         reorder.probability = q->reorder;
 967         reorder.correlation = q->reorder_cor.rho;
 968         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
 969                 goto nla_put_failure;
 970
 971         corrupt.probability = q->corrupt;
 972         corrupt.correlation = q->corrupt_cor.rho;
 973         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
 974                 goto nla_put_failure;
 975
 976         rate.rate = q->rate;
 977         rate.packet_overhead = q->packet_overhead;
 978         rate.cell_size = q->cell_size;
 979         rate.cell_overhead = q->cell_overhead;
 980         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
 981                 goto nla_put_failure;
 982
 983         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
 984                 goto nla_put_failure;
 985
 986         if (dump_loss_model(q, skb) != 0)
 987                 goto nla_put_failure;
 988
 989         return nla_nest_end(skb, nla);
 990
 991 nla_put_failure:
 992         nlmsg_trim(skb, nla);
 993         return -1;
 994 }
 995
 996 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 997                           struct sk_buff *skb, struct tcmsg *tcm)
 998 {
 999         struct netem_sched_data *q = qdisc_priv(sch);
1000
1001         if (cl != 1 || !q->qdisc)       /* only one class */
1002                 return -ENOENT;
1003
1004         tcm->tcm_handle |= TC_H_MIN(1);
1005         tcm->tcm_info = q->qdisc->handle;
1006
1007         return 0;
1008 }
1009
1010 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1011                      struct Qdisc **old)
1012 {
1013         struct netem_sched_data *q = qdisc_priv(sch);
1014
1015         sch_tree_lock(sch);
1016         *old = q->qdisc;
1017         q->qdisc = new;
1018         if (*old) {
1019                 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1020                 qdisc_reset(*old);
1021         }
1022         sch_tree_unlock(sch);
1023
1024         return 0;
1025 }
1026
1027 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1028 {
1029         struct netem_sched_data *q = qdisc_priv(sch);
1030         return q->qdisc;
1031 }
1032
/*
 * Class lookup: returns 1 for every @classid, matching the single class
 * id that netem_dump_class() and netem_walk() use.
 */
static unsigned long netem_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}
1037
/* Counterpart of netem_get(); intentionally does nothing. */
static void netem_put(struct Qdisc *sch, unsigned long arg)
{
}
1041
1042 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1043 {
1044         if (!walker->stop) {
1045                 if (walker->count >= walker->skip)
1046                         if (walker->fn(sch, 1, walker) < 0) {
1047                                 walker->stop = 1;
1048                                 return;
1049                         }
1050                 walker->count++;
1051         }
1052 }
1053
/*
 * Class operations for netem; referenced from netem_qdisc_ops.cl_ops.
 * All handlers deal with the one class id (1) that netem exposes.
 */
static const struct Qdisc_class_ops netem_class_ops = {
	.graft		=	netem_graft,
	.leaf		=	netem_leaf,
	.get		=	netem_get,
	.put		=	netem_put,
	.walk		=	netem_walk,
	.dump		=	netem_dump_class,
};
1062
/*
 * Qdisc operations table for the "netem" discipline. It is registered by
 * netem_module_init() and unregistered by netem_module_exit().
 * priv_size tells the core how much per-qdisc private data
 * (struct netem_sched_data) to provide behind qdisc_priv().
 */
static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
	.id		=	"netem",
	.cl_ops		=	&netem_class_ops,
	.priv_size	=	sizeof(struct netem_sched_data),
	.enqueue	=	netem_enqueue,
	.dequeue	=	netem_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.drop		=	netem_drop,
	.init		=	netem_init,
	.reset		=	netem_reset,
	.destroy	=	netem_destroy,
	.change		=	netem_change,
	.dump		=	netem_dump,
	.owner		=	THIS_MODULE,
};
1078
1079
1080 static int __init netem_module_init(void)
1081 {
1082         pr_info("netem: version " VERSION "\n");
1083         return register_qdisc(&netem_qdisc_ops);
1084 }
/* Module exit point: unregister netem_qdisc_ops again. */
static void __exit netem_module_exit(void)
{
	unregister_qdisc(&netem_qdisc_ops);
}
/* Hook up the module entry/exit functions and declare the licence. */
module_init(netem_module_init)
module_exit(netem_module_exit)
MODULE_LICENSE("GPL");