/*
 * net/sched/sch_netem.c	Network emulator
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *		Many of the algorithms and ideas for this came from
 *		NIST Net which is not copyrighted.
 *
 * Authors:	Stephen Hemminger <shemminger@osdl.org>
 *		Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
 */

#include <linux/config.h>
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>

#include <net/pkt_sched.h>
/* Network Emulation Queuing algorithm.
   ====================================

   Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
		Network Emulation Tool"
	    [2] Luigi Rizzo, DummyNet for FreeBSD

   ----------------------------------------------------------------

   This started out as a simple way to delay outgoing packets to
   test TCP but has grown to include most of the functionality
   of a full-blown network emulator like NISTnet.  It can delay
   packets and add random jitter (and correlation).  The random
   distribution can also be loaded from a table, to provide
   normal, Pareto, or experimental curves.  Packet loss,
   duplication, and reordering can also be emulated.

   This qdisc does not do classification; that can be handled by
   layering other disciplines.  It does not need to do bandwidth
   control either, since that can be handled by using token
   bucket or other rate control.

   The emulator is limited by the Linux timer resolution
   and will create packet bursts on the HZ boundary (1ms).
*/
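
/* As an illustrative sketch (not part of the original sources): with the
 * iproute2 "tc" tool, this qdisc is typically configured like
 *
 *	tc qdisc add dev eth0 root netem delay 100ms 10ms 25%
 *
 * i.e. add 100ms of latency with +/- 10ms of jitter, each delay value
 * being 25% correlated with the previous one.  The values here are
 * examples, not defaults.
 */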

struct netem_sched_data {
	struct Qdisc	*qdisc;
	struct sk_buff_head delayed;
	struct timer_list timer;

	u32 latency;
	u32 loss;
	u32 limit;
	u32 counter;
	u32 gap;
	u32 jitter;
	u32 duplicate;

	struct crndstate {
		unsigned long last;
		unsigned long rho;
	} delay_cor, loss_cor, dup_cor;

	struct disttable {
		u32  size;
		s16 table[0];
	} *delay_dist;
};

/* Time stamp put into socket buffer control block */
struct netem_skb_cb {
	psched_time_t	time_to_send;
};

/* init_crandom - initialize correlated random number generator
 * Use entropy source for initial seed.
 */
static void init_crandom(struct crndstate *state, unsigned long rho)
{
	state->rho = rho;
	state->last = net_random();
}

/* get_crandom - correlated random number generator
 * Next number depends on last value.
 * rho is scaled to avoid floating point.
 */
static unsigned long get_crandom(struct crndstate *state)
{
	u64 value, rho;
	unsigned long answer;

	if (state->rho == 0)	/* no correlation */
		return net_random();

	value = net_random();
	rho = (u64)state->rho + 1;
	answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
	state->last = answer;
	return answer;
}
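
/* Worked example (explanatory comment, not additional kernel code): the
 * expression above is a fixed-point weighted average,
 *
 *	answer ~= (1 - p) * random + p * last,  where p = rho / 2^32
 *
 * so rho == 0x80000000 (p ~= 0.5) blends the fresh random value and the
 * previous output in roughly equal parts, while rho == 0 degenerates to
 * plain net_random() via the early return.
 */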

/* tabledist - return a pseudo-randomly distributed value with mean mu and
 * std deviation sigma.  Uses table lookup to approximate the desired
 * distribution, and a uniformly-distributed pseudo-random source.
 */
static long tabledist(unsigned long mu, long sigma,
		      struct crndstate *state, const struct disttable *dist)
{
	long t, x;
	unsigned long rnd;

	if (sigma == 0)
		return mu;

	rnd = get_crandom(state);

	/* default uniform distribution */
	if (dist == NULL)
		return (rnd % (2*sigma)) - sigma + mu;

	t = dist->table[rnd % dist->size];
	x = (sigma % NETEM_DIST_SCALE) * t;
	if (x >= 0)
		x += NETEM_DIST_SCALE/2;
	else
		x -= NETEM_DIST_SCALE/2;

	return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
}
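
/* Worked example (assuming NETEM_DIST_SCALE is 8192, as defined alongside
 * the netem interface of this era): the table holds samples of a zero-mean,
 * unit-deviation distribution scaled by NETEM_DIST_SCALE, so the return
 * value above is, in effect,
 *
 *	mu + (sigma * t) / NETEM_DIST_SCALE
 *
 * computed as separate quotient and remainder terms so the multiply cannot
 * overflow a long, with the remainder term x rounded to the nearest integer.
 */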

/* Put skb in the private delayed queue. */
static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	psched_tdiff_t td;
	psched_time_t now;

	PSCHED_GET_TIME(now);
	td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);

	/* Always queue at tail to keep packets in order */
	if (likely(q->delayed.qlen < q->limit)) {
		struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;

		PSCHED_TADD2(now, td, cb->time_to_send);

		pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb,
			 now, cb->time_to_send);

		__skb_queue_tail(&q->delayed, skb);
		return NET_XMIT_SUCCESS;
	}

	pr_debug("netem_delay: queue over limit %d\n", q->limit);
	sch->qstats.overlimits++;
	kfree_skb(skb);
	return NET_XMIT_DROP;
}
168
Stephen Hemminger771018e2005-05-03 16:24:32 -0700169/*
170 * Move a packet that is ready to send from the delay holding
171 * list to the underlying qdisc.
172 */
173static int netem_run(struct Qdisc *sch)
174{
175 struct netem_sched_data *q = qdisc_priv(sch);
176 struct sk_buff *skb;
177 psched_time_t now;
178
179 PSCHED_GET_TIME(now);
180
181 skb = skb_peek(&q->delayed);
182 if (skb) {
183 const struct netem_skb_cb *cb
184 = (const struct netem_skb_cb *)skb->cb;
185 long delay
186 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
187 pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
188
189 /* if more time remaining? */
190 if (delay > 0) {
191 mod_timer(&q->timer, jiffies + delay);
192 return 1;
193 }
194
195 __skb_unlink(skb, &q->delayed);
196
197 if (q->qdisc->enqueue(skb, q->qdisc)) {
198 sch->q.qlen--;
199 sch->qstats.drops++;
200 }
201 }
202
203 return 0;
204}

/*
 * Insert one skb into qdisc.
 * Note: parent depends on return value to account for queue length.
 *	NET_XMIT_DROP: queue length didn't change.
 *	NET_XMIT_SUCCESS: one skb was queued.
 */
static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb2;
	int ret;
	int count = 1;

	pr_debug("netem_enqueue skb=%p\n", skb);

	/* Random duplication */
	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
		++count;

	/* Random packet drop 0 => none, ~0 => all */
	if (q->loss && q->loss >= get_crandom(&q->loss_cor))
		--count;

	if (count == 0) {
		sch->qstats.drops++;
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}

	/*
	 * If we need to duplicate packet, then re-insert at top of the
	 * qdisc tree, since parent queuer expects that only one
	 * skb will be queued.
	 */
	if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
		struct Qdisc *rootq = sch->dev->qdisc;
		u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
		q->duplicate = 0;

		rootq->enqueue(skb2, rootq);
		q->duplicate = dupsave;
	}

	/* If doing simple delay, then gap == 0 and all packets go into
	 * the delayed holding queue.  Otherwise, when emulating
	 * out-of-order delivery, only "1 out of gap" packets is delayed;
	 * the rest bypass the delay and so overtake it.
	 */
	if (q->counter < q->gap) {
		++q->counter;
		ret = q->qdisc->enqueue(skb, q->qdisc);
	} else {
		q->counter = 0;
		ret = netem_delay(sch, skb);
		netem_run(sch);
	}

	if (likely(ret == NET_XMIT_SUCCESS)) {
		sch->q.qlen++;
		sch->bstats.bytes += skb->len;
		sch->bstats.packets++;
	} else
		sch->qstats.drops++;

	pr_debug("netem: enqueue ret %d\n", ret);
	return ret;
}

/* Requeue packets but don't change time stamp */
static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int netem_drop(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned int len;

	if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

static struct sk_buff *netem_dequeue(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	int pending;

	pending = netem_run(sch);

	skb = q->qdisc->dequeue(q->qdisc);
	if (skb) {
		pr_debug("netem_dequeue: return skb=%p\n", skb);
		sch->q.qlen--;
		sch->flags &= ~TCQ_F_THROTTLED;
	} else if (pending) {
		pr_debug("netem_dequeue: throttling\n");
		sch->flags |= TCQ_F_THROTTLED;
	}

	return skb;
}
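
/* Explanatory note: TCQ_F_THROTTLED is set when delayed packets exist but
 * none is due yet, marking the qdisc as deliberately idle so the scheduler
 * core leaves it alone; netem_watchdog() below clears the flag when the
 * timer fires and asks for the device to be run again.
 */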

static void netem_watchdog(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;

	pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
	sch->flags &= ~TCQ_F_THROTTLED;
	netif_schedule(sch->dev);
}

static void netem_reset(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	skb_queue_purge(&q->delayed);

	sch->q.qlen = 0;
	sch->flags &= ~TCQ_F_THROTTLED;
	del_timer_sync(&q->timer);
}

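/* Reconfigure the inner fifo's limit by handing it a locally built
 * netlink attribute, just as if the option had arrived from user space.
 */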
static int set_fifo_limit(struct Qdisc *q, int limit)
{
	struct rtattr *rta;
	int ret = -ENOMEM;

	rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
	if (rta) {
		rta->rta_type = RTM_NEWQDISC;
		rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
		((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

		ret = q->ops->change(q, rta);
		kfree(rta);
	}
	return ret;
}

/*
 * Distribution data is a variable size payload containing
 * signed 16 bit values.
 */
static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
	const __s16 *data = RTA_DATA(attr);
	struct disttable *d;
	int i;

	if (n > 65536)
		return -EINVAL;

	d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	d->size = n;
	for (i = 0; i < n; i++)
		d->table[i] = data[i];

	spin_lock_bh(&sch->dev->queue_lock);
	d = xchg(&q->delay_dist, d);
	spin_unlock_bh(&sch->dev->queue_lock);

	kfree(d);
	return 0;
}
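
/* Background note: the table is supplied by user space, typically from one
 * of the pre-generated distribution files (normal, pareto, ...) shipped
 * with iproute2; each entry is a 16-bit sample scaled by NETEM_DIST_SCALE,
 * as consumed by tabledist() above.
 */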

static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	const struct tc_netem_corr *c = RTA_DATA(attr);

	if (RTA_PAYLOAD(attr) != sizeof(*c))
		return -EINVAL;

	init_crandom(&q->delay_cor, c->delay_corr);
	init_crandom(&q->loss_cor, c->loss_corr);
	init_crandom(&q->dup_cor, c->dup_corr);
	return 0;
}

static int netem_change(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	struct tc_netem_qopt *qopt;
	int ret;

	if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
		return -EINVAL;

	qopt = RTA_DATA(opt);
	ret = set_fifo_limit(q->qdisc, qopt->limit);
	if (ret) {
		pr_debug("netem: can't set fifo limit\n");
		return ret;
	}

	q->latency = qopt->latency;
	q->jitter = qopt->jitter;
	q->limit = qopt->limit;
	q->gap = qopt->gap;
	q->loss = qopt->loss;
	q->duplicate = qopt->duplicate;

	/* Handle nested options after initial queue options.
	 * Should have put all options in nested format but too late now.
	 */
	if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
		struct rtattr *tb[TCA_NETEM_MAX];
		if (rtattr_parse(tb, TCA_NETEM_MAX,
				 RTA_DATA(opt) + sizeof(*qopt),
				 RTA_PAYLOAD(opt) - sizeof(*qopt)))
			return -EINVAL;

		if (tb[TCA_NETEM_CORR-1]) {
			ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
			if (ret)
				return ret;
		}

		if (tb[TCA_NETEM_DELAY_DIST-1]) {
			ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
			if (ret)
				return ret;
		}
	}

	return 0;
}

static int netem_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct netem_sched_data *q = qdisc_priv(sch);
	int ret;

	if (!opt)
		return -EINVAL;

	skb_queue_head_init(&q->delayed);
	init_timer(&q->timer);
	q->timer.function = netem_watchdog;
	q->timer.data = (unsigned long) sch;
	q->counter = 0;

	q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
	if (!q->qdisc) {
		pr_debug("netem: qdisc create failed\n");
		return -ENOMEM;
	}

	ret = netem_change(sch, opt);
	if (ret) {
		pr_debug("netem: change failed\n");
		qdisc_destroy(q->qdisc);
	}
	return ret;
}

static void netem_destroy(struct Qdisc *sch)
{
	struct netem_sched_data *q = qdisc_priv(sch);

	del_timer_sync(&q->timer);
	qdisc_destroy(q->qdisc);
	kfree(q->delay_dist);
}

static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	const struct netem_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb->tail;
	struct rtattr *rta = (struct rtattr *) b;
	struct tc_netem_qopt qopt;
	struct tc_netem_corr cor;

	qopt.latency = q->latency;
	qopt.jitter = q->jitter;
	qopt.limit = q->limit;
	qopt.loss = q->loss;
	qopt.gap = q->gap;
	qopt.duplicate = q->duplicate;
	RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);

	cor.delay_corr = q->delay_cor.rho;
	cor.loss_corr = q->loss_cor.rho;
	cor.dup_corr = q->dup_cor.rho;
	RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
	rta->rta_len = skb->tail - b;

	return skb->len;

rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
521static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
522 struct sk_buff *skb, struct tcmsg *tcm)
523{
524 struct netem_sched_data *q = qdisc_priv(sch);
525
526 if (cl != 1) /* only one class */
527 return -ENOENT;
528
529 tcm->tcm_handle |= TC_H_MIN(1);
530 tcm->tcm_info = q->qdisc->handle;
531
532 return 0;
533}
534
535static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
536 struct Qdisc **old)
537{
538 struct netem_sched_data *q = qdisc_priv(sch);
539
540 if (new == NULL)
541 new = &noop_qdisc;
542
543 sch_tree_lock(sch);
544 *old = xchg(&q->qdisc, new);
545 qdisc_reset(*old);
546 sch->q.qlen = 0;
547 sch_tree_unlock(sch);
548
549 return 0;
550}
551
552static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
553{
554 struct netem_sched_data *q = qdisc_priv(sch);
555 return q->qdisc;
556}
557
558static unsigned long netem_get(struct Qdisc *sch, u32 classid)
559{
560 return 1;
561}
562
563static void netem_put(struct Qdisc *sch, unsigned long arg)
564{
565}
566
567static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
568 struct rtattr **tca, unsigned long *arg)
569{
570 return -ENOSYS;
571}
572
573static int netem_delete(struct Qdisc *sch, unsigned long arg)
574{
575 return -ENOSYS;
576}
577
578static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
579{
580 if (!walker->stop) {
581 if (walker->count >= walker->skip)
582 if (walker->fn(sch, 1, walker) < 0) {
583 walker->stop = 1;
584 return;
585 }
586 walker->count++;
587 }
588}
589
590static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
591{
592 return NULL;
593}
594
595static struct Qdisc_class_ops netem_class_ops = {
596 .graft = netem_graft,
597 .leaf = netem_leaf,
598 .get = netem_get,
599 .put = netem_put,
600 .change = netem_change_class,
601 .delete = netem_delete,
602 .walk = netem_walk,
603 .tcf_chain = netem_find_tcf,
604 .dump = netem_dump_class,
605};
606
607static struct Qdisc_ops netem_qdisc_ops = {
608 .id = "netem",
609 .cl_ops = &netem_class_ops,
610 .priv_size = sizeof(struct netem_sched_data),
611 .enqueue = netem_enqueue,
612 .dequeue = netem_dequeue,
613 .requeue = netem_requeue,
614 .drop = netem_drop,
615 .init = netem_init,
616 .reset = netem_reset,
617 .destroy = netem_destroy,
618 .change = netem_change,
619 .dump = netem_dump,
620 .owner = THIS_MODULE,
621};
622
623
624static int __init netem_module_init(void)
625{
626 return register_qdisc(&netem_qdisc_ops);
627}
628static void __exit netem_module_exit(void)
629{
630 unregister_qdisc(&netem_qdisc_ops);
631}
632module_init(netem_module_init)
633module_exit(netem_module_exit)
634MODULE_LICENSE("GPL");