blob: e0c9fbe73b15cee32b44f4469e1ac5eeb9849267 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * net/sched/sch_netem.c Network emulator
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Many of the algorithms and ideas for this came from
10 * NIST Net which is not copyrighted.
11 *
12 * Authors: Stephen Hemminger <shemminger@osdl.org>
13 * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
14 */
15
16#include <linux/config.h>
17#include <linux/module.h>
18#include <linux/bitops.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/errno.h>
22#include <linux/netdevice.h>
23#include <linux/skbuff.h>
24#include <linux/rtnetlink.h>
25
26#include <net/pkt_sched.h>
27
28/* Network Emulation Queuing algorithm.
29 ====================================
30
31 Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
32 Network Emulation Tool
33 [2] Luigi Rizzo, DummyNet for FreeBSD
34
35 ----------------------------------------------------------------
36
37 This started out as a simple way to delay outgoing packets to
38 test TCP but has grown to include most of the functionality
39 of a full blown network emulator like NISTnet. It can delay
40 packets and add random jitter (and correlation). The random
41 distribution can be loaded from a table as well to provide
42 normal, Pareto, or experimental curves. Packet loss,
43 duplication, and reordering can also be emulated.
44
45 This qdisc does not do classification that can be handled in
46 layering other disciplines. It does not need to do bandwidth
47 control either since that can be handled by using token
48 bucket or other rate control.
49
50 The simulator is limited by the Linux timer resolution
51 and will create packet bursts on the HZ boundary (1ms).
52*/
53
54struct netem_sched_data {
55 struct Qdisc *qdisc;
56 struct sk_buff_head delayed;
57 struct timer_list timer;
58
59 u32 latency;
60 u32 loss;
61 u32 limit;
62 u32 counter;
63 u32 gap;
64 u32 jitter;
65 u32 duplicate;
66
67 struct crndstate {
68 unsigned long last;
69 unsigned long rho;
70 } delay_cor, loss_cor, dup_cor;
71
72 struct disttable {
73 u32 size;
74 s16 table[0];
75 } *delay_dist;
76};
77
78/* Time stamp put into socket buffer control block */
79struct netem_skb_cb {
80 psched_time_t time_to_send;
81};
82
83/* init_crandom - initialize correlated random number generator
84 * Use entropy source for initial seed.
85 */
86static void init_crandom(struct crndstate *state, unsigned long rho)
87{
88 state->rho = rho;
89 state->last = net_random();
90}
91
92/* get_crandom - correlated random number generator
93 * Next number depends on last value.
94 * rho is scaled to avoid floating point.
95 */
96static unsigned long get_crandom(struct crndstate *state)
97{
98 u64 value, rho;
99 unsigned long answer;
100
101 if (state->rho == 0) /* no correllation */
102 return net_random();
103
104 value = net_random();
105 rho = (u64)state->rho + 1;
106 answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
107 state->last = answer;
108 return answer;
109}
110
111/* tabledist - return a pseudo-randomly distributed value with mean mu and
112 * std deviation sigma. Uses table lookup to approximate the desired
113 * distribution, and a uniformly-distributed pseudo-random source.
114 */
115static long tabledist(unsigned long mu, long sigma,
116 struct crndstate *state, const struct disttable *dist)
117{
118 long t, x;
119 unsigned long rnd;
120
121 if (sigma == 0)
122 return mu;
123
124 rnd = get_crandom(state);
125
126 /* default uniform distribution */
127 if (dist == NULL)
128 return (rnd % (2*sigma)) - sigma + mu;
129
130 t = dist->table[rnd % dist->size];
131 x = (sigma % NETEM_DIST_SCALE) * t;
132 if (x >= 0)
133 x += NETEM_DIST_SCALE/2;
134 else
135 x -= NETEM_DIST_SCALE/2;
136
137 return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
138}
139
140/* Put skb in the private delayed queue. */
Stephen Hemminger771018e2005-05-03 16:24:32 -0700141static int netem_delay(struct Qdisc *sch, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142{
143 struct netem_sched_data *q = qdisc_priv(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700144 psched_tdiff_t td;
145 psched_time_t now;
146
147 PSCHED_GET_TIME(now);
148 td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700149
150 /* Always queue at tail to keep packets in order */
151 if (likely(q->delayed.qlen < q->limit)) {
Stephen Hemminger771018e2005-05-03 16:24:32 -0700152 struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb;
153
154 PSCHED_TADD2(now, td, cb->time_to_send);
155
156 pr_debug("netem_delay: skb=%p now=%llu tosend=%llu\n", skb,
157 now, cb->time_to_send);
158
Linus Torvalds1da177e2005-04-16 15:20:36 -0700159 __skb_queue_tail(&q->delayed, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700160 return NET_XMIT_SUCCESS;
161 }
162
Stephen Hemminger771018e2005-05-03 16:24:32 -0700163 pr_debug("netem_delay: queue over limit %d\n", q->limit);
164 sch->qstats.overlimits++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165 kfree_skb(skb);
166 return NET_XMIT_DROP;
167}
168
Stephen Hemminger771018e2005-05-03 16:24:32 -0700169/*
170 * Move a packet that is ready to send from the delay holding
171 * list to the underlying qdisc.
172 */
173static int netem_run(struct Qdisc *sch)
174{
175 struct netem_sched_data *q = qdisc_priv(sch);
176 struct sk_buff *skb;
177 psched_time_t now;
178
179 PSCHED_GET_TIME(now);
180
181 skb = skb_peek(&q->delayed);
182 if (skb) {
183 const struct netem_skb_cb *cb
184 = (const struct netem_skb_cb *)skb->cb;
185 long delay
186 = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now));
187 pr_debug("netem_run: skb=%p delay=%ld\n", skb, delay);
188
189 /* if more time remaining? */
190 if (delay > 0) {
191 mod_timer(&q->timer, jiffies + delay);
192 return 1;
193 }
194
195 __skb_unlink(skb, &q->delayed);
196
197 if (q->qdisc->enqueue(skb, q->qdisc)) {
198 sch->q.qlen--;
199 sch->qstats.drops++;
200 }
201 }
202
203 return 0;
204}
205
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
207{
208 struct netem_sched_data *q = qdisc_priv(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700209 int ret;
210
Stephen Hemminger771018e2005-05-03 16:24:32 -0700211 pr_debug("netem_enqueue skb=%p\n", skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700212
213 /* Random packet drop 0 => none, ~0 => all */
214 if (q->loss && q->loss >= get_crandom(&q->loss_cor)) {
215 pr_debug("netem_enqueue: random loss\n");
216 sch->qstats.drops++;
217 kfree_skb(skb);
218 return 0; /* lie about loss so TCP doesn't know */
219 }
220
221 /* Random duplication */
Stephen Hemmingerd5d75cd2005-05-03 16:24:57 -0700222 if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor)) {
223 struct sk_buff *skb2;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700224
Stephen Hemmingerd5d75cd2005-05-03 16:24:57 -0700225 skb2 = skb_clone(skb, GFP_ATOMIC);
226 if (skb2 && netem_delay(sch, skb2) == NET_XMIT_SUCCESS) {
227 struct Qdisc *qp;
228
229 /* Since one packet can generate two packets in the
230 * queue, the parent's qlen accounting gets confused,
231 * so fix it.
232 */
233 qp = qdisc_lookup(sch->dev, TC_H_MAJ(sch->parent));
234 if (qp)
235 qp->q.qlen++;
236
Linus Torvalds1da177e2005-04-16 15:20:36 -0700237 sch->q.qlen++;
238 sch->bstats.bytes += skb2->len;
239 sch->bstats.packets++;
240 } else
241 sch->qstats.drops++;
242 }
243
244 /* If doing simple delay then gap == 0 so all packets
245 * go into the delayed holding queue
246 * otherwise if doing out of order only "1 out of gap"
247 * packets will be delayed.
248 */
249 if (q->counter < q->gap) {
250 ++q->counter;
251 ret = q->qdisc->enqueue(skb, q->qdisc);
252 } else {
253 q->counter = 0;
Stephen Hemminger771018e2005-05-03 16:24:32 -0700254 ret = netem_delay(sch, skb);
255 netem_run(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700256 }
257
258 if (likely(ret == NET_XMIT_SUCCESS)) {
259 sch->q.qlen++;
260 sch->bstats.bytes += skb->len;
261 sch->bstats.packets++;
262 } else
263 sch->qstats.drops++;
264
Stephen Hemmingerd5d75cd2005-05-03 16:24:57 -0700265 pr_debug("netem: enqueue ret %d\n", ret);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266 return ret;
267}
268
269/* Requeue packets but don't change time stamp */
270static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch)
271{
272 struct netem_sched_data *q = qdisc_priv(sch);
273 int ret;
274
275 if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
276 sch->q.qlen++;
277 sch->qstats.requeues++;
278 }
279
280 return ret;
281}
282
283static unsigned int netem_drop(struct Qdisc* sch)
284{
285 struct netem_sched_data *q = qdisc_priv(sch);
286 unsigned int len;
287
288 if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) {
289 sch->q.qlen--;
290 sch->qstats.drops++;
291 }
292 return len;
293}
294
Linus Torvalds1da177e2005-04-16 15:20:36 -0700295static struct sk_buff *netem_dequeue(struct Qdisc *sch)
296{
297 struct netem_sched_data *q = qdisc_priv(sch);
298 struct sk_buff *skb;
Stephen Hemminger771018e2005-05-03 16:24:32 -0700299 int pending;
300
301 pending = netem_run(sch);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700302
303 skb = q->qdisc->dequeue(q->qdisc);
Stephen Hemminger771018e2005-05-03 16:24:32 -0700304 if (skb) {
305 pr_debug("netem_dequeue: return skb=%p\n", skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700306 sch->q.qlen--;
Stephen Hemminger771018e2005-05-03 16:24:32 -0700307 sch->flags &= ~TCQ_F_THROTTLED;
308 }
309 else if (pending) {
310 pr_debug("netem_dequeue: throttling\n");
311 sch->flags |= TCQ_F_THROTTLED;
312 }
313
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314 return skb;
315}
316
317static void netem_watchdog(unsigned long arg)
318{
319 struct Qdisc *sch = (struct Qdisc *)arg;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700320
Stephen Hemminger771018e2005-05-03 16:24:32 -0700321 pr_debug("netem_watchdog qlen=%d\n", sch->q.qlen);
322 sch->flags &= ~TCQ_F_THROTTLED;
323 netif_schedule(sch->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700324}
325
326static void netem_reset(struct Qdisc *sch)
327{
328 struct netem_sched_data *q = qdisc_priv(sch);
329
330 qdisc_reset(q->qdisc);
331 skb_queue_purge(&q->delayed);
332
333 sch->q.qlen = 0;
Stephen Hemminger771018e2005-05-03 16:24:32 -0700334 sch->flags &= ~TCQ_F_THROTTLED;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700335 del_timer_sync(&q->timer);
336}
337
338static int set_fifo_limit(struct Qdisc *q, int limit)
339{
340 struct rtattr *rta;
341 int ret = -ENOMEM;
342
343 rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
344 if (rta) {
345 rta->rta_type = RTM_NEWQDISC;
346 rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
347 ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;
348
349 ret = q->ops->change(q, rta);
350 kfree(rta);
351 }
352 return ret;
353}
354
355/*
356 * Distribution data is a variable size payload containing
357 * signed 16 bit values.
358 */
359static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr)
360{
361 struct netem_sched_data *q = qdisc_priv(sch);
362 unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16);
363 const __s16 *data = RTA_DATA(attr);
364 struct disttable *d;
365 int i;
366
367 if (n > 65536)
368 return -EINVAL;
369
370 d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL);
371 if (!d)
372 return -ENOMEM;
373
374 d->size = n;
375 for (i = 0; i < n; i++)
376 d->table[i] = data[i];
377
378 spin_lock_bh(&sch->dev->queue_lock);
379 d = xchg(&q->delay_dist, d);
380 spin_unlock_bh(&sch->dev->queue_lock);
381
382 kfree(d);
383 return 0;
384}
385
386static int get_correlation(struct Qdisc *sch, const struct rtattr *attr)
387{
388 struct netem_sched_data *q = qdisc_priv(sch);
389 const struct tc_netem_corr *c = RTA_DATA(attr);
390
391 if (RTA_PAYLOAD(attr) != sizeof(*c))
392 return -EINVAL;
393
394 init_crandom(&q->delay_cor, c->delay_corr);
395 init_crandom(&q->loss_cor, c->loss_corr);
396 init_crandom(&q->dup_cor, c->dup_corr);
397 return 0;
398}
399
400static int netem_change(struct Qdisc *sch, struct rtattr *opt)
401{
402 struct netem_sched_data *q = qdisc_priv(sch);
403 struct tc_netem_qopt *qopt;
404 int ret;
405
406 if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt))
407 return -EINVAL;
408
409 qopt = RTA_DATA(opt);
410 ret = set_fifo_limit(q->qdisc, qopt->limit);
411 if (ret) {
412 pr_debug("netem: can't set fifo limit\n");
413 return ret;
414 }
415
416 q->latency = qopt->latency;
417 q->jitter = qopt->jitter;
418 q->limit = qopt->limit;
419 q->gap = qopt->gap;
420 q->loss = qopt->loss;
421 q->duplicate = qopt->duplicate;
422
423 /* Handle nested options after initial queue options.
424 * Should have put all options in nested format but too late now.
425 */
426 if (RTA_PAYLOAD(opt) > sizeof(*qopt)) {
427 struct rtattr *tb[TCA_NETEM_MAX];
428 if (rtattr_parse(tb, TCA_NETEM_MAX,
429 RTA_DATA(opt) + sizeof(*qopt),
430 RTA_PAYLOAD(opt) - sizeof(*qopt)))
431 return -EINVAL;
432
433 if (tb[TCA_NETEM_CORR-1]) {
434 ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]);
435 if (ret)
436 return ret;
437 }
438
439 if (tb[TCA_NETEM_DELAY_DIST-1]) {
440 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]);
441 if (ret)
442 return ret;
443 }
444 }
445
446
447 return 0;
448}
449
450static int netem_init(struct Qdisc *sch, struct rtattr *opt)
451{
452 struct netem_sched_data *q = qdisc_priv(sch);
453 int ret;
454
455 if (!opt)
456 return -EINVAL;
457
458 skb_queue_head_init(&q->delayed);
459 init_timer(&q->timer);
460 q->timer.function = netem_watchdog;
461 q->timer.data = (unsigned long) sch;
462 q->counter = 0;
463
464 q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops);
465 if (!q->qdisc) {
466 pr_debug("netem: qdisc create failed\n");
467 return -ENOMEM;
468 }
469
470 ret = netem_change(sch, opt);
471 if (ret) {
472 pr_debug("netem: change failed\n");
473 qdisc_destroy(q->qdisc);
474 }
475 return ret;
476}
477
478static void netem_destroy(struct Qdisc *sch)
479{
480 struct netem_sched_data *q = qdisc_priv(sch);
481
482 del_timer_sync(&q->timer);
483 qdisc_destroy(q->qdisc);
484 kfree(q->delay_dist);
485}
486
487static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
488{
489 const struct netem_sched_data *q = qdisc_priv(sch);
490 unsigned char *b = skb->tail;
491 struct rtattr *rta = (struct rtattr *) b;
492 struct tc_netem_qopt qopt;
493 struct tc_netem_corr cor;
494
495 qopt.latency = q->latency;
496 qopt.jitter = q->jitter;
497 qopt.limit = q->limit;
498 qopt.loss = q->loss;
499 qopt.gap = q->gap;
500 qopt.duplicate = q->duplicate;
501 RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt);
502
503 cor.delay_corr = q->delay_cor.rho;
504 cor.loss_corr = q->loss_cor.rho;
505 cor.dup_corr = q->dup_cor.rho;
506 RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor);
507 rta->rta_len = skb->tail - b;
508
509 return skb->len;
510
511rtattr_failure:
512 skb_trim(skb, b - skb->data);
513 return -1;
514}
515
516static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
517 struct sk_buff *skb, struct tcmsg *tcm)
518{
519 struct netem_sched_data *q = qdisc_priv(sch);
520
521 if (cl != 1) /* only one class */
522 return -ENOENT;
523
524 tcm->tcm_handle |= TC_H_MIN(1);
525 tcm->tcm_info = q->qdisc->handle;
526
527 return 0;
528}
529
530static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
531 struct Qdisc **old)
532{
533 struct netem_sched_data *q = qdisc_priv(sch);
534
535 if (new == NULL)
536 new = &noop_qdisc;
537
538 sch_tree_lock(sch);
539 *old = xchg(&q->qdisc, new);
540 qdisc_reset(*old);
541 sch->q.qlen = 0;
542 sch_tree_unlock(sch);
543
544 return 0;
545}
546
547static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
548{
549 struct netem_sched_data *q = qdisc_priv(sch);
550 return q->qdisc;
551}
552
553static unsigned long netem_get(struct Qdisc *sch, u32 classid)
554{
555 return 1;
556}
557
558static void netem_put(struct Qdisc *sch, unsigned long arg)
559{
560}
561
562static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
563 struct rtattr **tca, unsigned long *arg)
564{
565 return -ENOSYS;
566}
567
568static int netem_delete(struct Qdisc *sch, unsigned long arg)
569{
570 return -ENOSYS;
571}
572
573static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
574{
575 if (!walker->stop) {
576 if (walker->count >= walker->skip)
577 if (walker->fn(sch, 1, walker) < 0) {
578 walker->stop = 1;
579 return;
580 }
581 walker->count++;
582 }
583}
584
585static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl)
586{
587 return NULL;
588}
589
590static struct Qdisc_class_ops netem_class_ops = {
591 .graft = netem_graft,
592 .leaf = netem_leaf,
593 .get = netem_get,
594 .put = netem_put,
595 .change = netem_change_class,
596 .delete = netem_delete,
597 .walk = netem_walk,
598 .tcf_chain = netem_find_tcf,
599 .dump = netem_dump_class,
600};
601
602static struct Qdisc_ops netem_qdisc_ops = {
603 .id = "netem",
604 .cl_ops = &netem_class_ops,
605 .priv_size = sizeof(struct netem_sched_data),
606 .enqueue = netem_enqueue,
607 .dequeue = netem_dequeue,
608 .requeue = netem_requeue,
609 .drop = netem_drop,
610 .init = netem_init,
611 .reset = netem_reset,
612 .destroy = netem_destroy,
613 .change = netem_change,
614 .dump = netem_dump,
615 .owner = THIS_MODULE,
616};
617
618
619static int __init netem_module_init(void)
620{
621 return register_qdisc(&netem_qdisc_ops);
622}
623static void __exit netem_module_exit(void)
624{
625 unregister_qdisc(&netem_qdisc_ops);
626}
627module_init(netem_module_init)
628module_exit(netem_module_exit)
629MODULE_LICENSE("GPL");