/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */
17
18#include <linux/config.h>
19#include <linux/module.h>
20#include <linux/types.h>
21#include <linux/kernel.h>
22#include <linux/sched.h>
23#include <linux/string.h>
24#include <linux/mm.h>
25#include <linux/socket.h>
26#include <linux/sockios.h>
27#include <linux/in.h>
28#include <linux/errno.h>
29#include <linux/interrupt.h>
30#include <linux/netdevice.h>
31#include <linux/skbuff.h>
32#include <linux/rtnetlink.h>
33#include <linux/init.h>
34#include <linux/proc_fs.h>
35#include <linux/seq_file.h>
36#include <linux/kmod.h>
37#include <linux/list.h>
38#include <linux/bitops.h>
39
40#include <net/sock.h>
41#include <net/pkt_sched.h>
42
43#include <asm/processor.h>
44#include <asm/uaccess.h>
45#include <asm/system.h>
46
47static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
48 struct Qdisc *old, struct Qdisc *new);
49static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
50 struct Qdisc *q, unsigned long cl, int event);
51
52/*
53
54 Short review.
55 -------------
56
57 This file consists of two interrelated parts:
58
59 1. queueing disciplines manager frontend.
60 2. traffic classes manager frontend.
61
   Generally, a queueing discipline ("qdisc") is a black box
   that is able to enqueue packets and to dequeue them (when
   the device is ready to send something), in an order and at times
   determined by the algorithm hidden inside it.

   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes"
     using "packet classifiers" (see cls_api.c).
71
72 In turn, classes may have child qdiscs (as rule, queues)
73 attached to them etc. etc. etc.
74
   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some sanity
   checks and the part of the work that is common to all qdiscs,
   and to provide rtnetlink notifications.
80
81 All real intelligent work is done inside qdisc modules.
82
83
84
85 Every discipline has two major routines: enqueue and dequeue.
86
87 ---dequeue
88
   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not a
   real packet queue, but q->q.qlen must nevertheless be valid.
95
96 ---enqueue
97
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
101 NET_XMIT_DROP - this packet dropped
102 Expected action: do not backoff, but wait until queue will clear.
103 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
104 Expected action: backoff or ignore
105 NET_XMIT_POLICED - dropped by police.
106 Expected action: backoff or error to real-time apps.
107
108 Auxiliary routines:
109
110 ---requeue
111
   requeues a packet that has already been dequeued. It is used for
   non-standard or simply buggy devices, which can defer output even if
   dev->tbusy=0.
114
115 ---reset
116
117 returns qdisc to initial state: purge all buffers, clear all
118 timers, counters (except for statistics) etc.
119
120 ---init
121
122 initializes newly created qdisc.
123
124 ---destroy
125
126 destroys resources allocated by init and during lifetime of qdisc.
127
128 ---change
129
130 changes qdisc parameters.
131 */
132
/* Guards the list of registered qdisc modules below. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* Head of the singly linked list (via ->next) of all installed disciplines. */

static struct Qdisc_ops *qdisc_base;
145
/* Register/unregister queueing discipline */
147
148int register_qdisc(struct Qdisc_ops *qops)
149{
150 struct Qdisc_ops *q, **qp;
151 int rc = -EEXIST;
152
153 write_lock(&qdisc_mod_lock);
154 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
155 if (!strcmp(qops->id, q->id))
156 goto out;
157
158 if (qops->enqueue == NULL)
159 qops->enqueue = noop_qdisc_ops.enqueue;
160 if (qops->requeue == NULL)
161 qops->requeue = noop_qdisc_ops.requeue;
162 if (qops->dequeue == NULL)
163 qops->dequeue = noop_qdisc_ops.dequeue;
164
165 qops->next = NULL;
166 *qp = qops;
167 rc = 0;
168out:
169 write_unlock(&qdisc_mod_lock);
170 return rc;
171}
172
173int unregister_qdisc(struct Qdisc_ops *qops)
174{
175 struct Qdisc_ops *q, **qp;
176 int err = -ENOENT;
177
178 write_lock(&qdisc_mod_lock);
179 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
180 if (q == qops)
181 break;
182 if (q) {
183 *qp = q->next;
184 q->next = NULL;
185 err = 0;
186 }
187 write_unlock(&qdisc_mod_lock);
188 return err;
189}
190
191/* We know handle. Find qdisc among all qdisc's attached to device
192 (root qdisc, all its children, children of children etc.)
193 */
194
195struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
196{
197 struct Qdisc *q;
198
199 read_lock_bh(&qdisc_tree_lock);
200 list_for_each_entry(q, &dev->qdisc_list, list) {
201 if (q->handle == handle) {
202 read_unlock_bh(&qdisc_tree_lock);
203 return q;
204 }
205 }
206 read_unlock_bh(&qdisc_tree_lock);
207 return NULL;
208}
209
210static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
211{
212 unsigned long cl;
213 struct Qdisc *leaf;
214 struct Qdisc_class_ops *cops = p->ops->cl_ops;
215
216 if (cops == NULL)
217 return NULL;
218 cl = cops->get(p, classid);
219
220 if (cl == 0)
221 return NULL;
222 leaf = cops->leaf(p, cl);
223 cops->put(p, cl);
224 return leaf;
225}
226
227/* Find queueing discipline by name */
228
229static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
230{
231 struct Qdisc_ops *q = NULL;
232
233 if (kind) {
234 read_lock(&qdisc_mod_lock);
235 for (q = qdisc_base; q; q = q->next) {
236 if (rtattr_strcmp(kind, q->id) == 0) {
237 if (!try_module_get(q->owner))
238 q = NULL;
239 break;
240 }
241 }
242 read_unlock(&qdisc_mod_lock);
243 }
244 return q;
245}
246
/* Head of the list of shared, reference-counted rate tables. */
static struct qdisc_rate_table *qdisc_rtab_list;
248
249struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
250{
251 struct qdisc_rate_table *rtab;
252
253 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
254 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
255 rtab->refcnt++;
256 return rtab;
257 }
258 }
259
260 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
261 return NULL;
262
263 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
264 if (rtab) {
265 rtab->rate = *r;
266 rtab->refcnt = 1;
267 memcpy(rtab->data, RTA_DATA(tab), 1024);
268 rtab->next = qdisc_rtab_list;
269 qdisc_rtab_list = rtab;
270 }
271 return rtab;
272}
273
274void qdisc_put_rtab(struct qdisc_rate_table *tab)
275{
276 struct qdisc_rate_table *rtab, **rtabp;
277
278 if (!tab || --tab->refcnt)
279 return;
280
281 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
282 if (rtab == tab) {
283 *rtabp = rtab->next;
284 kfree(rtab);
285 return;
286 }
287 }
288}
289
290
/* Allocate a unique handle from the space managed by the kernel */
292
293static u32 qdisc_alloc_handle(struct net_device *dev)
294{
295 int i = 0x10000;
296 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
297
298 do {
299 autohandle += TC_H_MAKE(0x10000U, 0);
300 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
301 autohandle = TC_H_MAKE(0x80000000U, 0);
302 } while (qdisc_lookup(dev, autohandle) && --i > 0);
303
304 return i>0 ? autohandle : 0;
305}
306
307/* Attach toplevel qdisc to device dev */
308
309static struct Qdisc *
310dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
311{
312 struct Qdisc *oqdisc;
313
314 if (dev->flags & IFF_UP)
315 dev_deactivate(dev);
316
317 qdisc_lock_tree(dev);
318 if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
319 oqdisc = dev->qdisc_ingress;
320 /* Prune old scheduler */
321 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
322 /* delete */
323 qdisc_reset(oqdisc);
324 dev->qdisc_ingress = NULL;
325 } else { /* new */
326 dev->qdisc_ingress = qdisc;
327 }
328
329 } else {
330
331 oqdisc = dev->qdisc_sleeping;
332
333 /* Prune old scheduler */
334 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
335 qdisc_reset(oqdisc);
336
337 /* ... and graft new one */
338 if (qdisc == NULL)
339 qdisc = &noop_qdisc;
340 dev->qdisc_sleeping = qdisc;
341 dev->qdisc = &noop_qdisc;
342 }
343
344 qdisc_unlock_tree(dev);
345
346 if (dev->flags & IFF_UP)
347 dev_activate(dev);
348
349 return oqdisc;
350}
351
352
353/* Graft qdisc "new" to class "classid" of qdisc "parent" or
354 to device "dev".
355
356 Old qdisc is not destroyed but returned in *old.
357 */
358
359static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
360 u32 classid,
361 struct Qdisc *new, struct Qdisc **old)
362{
363 int err = 0;
364 struct Qdisc *q = *old;
365
366
367 if (parent == NULL) {
368 if (q && q->flags&TCQ_F_INGRESS) {
369 *old = dev_graft_qdisc(dev, q);
370 } else {
371 *old = dev_graft_qdisc(dev, new);
372 }
373 } else {
374 struct Qdisc_class_ops *cops = parent->ops->cl_ops;
375
376 err = -EINVAL;
377
378 if (cops) {
379 unsigned long cl = cops->get(parent, classid);
380 if (cl) {
381 err = cops->graft(parent, cl, new, old);
382 if (new)
383 new->parent = classid;
384 cops->put(parent, cl);
385 }
386 }
387 }
388 return err;
389}
390
391/*
392 Allocate and initialize new qdisc.
393
394 Parameters are passed via opt.
395 */
396
397static struct Qdisc *
398qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
399{
400 int err;
401 struct rtattr *kind = tca[TCA_KIND-1];
402 void *p = NULL;
403 struct Qdisc *sch;
404 struct Qdisc_ops *ops;
405 int size;
406
407 ops = qdisc_lookup_ops(kind);
408#ifdef CONFIG_KMOD
409 if (ops == NULL && kind != NULL) {
410 char name[IFNAMSIZ];
411 if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
412 /* We dropped the RTNL semaphore in order to
413 * perform the module load. So, even if we
414 * succeeded in loading the module we have to
415 * tell the caller to replay the request. We
416 * indicate this using -EAGAIN.
417 * We replay the request because the device may
418 * go away in the mean time.
419 */
420 rtnl_unlock();
421 request_module("sch_%s", name);
422 rtnl_lock();
423 ops = qdisc_lookup_ops(kind);
424 if (ops != NULL) {
425 /* We will try again qdisc_lookup_ops,
426 * so don't keep a reference.
427 */
428 module_put(ops->owner);
429 err = -EAGAIN;
430 goto err_out;
431 }
432 }
433 }
434#endif
435
436 err = -EINVAL;
437 if (ops == NULL)
438 goto err_out;
439
440 /* ensure that the Qdisc and the private data are 32-byte aligned */
441 size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
442 size += ops->priv_size + QDISC_ALIGN_CONST;
443
444 p = kmalloc(size, GFP_KERNEL);
445 err = -ENOBUFS;
446 if (!p)
447 goto err_out2;
448 memset(p, 0, size);
449 sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
450 & ~QDISC_ALIGN_CONST);
451 sch->padded = (char *)sch - (char *)p;
452
453 INIT_LIST_HEAD(&sch->list);
454 skb_queue_head_init(&sch->q);
455
456 if (handle == TC_H_INGRESS)
457 sch->flags |= TCQ_F_INGRESS;
458
459 sch->ops = ops;
460 sch->enqueue = ops->enqueue;
461 sch->dequeue = ops->dequeue;
462 sch->dev = dev;
463 dev_hold(dev);
464 atomic_set(&sch->refcnt, 1);
465 sch->stats_lock = &dev->queue_lock;
466 if (handle == 0) {
467 handle = qdisc_alloc_handle(dev);
468 err = -ENOMEM;
469 if (handle == 0)
470 goto err_out3;
471 }
472
473 if (handle == TC_H_INGRESS)
474 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
475 else
476 sch->handle = handle;
477
478 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
479 qdisc_lock_tree(dev);
480 list_add_tail(&sch->list, &dev->qdisc_list);
481 qdisc_unlock_tree(dev);
482
483#ifdef CONFIG_NET_ESTIMATOR
484 if (tca[TCA_RATE-1])
485 gen_new_estimator(&sch->bstats, &sch->rate_est,
486 sch->stats_lock, tca[TCA_RATE-1]);
487#endif
488 return sch;
489 }
490err_out3:
491 dev_put(dev);
492err_out2:
493 module_put(ops->owner);
494err_out:
495 *errp = err;
496 if (p)
497 kfree(p);
498 return NULL;
499}
500
501static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
502{
503 if (tca[TCA_OPTIONS-1]) {
504 int err;
505
506 if (sch->ops->change == NULL)
507 return -EINVAL;
508 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
509 if (err)
510 return err;
511 }
512#ifdef CONFIG_NET_ESTIMATOR
513 if (tca[TCA_RATE-1])
514 gen_replace_estimator(&sch->bstats, &sch->rate_est,
515 sch->stats_lock, tca[TCA_RATE-1]);
516#endif
517 return 0;
518}
519
520struct check_loop_arg
521{
522 struct qdisc_walker w;
523 struct Qdisc *p;
524 int depth;
525};
526
527static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
528
529static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
530{
531 struct check_loop_arg arg;
532
533 if (q->ops->cl_ops == NULL)
534 return 0;
535
536 arg.w.stop = arg.w.skip = arg.w.count = 0;
537 arg.w.fn = check_loop_fn;
538 arg.depth = depth;
539 arg.p = p;
540 q->ops->cl_ops->walk(q, &arg.w);
541 return arg.w.stop ? -ELOOP : 0;
542}
543
544static int
545check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
546{
547 struct Qdisc *leaf;
548 struct Qdisc_class_ops *cops = q->ops->cl_ops;
549 struct check_loop_arg *arg = (struct check_loop_arg *)w;
550
551 leaf = cops->leaf(q, cl);
552 if (leaf) {
553 if (leaf == arg->p || arg->depth > 7)
554 return -ELOOP;
555 return check_loop(leaf, arg->p, arg->depth + 1);
556 }
557 return 0;
558}
559
560/*
561 * Delete/get qdisc.
562 */
563
564static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
565{
566 struct tcmsg *tcm = NLMSG_DATA(n);
567 struct rtattr **tca = arg;
568 struct net_device *dev;
569 u32 clid = tcm->tcm_parent;
570 struct Qdisc *q = NULL;
571 struct Qdisc *p = NULL;
572 int err;
573
574 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
575 return -ENODEV;
576
577 if (clid) {
578 if (clid != TC_H_ROOT) {
579 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
580 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
581 return -ENOENT;
582 q = qdisc_leaf(p, clid);
583 } else { /* ingress */
584 q = dev->qdisc_ingress;
585 }
586 } else {
587 q = dev->qdisc_sleeping;
588 }
589 if (!q)
590 return -ENOENT;
591
592 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
593 return -EINVAL;
594 } else {
595 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
596 return -ENOENT;
597 }
598
599 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
600 return -EINVAL;
601
602 if (n->nlmsg_type == RTM_DELQDISC) {
603 if (!clid)
604 return -EINVAL;
605 if (q->handle == 0)
606 return -ENOENT;
607 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
608 return err;
609 if (q) {
610 qdisc_notify(skb, n, clid, q, NULL);
611 spin_lock_bh(&dev->queue_lock);
612 qdisc_destroy(q);
613 spin_unlock_bh(&dev->queue_lock);
614 }
615 } else {
616 qdisc_notify(skb, n, clid, NULL, q);
617 }
618 return 0;
619}
620
621/*
622 Create/change qdisc.
623 */
624
625static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
626{
627 struct tcmsg *tcm;
628 struct rtattr **tca;
629 struct net_device *dev;
630 u32 clid;
631 struct Qdisc *q, *p;
632 int err;
633
634replay:
635 /* Reinit, just in case something touches this. */
636 tcm = NLMSG_DATA(n);
637 tca = arg;
638 clid = tcm->tcm_parent;
639 q = p = NULL;
640
641 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
642 return -ENODEV;
643
644 if (clid) {
645 if (clid != TC_H_ROOT) {
646 if (clid != TC_H_INGRESS) {
647 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
648 return -ENOENT;
649 q = qdisc_leaf(p, clid);
650 } else { /*ingress */
651 q = dev->qdisc_ingress;
652 }
653 } else {
654 q = dev->qdisc_sleeping;
655 }
656
657 /* It may be default qdisc, ignore it */
658 if (q && q->handle == 0)
659 q = NULL;
660
661 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
662 if (tcm->tcm_handle) {
663 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
664 return -EEXIST;
665 if (TC_H_MIN(tcm->tcm_handle))
666 return -EINVAL;
667 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
668 goto create_n_graft;
669 if (n->nlmsg_flags&NLM_F_EXCL)
670 return -EEXIST;
671 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
672 return -EINVAL;
673 if (q == p ||
674 (p && check_loop(q, p, 0)))
675 return -ELOOP;
676 atomic_inc(&q->refcnt);
677 goto graft;
678 } else {
679 if (q == NULL)
680 goto create_n_graft;
681
682 /* This magic test requires explanation.
683 *
684 * We know, that some child q is already
685 * attached to this parent and have choice:
686 * either to change it or to create/graft new one.
687 *
688 * 1. We are allowed to create/graft only
689 * if CREATE and REPLACE flags are set.
690 *
691 * 2. If EXCL is set, requestor wanted to say,
692 * that qdisc tcm_handle is not expected
693 * to exist, so that we choose create/graft too.
694 *
695 * 3. The last case is when no flags are set.
696 * Alas, it is sort of hole in API, we
697 * cannot decide what to do unambiguously.
698 * For now we select create/graft, if
699 * user gave KIND, which does not match existing.
700 */
701 if ((n->nlmsg_flags&NLM_F_CREATE) &&
702 (n->nlmsg_flags&NLM_F_REPLACE) &&
703 ((n->nlmsg_flags&NLM_F_EXCL) ||
704 (tca[TCA_KIND-1] &&
705 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
706 goto create_n_graft;
707 }
708 }
709 } else {
710 if (!tcm->tcm_handle)
711 return -EINVAL;
712 q = qdisc_lookup(dev, tcm->tcm_handle);
713 }
714
715 /* Change qdisc parameters */
716 if (q == NULL)
717 return -ENOENT;
718 if (n->nlmsg_flags&NLM_F_EXCL)
719 return -EEXIST;
720 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
721 return -EINVAL;
722 err = qdisc_change(q, tca);
723 if (err == 0)
724 qdisc_notify(skb, n, clid, NULL, q);
725 return err;
726
727create_n_graft:
728 if (!(n->nlmsg_flags&NLM_F_CREATE))
729 return -ENOENT;
730 if (clid == TC_H_INGRESS)
731 q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
732 else
733 q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
734 if (q == NULL) {
735 if (err == -EAGAIN)
736 goto replay;
737 return err;
738 }
739
740graft:
741 if (1) {
742 struct Qdisc *old_q = NULL;
743 err = qdisc_graft(dev, p, clid, q, &old_q);
744 if (err) {
745 if (q) {
746 spin_lock_bh(&dev->queue_lock);
747 qdisc_destroy(q);
748 spin_unlock_bh(&dev->queue_lock);
749 }
750 return err;
751 }
752 qdisc_notify(skb, n, clid, old_q, q);
753 if (old_q) {
754 spin_lock_bh(&dev->queue_lock);
755 qdisc_destroy(old_q);
756 spin_unlock_bh(&dev->queue_lock);
757 }
758 }
759 return 0;
760}
761
762static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700763 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700764{
765 struct tcmsg *tcm;
766 struct nlmsghdr *nlh;
767 unsigned char *b = skb->tail;
768 struct gnet_dump d;
769
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700770 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700771 tcm = NLMSG_DATA(nlh);
772 tcm->tcm_family = AF_UNSPEC;
773 tcm->tcm_ifindex = q->dev->ifindex;
774 tcm->tcm_parent = clid;
775 tcm->tcm_handle = q->handle;
776 tcm->tcm_info = atomic_read(&q->refcnt);
777 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
778 if (q->ops->dump && q->ops->dump(q, skb) < 0)
779 goto rtattr_failure;
780 q->qstats.qlen = q->q.qlen;
781
782 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
783 TCA_XSTATS, q->stats_lock, &d) < 0)
784 goto rtattr_failure;
785
786 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
787 goto rtattr_failure;
788
789 if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
790#ifdef CONFIG_NET_ESTIMATOR
791 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
792#endif
793 gnet_stats_copy_queue(&d, &q->qstats) < 0)
794 goto rtattr_failure;
795
796 if (gnet_stats_finish_copy(&d) < 0)
797 goto rtattr_failure;
798
799 nlh->nlmsg_len = skb->tail - b;
800 return skb->len;
801
802nlmsg_failure:
803rtattr_failure:
804 skb_trim(skb, b - skb->data);
805 return -1;
806}
807
808static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
809 u32 clid, struct Qdisc *old, struct Qdisc *new)
810{
811 struct sk_buff *skb;
812 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
813
814 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
815 if (!skb)
816 return -ENOBUFS;
817
818 if (old && old->handle) {
819 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
820 goto err_out;
821 }
822 if (new) {
823 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
824 goto err_out;
825 }
826
827 if (skb->len)
828 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
829
830err_out:
831 kfree_skb(skb);
832 return -EINVAL;
833}
834
835static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
836{
837 int idx, q_idx;
838 int s_idx, s_q_idx;
839 struct net_device *dev;
840 struct Qdisc *q;
841
842 s_idx = cb->args[0];
843 s_q_idx = q_idx = cb->args[1];
844 read_lock(&dev_base_lock);
845 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
846 if (idx < s_idx)
847 continue;
848 if (idx > s_idx)
849 s_q_idx = 0;
850 read_lock_bh(&qdisc_tree_lock);
851 q_idx = 0;
852 list_for_each_entry(q, &dev->qdisc_list, list) {
853 if (q_idx < s_q_idx) {
854 q_idx++;
855 continue;
856 }
857 if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
858 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
859 read_unlock_bh(&qdisc_tree_lock);
860 goto done;
861 }
862 q_idx++;
863 }
864 read_unlock_bh(&qdisc_tree_lock);
865 }
866
867done:
868 read_unlock(&dev_base_lock);
869
870 cb->args[0] = idx;
871 cb->args[1] = q_idx;
872
873 return skb->len;
874}
875
876
877
878/************************************************
879 * Traffic classes manipulation. *
880 ************************************************/
881
882
883
884static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
885{
886 struct tcmsg *tcm = NLMSG_DATA(n);
887 struct rtattr **tca = arg;
888 struct net_device *dev;
889 struct Qdisc *q = NULL;
890 struct Qdisc_class_ops *cops;
891 unsigned long cl = 0;
892 unsigned long new_cl;
893 u32 pid = tcm->tcm_parent;
894 u32 clid = tcm->tcm_handle;
895 u32 qid = TC_H_MAJ(clid);
896 int err;
897
898 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
899 return -ENODEV;
900
901 /*
902 parent == TC_H_UNSPEC - unspecified parent.
903 parent == TC_H_ROOT - class is root, which has no parent.
904 parent == X:0 - parent is root class.
905 parent == X:Y - parent is a node in hierarchy.
906 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
907
908 handle == 0:0 - generate handle from kernel pool.
909 handle == 0:Y - class is X:Y, where X:0 is qdisc.
910 handle == X:Y - clear.
911 handle == X:0 - root class.
912 */
913
914 /* Step 1. Determine qdisc handle X:0 */
915
916 if (pid != TC_H_ROOT) {
917 u32 qid1 = TC_H_MAJ(pid);
918
919 if (qid && qid1) {
920 /* If both majors are known, they must be identical. */
921 if (qid != qid1)
922 return -EINVAL;
923 } else if (qid1) {
924 qid = qid1;
925 } else if (qid == 0)
926 qid = dev->qdisc_sleeping->handle;
927
928 /* Now qid is genuine qdisc handle consistent
929 both with parent and child.
930
931 TC_H_MAJ(pid) still may be unspecified, complete it now.
932 */
933 if (pid)
934 pid = TC_H_MAKE(qid, pid);
935 } else {
936 if (qid == 0)
937 qid = dev->qdisc_sleeping->handle;
938 }
939
940 /* OK. Locate qdisc */
941 if ((q = qdisc_lookup(dev, qid)) == NULL)
942 return -ENOENT;
943
944 /* An check that it supports classes */
945 cops = q->ops->cl_ops;
946 if (cops == NULL)
947 return -EINVAL;
948
949 /* Now try to get class */
950 if (clid == 0) {
951 if (pid == TC_H_ROOT)
952 clid = qid;
953 } else
954 clid = TC_H_MAKE(qid, clid);
955
956 if (clid)
957 cl = cops->get(q, clid);
958
959 if (cl == 0) {
960 err = -ENOENT;
961 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
962 goto out;
963 } else {
964 switch (n->nlmsg_type) {
965 case RTM_NEWTCLASS:
966 err = -EEXIST;
967 if (n->nlmsg_flags&NLM_F_EXCL)
968 goto out;
969 break;
970 case RTM_DELTCLASS:
971 err = cops->delete(q, cl);
972 if (err == 0)
973 tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
974 goto out;
975 case RTM_GETTCLASS:
976 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
977 goto out;
978 default:
979 err = -EINVAL;
980 goto out;
981 }
982 }
983
984 new_cl = cl;
985 err = cops->change(q, clid, pid, tca, &new_cl);
986 if (err == 0)
987 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
988
989out:
990 if (cl)
991 cops->put(q, cl);
992
993 return err;
994}
995
996
997static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
998 unsigned long cl,
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -0700999 u32 pid, u32 seq, u16 flags, int event)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001000{
1001 struct tcmsg *tcm;
1002 struct nlmsghdr *nlh;
1003 unsigned char *b = skb->tail;
1004 struct gnet_dump d;
1005 struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1006
Jamal Hadi Salime431b8c2005-06-18 22:55:31 -07001007 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008 tcm = NLMSG_DATA(nlh);
1009 tcm->tcm_family = AF_UNSPEC;
1010 tcm->tcm_ifindex = q->dev->ifindex;
1011 tcm->tcm_parent = q->handle;
1012 tcm->tcm_handle = q->handle;
1013 tcm->tcm_info = 0;
1014 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
1015 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1016 goto rtattr_failure;
1017
1018 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
1019 TCA_XSTATS, q->stats_lock, &d) < 0)
1020 goto rtattr_failure;
1021
1022 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1023 goto rtattr_failure;
1024
1025 if (gnet_stats_finish_copy(&d) < 0)
1026 goto rtattr_failure;
1027
1028 nlh->nlmsg_len = skb->tail - b;
1029 return skb->len;
1030
1031nlmsg_failure:
1032rtattr_failure:
1033 skb_trim(skb, b - skb->data);
1034 return -1;
1035}
1036
1037static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
1038 struct Qdisc *q, unsigned long cl, int event)
1039{
1040 struct sk_buff *skb;
1041 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1042
1043 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1044 if (!skb)
1045 return -ENOBUFS;
1046
1047 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1048 kfree_skb(skb);
1049 return -EINVAL;
1050 }
1051
1052 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1053}
1054
1055struct qdisc_dump_args
1056{
1057 struct qdisc_walker w;
1058 struct sk_buff *skb;
1059 struct netlink_callback *cb;
1060};
1061
1062static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1063{
1064 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1065
1066 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1067 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1068}
1069
1070static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1071{
1072 int t;
1073 int s_t;
1074 struct net_device *dev;
1075 struct Qdisc *q;
1076 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1077 struct qdisc_dump_args arg;
1078
1079 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1080 return 0;
1081 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
1082 return 0;
1083
1084 s_t = cb->args[0];
1085 t = 0;
1086
1087 read_lock_bh(&qdisc_tree_lock);
1088 list_for_each_entry(q, &dev->qdisc_list, list) {
1089 if (t < s_t || !q->ops->cl_ops ||
1090 (tcm->tcm_parent &&
1091 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1092 t++;
1093 continue;
1094 }
1095 if (t > s_t)
1096 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1097 arg.w.fn = qdisc_class_dump;
1098 arg.skb = skb;
1099 arg.cb = cb;
1100 arg.w.stop = 0;
1101 arg.w.skip = cb->args[1];
1102 arg.w.count = 0;
1103 q->ops->cl_ops->walk(q, &arg.w);
1104 cb->args[1] = arg.w.count;
1105 if (arg.w.stop)
1106 break;
1107 t++;
1108 }
1109 read_unlock_bh(&qdisc_tree_lock);
1110
1111 cb->args[0] = t;
1112
1113 dev_put(dev);
1114 return skb->len;
1115}
1116
1117/* Main classifier routine: scans classifier chain attached
1118 to this qdisc, (optionally) tests for protocol and asks
1119 specific classifiers.
1120 */
1121int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
1122 struct tcf_result *res)
1123{
1124 int err = 0;
1125 u32 protocol = skb->protocol;
1126#ifdef CONFIG_NET_CLS_ACT
1127 struct tcf_proto *otp = tp;
1128reclassify:
1129#endif
1130 protocol = skb->protocol;
1131
1132 for ( ; tp; tp = tp->next) {
1133 if ((tp->protocol == protocol ||
1134 tp->protocol == __constant_htons(ETH_P_ALL)) &&
1135 (err = tp->classify(skb, tp, res)) >= 0) {
1136#ifdef CONFIG_NET_CLS_ACT
1137 if ( TC_ACT_RECLASSIFY == err) {
1138 __u32 verd = (__u32) G_TC_VERD(skb->tc_verd);
1139 tp = otp;
1140
1141 if (MAX_REC_LOOP < verd++) {
1142 printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
1143 tp->prio&0xffff, ntohs(tp->protocol));
1144 return TC_ACT_SHOT;
1145 }
1146 skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd);
1147 goto reclassify;
1148 } else {
1149 if (skb->tc_verd)
1150 skb->tc_verd = SET_TC_VERD(skb->tc_verd,0);
1151 return err;
1152 }
1153#else
1154
1155 return err;
1156#endif
1157 }
1158
1159 }
1160 return -1;
1161}
1162
/* Clock conversion factors reported through /proc/net/psched. */
static int psched_us_per_tick = 1;
static int psched_tick_per_us = 1;
1165
1166#ifdef CONFIG_PROC_FS
1167static int psched_show(struct seq_file *seq, void *v)
1168{
1169 seq_printf(seq, "%08x %08x %08x %08x\n",
1170 psched_tick_per_us, psched_us_per_tick,
1171 1000000, HZ);
1172
1173 return 0;
1174}
1175
1176static int psched_open(struct inode *inode, struct file *file)
1177{
1178 return single_open(file, psched_show, PDE(inode)->data);
1179}
1180
1181static struct file_operations psched_fops = {
1182 .owner = THIS_MODULE,
1183 .open = psched_open,
1184 .read = seq_read,
1185 .llseek = seq_lseek,
1186 .release = single_release,
1187};
1188#endif
1189
1190#ifdef CONFIG_NET_SCH_CLK_CPU
1191psched_tdiff_t psched_clock_per_hz;
1192int psched_clock_scale;
1193EXPORT_SYMBOL(psched_clock_per_hz);
1194EXPORT_SYMBOL(psched_clock_scale);
1195
1196psched_time_t psched_time_base;
1197cycles_t psched_time_mark;
1198EXPORT_SYMBOL(psched_time_mark);
1199EXPORT_SYMBOL(psched_time_base);
1200
1201/*
1202 * Periodically adjust psched_time_base to avoid overflow
1203 * with 32-bit get_cycles(). Safe up to 4GHz CPU.
1204 */
1205static void psched_tick(unsigned long);
1206static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
1207
1208static void psched_tick(unsigned long dummy)
1209{
1210 if (sizeof(cycles_t) == sizeof(u32)) {
1211 psched_time_t dummy_stamp;
1212 PSCHED_GET_TIME(dummy_stamp);
1213 psched_timer.expires = jiffies + 1*HZ;
1214 add_timer(&psched_timer);
1215 }
1216}
1217
1218int __init psched_calibrate_clock(void)
1219{
1220 psched_time_t stamp, stamp1;
1221 struct timeval tv, tv1;
1222 psched_tdiff_t delay;
1223 long rdelay;
1224 unsigned long stop;
1225
1226 psched_tick(0);
1227 stop = jiffies + HZ/10;
1228 PSCHED_GET_TIME(stamp);
1229 do_gettimeofday(&tv);
1230 while (time_before(jiffies, stop)) {
1231 barrier();
1232 cpu_relax();
1233 }
1234 PSCHED_GET_TIME(stamp1);
1235 do_gettimeofday(&tv1);
1236
1237 delay = PSCHED_TDIFF(stamp1, stamp);
1238 rdelay = tv1.tv_usec - tv.tv_usec;
1239 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1240 if (rdelay > delay)
1241 return -1;
1242 delay /= rdelay;
1243 psched_tick_per_us = delay;
1244 while ((delay>>=1) != 0)
1245 psched_clock_scale++;
1246 psched_us_per_tick = 1<<psched_clock_scale;
1247 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1248 return 0;
1249}
1250#endif
1251
1252static int __init pktsched_init(void)
1253{
1254 struct rtnetlink_link *link_p;
1255
1256#ifdef CONFIG_NET_SCH_CLK_CPU
1257 if (psched_calibrate_clock() < 0)
1258 return -1;
1259#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
1260 psched_tick_per_us = HZ<<PSCHED_JSCALE;
1261 psched_us_per_tick = 1000000;
1262#endif
1263
1264 link_p = rtnetlink_links[PF_UNSPEC];
1265
1266 /* Setup rtnetlink links. It is made here to avoid
1267 exporting large number of public symbols.
1268 */
1269
1270 if (link_p) {
1271 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
1272 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
1273 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
1274 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
1275 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1276 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1277 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
1278 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
1279 }
1280
1281 register_qdisc(&pfifo_qdisc_ops);
1282 register_qdisc(&bfifo_qdisc_ops);
1283 proc_net_fops_create("psched", 0, &psched_fops);
1284
1285 return 0;
1286}
1287
subsys_initcall(pktsched_init);

/* Interface offered to qdisc and classifier modules. */
EXPORT_SYMBOL(qdisc_lookup);
EXPORT_SYMBOL(qdisc_get_rtab);
EXPORT_SYMBOL(qdisc_put_rtab);
EXPORT_SYMBOL(register_qdisc);
EXPORT_SYMBOL(unregister_qdisc);
EXPORT_SYMBOL(tc_classify);