/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	Anyone who wants to check this code against the LBL version
	should take into account that ONLY the skeleton was borrowed;
	the implementation is different. In particular:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real-time classes
	have small rates. Note that the statement in [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me why I am wrong 8).

	--- Linux has no EOI event, so we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet has finished. This is wrong because
	of internal device queueing, but on a permanently loaded link it holds.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */

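/*
   A worked instance of the delay bound above, with assumed illustrative
   numbers (not from the original authors): take MTU = 1500 bytes, k = 2
   classes with r_1 = r_2 = 0.25 (so r = 0.5), and W = 2000 so that the
   per-round allotment W*r_1 = 500 bytes is below the MTU.  Then

	delay_1 <= ([1500/500]*2000*0.5 + 2000*0.5 + 2*1500)/B
	         = (3000 + 1000 + 3000)/B = 7000/B,

   i.e. roughly the time needed to push 7000 bytes at the link bandwidth B,
   so the delay stays bounded even with sub-MTU allotments.
 */
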
struct cbq_sched_data;


struct cbq_class
{
	struct cbq_class	*next;		/* hash table link */
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	u32			classid;
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_POLICE
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic	bstats;
	struct gnet_stats_queue	qstats;
	struct gnet_stats_rate_est rate_est;
	spinlock_t		*stats_lock;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct cbq_class	*classes[16];		/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];
	unsigned		quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class	link;

	unsigned		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_POLICE
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
	psched_time_t		now_rt;		/* Cached real time */
	unsigned		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};


#define L2T(cl,len)	((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])

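/*
   A note on L2T ("length to time"): it maps a packet length to its link
   transmission time by indexing the rate table that the tc userspace tool
   precomputes for this class; each data[] cell covers 2^cell_log bytes of
   packet length.
 */
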
static __inline__ unsigned cbq_hash(u32 h)
{
	h ^= h>>8;
	h ^= h>>4;
	return h&0xF;
}
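
/*
   Illustration with an assumed classid, for orientation only:
   classid 0x00010014 gives 0x00010014 ^ 0x00000100 = 0x00010114,
   then 0x00010114 ^ 0x00001011 = 0x00011105, and 0x00011105 & 0xF = 5,
   so the class lands in bucket 5 of classes[16].
 */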

static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct cbq_class *cl;

	for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
		if (cl->classid == classid)
			return cl;
	return NULL;
}

#ifdef CONFIG_NET_CLS_POLICE

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl, *new;

	for (cl = this->tparent; cl; cl = cl->tparent)
		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
			return new;

	return NULL;
}

#endif

/* Classify a packet. The procedure is pretty complicated, but
   it allows us to combine link sharing and priority scheduling
   transparently.

   Namely, you can put link-sharing rules (e.g. route based) at the root
   of CBQ, so that it resolves to split nodes. Then packets are classified
   by logical priority, or a more specific classifier may be attached
   to the split node.
 */
241
242static struct cbq_class *
243cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
244{
245 struct cbq_sched_data *q = qdisc_priv(sch);
246 struct cbq_class *head = &q->link;
247 struct cbq_class **defmap;
248 struct cbq_class *cl = NULL;
249 u32 prio = skb->priority;
250 struct tcf_result res;
251
252 /*
253 * Step 1. If skb->priority points to one of our classes, use it.
254 */
255 if (TC_H_MAJ(prio^sch->handle) == 0 &&
256 (cl = cbq_class_lookup(q, prio)) != NULL)
257 return cl;
258
Jamal Hadi Salim29f1df62006-01-08 22:35:55 -0800259 *qerr = NET_XMIT_BYPASS;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700260 for (;;) {
261 int result = 0;
262 defmap = head->defaults;
263
264 /*
265 * Step 2+n. Apply classifier.
266 */
267 if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
268 goto fallback;
269
270 if ((cl = (void*)res.class) == NULL) {
271 if (TC_H_MAJ(res.classid))
272 cl = cbq_class_lookup(q, res.classid);
273 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
274 cl = defmap[TC_PRIO_BESTEFFORT];
275
276 if (cl == NULL || cl->level >= head->level)
277 goto fallback;
278 }
279
280#ifdef CONFIG_NET_CLS_ACT
281 switch (result) {
282 case TC_ACT_QUEUED:
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900283 case TC_ACT_STOLEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700284 *qerr = NET_XMIT_SUCCESS;
285 case TC_ACT_SHOT:
286 return NULL;
287 }
288#elif defined(CONFIG_NET_CLS_POLICE)
289 switch (result) {
290 case TC_POLICE_RECLASSIFY:
291 return cbq_reclassify(skb, cl);
292 case TC_POLICE_SHOT:
293 return NULL;
294 default:
295 break;
296 }
297#endif
298 if (cl->level == 0)
299 return cl;
300
		/*
		 * Step 3+n. If the classifier selected a link-sharing class,
		 * apply the agency-specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
306 head = cl;
307 }
308
309fallback:
310 cl = head;
311
312 /*
313 * Step 4. No success...
314 */
315 if (TC_H_MAJ(prio) == 0 &&
316 !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
317 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
318 return head;
319
320 return cl;
321}
322
/*
   A packet has just been enqueued on an empty class.
   cbq_activate_class adds the class to the tail of the active class list
   of its priority band.
 */

static __inline__ void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}
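
/*
   A reading aid for the list layout (not an original comment):
   q->active[prio] always points at the most recently activated class, i.e.
   the tail of a circular list threaded through next_alive, so the head of
   the band is q->active[prio]->next_alive; that is where cbq_dequeue_prio
   resumes its round-robin walk.
 */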
346
/*
   Unlink the class from the active chain.
   Note that the same unlinking is also done inline in cbq_dequeue*
   during the round-robin walk.
 */
352
353static void cbq_deactivate_class(struct cbq_class *this)
354{
355 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
356 int prio = this->cpriority;
357 struct cbq_class *cl;
358 struct cbq_class *cl_prev = q->active[prio];
359
360 do {
361 cl = cl_prev->next_alive;
362 if (cl == this) {
363 cl_prev->next_alive = cl->next_alive;
364 cl->next_alive = NULL;
365
366 if (cl == q->active[prio]) {
367 q->active[prio] = cl_prev;
368 if (cl == q->active[prio]) {
369 q->active[prio] = NULL;
370 q->activemask &= ~(1<<prio);
371 return;
372 }
373 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374 return;
375 }
376 } while ((cl_prev = cl) != q->active[prio]);
377}
378
379static void
380cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
381{
382 int toplevel = q->toplevel;
383
384 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
385 psched_time_t now;
386 psched_tdiff_t incr;
387
388 PSCHED_GET_TIME(now);
389 incr = PSCHED_TDIFF(now, q->now_rt);
390 PSCHED_TADD2(q->now, incr, now);
391
392 do {
393 if (PSCHED_TLESS(cl->undertime, now)) {
394 q->toplevel = cl->level;
395 return;
396 }
397 } while ((cl=cl->borrow) != NULL && toplevel > cl->level);
398 }
399}
400
401static int
402cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
403{
404 struct cbq_sched_data *q = qdisc_priv(sch);
405 int len = skb->len;
406 int ret;
407 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
408
409#ifdef CONFIG_NET_CLS_POLICE
410 q->rx_class = cl;
411#endif
412 if (cl == NULL) {
Jamal Hadi Salim29f1df62006-01-08 22:35:55 -0800413 if (ret == NET_XMIT_BYPASS)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700414 sch->qstats.drops++;
415 kfree_skb(skb);
416 return ret;
417 }
418
419#ifdef CONFIG_NET_CLS_POLICE
420 cl->q->__parent = sch;
421#endif
422 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
423 sch->q.qlen++;
424 sch->bstats.packets++;
425 sch->bstats.bytes+=len;
426 cbq_mark_toplevel(q, cl);
427 if (!cl->next_alive)
428 cbq_activate_class(cl);
429 return ret;
430 }
431
432 sch->qstats.drops++;
433 cbq_mark_toplevel(q, cl);
434 cl->qstats.drops++;
435 return ret;
436}
437
438static int
439cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
440{
441 struct cbq_sched_data *q = qdisc_priv(sch);
442 struct cbq_class *cl;
443 int ret;
444
445 if ((cl = q->tx_class) == NULL) {
446 kfree_skb(skb);
447 sch->qstats.drops++;
448 return NET_XMIT_CN;
449 }
450 q->tx_class = NULL;
451
452 cbq_mark_toplevel(q, cl);
453
454#ifdef CONFIG_NET_CLS_POLICE
455 q->rx_class = cl;
456 cl->q->__parent = sch;
457#endif
458 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
459 sch->q.qlen++;
460 sch->qstats.requeues++;
461 if (!cl->next_alive)
462 cbq_activate_class(cl);
463 return 0;
464 }
465 sch->qstats.drops++;
466 cl->qstats.drops++;
467 return ret;
468}
469
470/* Overlimit actions */
471
472/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */
473
474static void cbq_ovl_classic(struct cbq_class *cl)
475{
476 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
477 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
478
479 if (!cl->delayed) {
480 delay += cl->offtime;
481
		/*
		   The class goes to sleep, so that it will have no
		   chance to accumulate avgidle. Let's forgive it 8)

		   Note that cbq-2.0 has a bug in this place:
		   apparently they forgot to shift by cl->ewma_log.
		 */
489 if (cl->avgidle < 0)
490 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
491 if (cl->avgidle < cl->minidle)
492 cl->avgidle = cl->minidle;
493 if (delay <= 0)
494 delay = 1;
495 PSCHED_TADD2(q->now, delay, cl->undertime);
496
497 cl->xstats.overactions++;
498 cl->delayed = 1;
499 }
500 if (q->wd_expires == 0 || q->wd_expires > delay)
501 q->wd_expires = delay;
502
	/* Dirty work! We must schedule wakeups based on the
	   real available rate, rather than the leaf rate,
	   which may be tiny (even zero).
	 */
507 if (q->toplevel == TC_CBQ_MAXLEVEL) {
508 struct cbq_class *b;
509 psched_tdiff_t base_delay = q->wd_expires;
510
511 for (b = cl->borrow; b; b = b->borrow) {
512 delay = PSCHED_TDIFF(b->undertime, q->now);
513 if (delay < base_delay) {
514 if (delay <= 0)
515 delay = 1;
516 base_delay = delay;
517 }
518 }
519
520 q->wd_expires = base_delay;
521 }
522}
523
/* TC_CBQ_OVL_RCLASSIC: penalize classes in the hierarchy by offtime when
   they go overlimit
 */
527
528static void cbq_ovl_rclassic(struct cbq_class *cl)
529{
530 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
531 struct cbq_class *this = cl;
532
533 do {
534 if (cl->level > q->toplevel) {
535 cl = NULL;
536 break;
537 }
538 } while ((cl = cl->borrow) != NULL);
539
540 if (cl == NULL)
541 cl = this;
542 cbq_ovl_classic(cl);
543}
544
545/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */
546
547static void cbq_ovl_delay(struct cbq_class *cl)
548{
549 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
550 psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now);
551
552 if (!cl->delayed) {
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700553 psched_time_t sched = q->now;
554 ktime_t expires;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700555
556 delay += cl->offtime;
557 if (cl->avgidle < 0)
558 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
559 if (cl->avgidle < cl->minidle)
560 cl->avgidle = cl->minidle;
561 PSCHED_TADD2(q->now, delay, cl->undertime);
562
563 if (delay > 0) {
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700564 sched += delay + cl->penalty;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700565 cl->penalized = sched;
566 cl->cpriority = TC_CBQ_MAXPRIO;
567 q->pmask |= (1<<TC_CBQ_MAXPRIO);
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700568
569 expires = ktime_set(0, 0);
570 expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
571 if (hrtimer_try_to_cancel(&q->delay_timer) &&
572 ktime_to_ns(ktime_sub(q->delay_timer.expires,
573 expires)) > 0)
574 q->delay_timer.expires = expires;
575 hrtimer_restart(&q->delay_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700576 cl->delayed = 1;
577 cl->xstats.overactions++;
578 return;
579 }
580 delay = 1;
581 }
582 if (q->wd_expires == 0 || q->wd_expires > delay)
583 q->wd_expires = delay;
584}
585
586/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
587
588static void cbq_ovl_lowprio(struct cbq_class *cl)
589{
590 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
591
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700592 cl->penalized = q->now + cl->penalty;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700593
594 if (cl->cpriority != cl->priority2) {
595 cl->cpriority = cl->priority2;
596 q->pmask |= (1<<cl->cpriority);
597 cl->xstats.overactions++;
598 }
599 cbq_ovl_classic(cl);
600}
601
602/* TC_CBQ_OVL_DROP: penalize class by dropping */
603
604static void cbq_ovl_drop(struct cbq_class *cl)
605{
606 if (cl->q->ops->drop)
607 if (cl->q->ops->drop(cl->q))
608 cl->qdisc->q.qlen--;
609 cl->xstats.overactions++;
610 cbq_ovl_classic(cl);
611}
612
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700613static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
614 psched_time_t now)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700615{
616 struct cbq_class *cl;
617 struct cbq_class *cl_prev = q->active[prio];
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700618 psched_time_t sched = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700619
620 if (cl_prev == NULL)
Patrick McHardye9054a32007-03-16 01:21:40 -0700621 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700622
623 do {
624 cl = cl_prev->next_alive;
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700625 if (now - cl->penalized > 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700626 cl_prev->next_alive = cl->next_alive;
627 cl->next_alive = NULL;
628 cl->cpriority = cl->priority;
629 cl->delayed = 0;
630 cbq_activate_class(cl);
631
632 if (cl == q->active[prio]) {
633 q->active[prio] = cl_prev;
634 if (cl == q->active[prio]) {
635 q->active[prio] = NULL;
636 return 0;
637 }
638 }
639
640 cl = cl_prev->next_alive;
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700641 } else if (sched - cl->penalized > 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700642 sched = cl->penalized;
643 } while ((cl_prev = cl) != q->active[prio]);
644
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700645 return sched - now;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700646}
647
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700648static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700649{
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700650 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
651 delay_timer);
652 struct Qdisc *sch = q->watchdog.qdisc;
653 psched_time_t now;
654 psched_tdiff_t delay = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700655 unsigned pmask;
656
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700657 PSCHED_GET_TIME(now);
658
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659 pmask = q->pmask;
660 q->pmask = 0;
661
662 while (pmask) {
663 int prio = ffz(~pmask);
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700664 psched_tdiff_t tmp;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700665
666 pmask &= ~(1<<prio);
667
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700668 tmp = cbq_undelay_prio(q, prio, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700669 if (tmp > 0) {
670 q->pmask |= 1<<prio;
671 if (tmp < delay || delay == 0)
672 delay = tmp;
673 }
674 }
675
676 if (delay) {
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700677 ktime_t time;
678
679 time = ktime_set(0, 0);
680 time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
681 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700682 }
683
684 sch->flags &= ~TCQ_F_THROTTLED;
685 netif_schedule(sch->dev);
Patrick McHardy1a13cb62007-03-16 01:22:20 -0700686 return HRTIMER_NORESTART;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700687}
688
689
690#ifdef CONFIG_NET_CLS_POLICE
691
692static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
693{
694 int len = skb->len;
695 struct Qdisc *sch = child->__parent;
696 struct cbq_sched_data *q = qdisc_priv(sch);
697 struct cbq_class *cl = q->rx_class;
698
699 q->rx_class = NULL;
700
701 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
702
703 cbq_mark_toplevel(q, cl);
704
705 q->rx_class = cl;
706 cl->q->__parent = sch;
707
708 if (cl->q->enqueue(skb, cl->q) == 0) {
709 sch->q.qlen++;
710 sch->bstats.packets++;
711 sch->bstats.bytes+=len;
712 if (!cl->next_alive)
713 cbq_activate_class(cl);
714 return 0;
715 }
716 sch->qstats.drops++;
717 return 0;
718 }
719
720 sch->qstats.drops++;
721 return -1;
722}
723#endif
724
/*
   This is a mission-critical procedure.

   We "regenerate" the toplevel cutoff if the transmitting class
   has backlog and is not regulated. This is not part of the
   original CBQ description, but looks more reasonable.
   It may well be wrong; the question needs further investigation.
*/
733
734static __inline__ void
735cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
736 struct cbq_class *borrowed)
737{
738 if (cl && q->toplevel >= borrowed->level) {
739 if (cl->q->q.qlen > 1) {
740 do {
741 if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) {
742 q->toplevel = borrowed->level;
743 return;
744 }
745 } while ((borrowed=borrowed->borrow) != NULL);
746 }
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +0900747#if 0
Linus Torvalds1da177e2005-04-16 15:20:36 -0700748 /* It is not necessary now. Uncommenting it
749 will save CPU cycles, but decrease fairness.
750 */
751 q->toplevel = TC_CBQ_MAXLEVEL;
752#endif
753 }
754}
755
756static void
757cbq_update(struct cbq_sched_data *q)
758{
759 struct cbq_class *this = q->tx_class;
760 struct cbq_class *cl = this;
761 int len = q->tx_len;
762
763 q->tx_class = NULL;
764
765 for ( ; cl; cl = cl->share) {
766 long avgidle = cl->avgidle;
767 long idle;
768
769 cl->bstats.packets++;
770 cl->bstats.bytes += len;
771
		/*
		   (now - last) is the total time between the right edges of two packets.
		   (last_pktlen/rate) is the "virtual" busy time, so that

		   idle = (now - last) - last_pktlen/rate
		 */
778
779 idle = PSCHED_TDIFF(q->now, cl->last);
780 if ((unsigned long)idle > 128*1024*1024) {
781 avgidle = cl->maxidle;
782 } else {
783 idle -= L2T(cl, len);
784
			/* true_avgidle := (1-W)*true_avgidle + W*idle,
			   where W=2^{-ewma_log}. But cl->avgidle is scaled:
			   cl->avgidle == true_avgidle/W,
			   hence:
			 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}
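
		/* A quick sanity check of the scaling, with an assumed
		   ewma_log = 5 (W = 1/32): since cl->avgidle stores
		   true_avgidle/W, multiplying the update
			avgidle += idle - (avgidle >> 5)
		   through by W gives
			true' = true + W*idle - W*true = (1-W)*true + W*idle,
		   which is exactly the EWMA stated in the comment above. */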
792
793 if (avgidle <= 0) {
794 /* Overlimit or at-limit */
795
796 if (avgidle < cl->minidle)
797 avgidle = cl->minidle;
798
799 cl->avgidle = avgidle;
800
			/* Calculate the expected time when this class
			   will be allowed to send again.
			   That happens when:
			   (1-W)*true_avgidle + W*delay = 0, i.e.
			   idle = (1/W - 1)*(-true_avgidle)
			   or, since cl->avgidle == true_avgidle/W,
			   idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			   That is not all.
			   To maintain the rate allocated to the class,
			   we add to undertime the virtual clock time
			   needed to complete the transmitted packet
			   (len/phys_bandwidth has already elapsed
			   by the time cbq_update runs).
			 */
819
820 idle -= L2T(&q->link, len);
821 idle += L2T(cl, len);
822
823 PSCHED_AUDIT_TDIFF(idle);
824
825 PSCHED_TADD2(q->now, idle, cl->undertime);
826 } else {
827 /* Underlimit */
828
829 PSCHED_SET_PASTPERFECT(cl->undertime);
830 if (avgidle > cl->maxidle)
831 cl->avgidle = cl->maxidle;
832 else
833 cl->avgidle = avgidle;
834 }
835 cl->last = q->now;
836 }
837
838 cbq_update_toplevel(q, this, q->tx_borrowed);
839}
840
841static __inline__ struct cbq_class *
842cbq_under_limit(struct cbq_class *cl)
843{
844 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
845 struct cbq_class *this_cl = cl;
846
847 if (cl->tparent == NULL)
848 return cl;
849
850 if (PSCHED_IS_PASTPERFECT(cl->undertime) ||
851 !PSCHED_TLESS(q->now, cl->undertime)) {
852 cl->delayed = 0;
853 return cl;
854 }
855
856 do {
		/* This is a very suspicious place. Currently the overlimit
		   action is generated for non-bounded classes only if the
		   link is completely congested. Though this agrees with
		   the ancestor-only paradigm, it looks very questionable:
		   it means that this chunk of code will either never be
		   called or will strongly amplify burstiness. Dangerous,
		   silly, and yet no other solution exists.
		 */
867 if ((cl = cl->borrow) == NULL) {
868 this_cl->qstats.overlimits++;
869 this_cl->overlimit(this_cl);
870 return NULL;
871 }
872 if (cl->level > q->toplevel)
873 return NULL;
874 } while (!PSCHED_IS_PASTPERFECT(cl->undertime) &&
875 PSCHED_TLESS(q->now, cl->undertime));
876
877 cl->delayed = 0;
878 return cl;
879}
880
881static __inline__ struct sk_buff *
882cbq_dequeue_prio(struct Qdisc *sch, int prio)
883{
884 struct cbq_sched_data *q = qdisc_priv(sch);
885 struct cbq_class *cl_tail, *cl_prev, *cl;
886 struct sk_buff *skb;
887 int deficit;
888
889 cl_tail = cl_prev = q->active[prio];
890 cl = cl_prev->next_alive;
891
892 do {
893 deficit = 0;
894
895 /* Start round */
896 do {
897 struct cbq_class *borrow = cl;
898
899 if (cl->q->q.qlen &&
900 (borrow = cbq_under_limit(cl)) == NULL)
901 goto skip_class;
902
903 if (cl->deficit <= 0) {
904 /* Class exhausted its allotment per
905 this round. Switch to the next one.
906 */
907 deficit = 1;
908 cl->deficit += cl->quantum;
909 goto next_class;
910 }
911
912 skb = cl->q->dequeue(cl->q);
913
			/* The class did not give us any skb :-(
			   This can happen even if cl->q->q.qlen != 0,
			   e.g. if cl->q is a "tbf" qdisc.
			 */
918 if (skb == NULL)
919 goto skip_class;
920
921 cl->deficit -= skb->len;
922 q->tx_class = cl;
923 q->tx_borrowed = borrow;
924 if (borrow != cl) {
925#ifndef CBQ_XSTATS_BORROWS_BYTES
926 borrow->xstats.borrows++;
927 cl->xstats.borrows++;
928#else
929 borrow->xstats.borrows += skb->len;
930 cl->xstats.borrows += skb->len;
931#endif
932 }
933 q->tx_len = skb->len;
934
935 if (cl->deficit <= 0) {
936 q->active[prio] = cl;
937 cl = cl->next_alive;
938 cl->deficit += cl->quantum;
939 }
940 return skb;
941
942skip_class:
943 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
944 /* Class is empty or penalized.
945 Unlink it from active chain.
946 */
947 cl_prev->next_alive = cl->next_alive;
948 cl->next_alive = NULL;
949
950 /* Did cl_tail point to it? */
951 if (cl == cl_tail) {
952 /* Repair it! */
953 cl_tail = cl_prev;
954
955 /* Was it the last class in this band? */
956 if (cl == cl_tail) {
957 /* Kill the band! */
958 q->active[prio] = NULL;
959 q->activemask &= ~(1<<prio);
960 if (cl->q->q.qlen)
961 cbq_activate_class(cl);
962 return NULL;
963 }
964
965 q->active[prio] = cl_tail;
966 }
967 if (cl->q->q.qlen)
968 cbq_activate_class(cl);
969
970 cl = cl_prev;
971 }
972
973next_class:
974 cl_prev = cl;
975 cl = cl->next_alive;
976 } while (cl_prev != cl_tail);
977 } while (deficit);
978
979 q->active[prio] = cl_prev;
980
981 return NULL;
982}
983
984static __inline__ struct sk_buff *
985cbq_dequeue_1(struct Qdisc *sch)
986{
987 struct cbq_sched_data *q = qdisc_priv(sch);
988 struct sk_buff *skb;
989 unsigned activemask;
990
991 activemask = q->activemask&0xFF;
992 while (activemask) {
993 int prio = ffz(~activemask);
994 activemask &= ~(1<<prio);
995 skb = cbq_dequeue_prio(sch, prio);
996 if (skb)
997 return skb;
998 }
999 return NULL;
1000}
1001
1002static struct sk_buff *
1003cbq_dequeue(struct Qdisc *sch)
1004{
1005 struct sk_buff *skb;
1006 struct cbq_sched_data *q = qdisc_priv(sch);
1007 psched_time_t now;
1008 psched_tdiff_t incr;
1009
1010 PSCHED_GET_TIME(now);
1011 incr = PSCHED_TDIFF(now, q->now_rt);
1012
1013 if (q->tx_class) {
1014 psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS time
		   by adding the expected packet transmission time.
		   If real time is greater, we warp the artificial clock,
		   so that:

		   cbq_time = max(real_time, work);
		 */
1022 incr2 = L2T(&q->link, q->tx_len);
1023 PSCHED_TADD(q->now, incr2);
1024 cbq_update(q);
1025 if ((incr -= incr2) < 0)
1026 incr = 0;
1027 }
1028 PSCHED_TADD(q->now, incr);
1029 q->now_rt = now;
1030
1031 for (;;) {
1032 q->wd_expires = 0;
1033
1034 skb = cbq_dequeue_1(sch);
1035 if (skb) {
1036 sch->q.qlen--;
1037 sch->flags &= ~TCQ_F_THROTTLED;
1038 return skb;
1039 }
1040
		/* All the classes are overlimit.

		   This is possible if:

		   1. The scheduler is empty.
		   2. The toplevel cutoff inhibited borrowing.
		   3. The root class is overlimit.

		   Reset the 2nd and 3rd conditions and retry.

		   Note that NS and cbq-2.0 are buggy here: peeking at
		   an arbitrary class is appropriate for ancestor-only
		   sharing, but not for the toplevel algorithm.

		   Our version is better but slower, because it requires
		   two passes; that is unavoidable with top-level sharing.
		 */
1058
1059 if (q->toplevel == TC_CBQ_MAXLEVEL &&
1060 PSCHED_IS_PASTPERFECT(q->link.undertime))
1061 break;
1062
1063 q->toplevel = TC_CBQ_MAXLEVEL;
1064 PSCHED_SET_PASTPERFECT(q->link.undertime);
1065 }
1066
	/* No packets in the scheduler, or nobody wants to give them to us :-(
	   Sigh... start the watchdog timer in the latter case. */
1069
1070 if (sch->q.qlen) {
1071 sch->qstats.overlimits++;
Patrick McHardy88a99352007-03-16 01:21:11 -07001072 if (q->wd_expires)
1073 qdisc_watchdog_schedule(&q->watchdog,
Patrick McHardybb239ac2007-03-16 12:31:28 -07001074 now + q->wd_expires);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001075 }
1076 return NULL;
1077}
1078
1079/* CBQ class maintanance routines */
1080
1081static void cbq_adjust_levels(struct cbq_class *this)
1082{
1083 if (this == NULL)
1084 return;
1085
1086 do {
1087 int level = 0;
1088 struct cbq_class *cl;
1089
1090 if ((cl = this->children) != NULL) {
1091 do {
1092 if (cl->level > level)
1093 level = cl->level;
1094 } while ((cl = cl->sibling) != this->children);
1095 }
1096 this->level = level+1;
1097 } while ((this = this->tparent) != NULL);
1098}
1099
1100static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1101{
1102 struct cbq_class *cl;
1103 unsigned h;
1104
1105 if (q->quanta[prio] == 0)
1106 return;
1107
1108 for (h=0; h<16; h++) {
1109 for (cl = q->classes[h]; cl; cl = cl->next) {
			/* BUGGGG... Beware! This expression suffers from
			   arithmetic overflow!
			 */
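			/* A reading aid (not an original comment): quanta[prio]
			   is the sum of the weights of all classes at this
			   priority, so the expression below computes
			       quantum = allot * weight * nclasses[prio] / sum_of_weights,
			   i.e. the class's allot scaled by its weight relative to
			   the average weight in the band; the intermediate product
			   is what can overflow, hence the warning above. */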
1113 if (cl->priority == prio) {
1114 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1115 q->quanta[prio];
1116 }
1117 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1118 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1119 cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1120 }
1121 }
1122 }
1123}
1124
1125static void cbq_sync_defmap(struct cbq_class *cl)
1126{
1127 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1128 struct cbq_class *split = cl->split;
1129 unsigned h;
1130 int i;
1131
1132 if (split == NULL)
1133 return;
1134
1135 for (i=0; i<=TC_PRIO_MAX; i++) {
1136 if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1137 split->defaults[i] = NULL;
1138 }
1139
1140 for (i=0; i<=TC_PRIO_MAX; i++) {
1141 int level = split->level;
1142
1143 if (split->defaults[i])
1144 continue;
1145
1146 for (h=0; h<16; h++) {
1147 struct cbq_class *c;
1148
1149 for (c = q->classes[h]; c; c = c->next) {
1150 if (c->split == split && c->level < level &&
1151 c->defmap&(1<<i)) {
1152 split->defaults[i] = c;
1153 level = c->level;
1154 }
1155 }
1156 }
1157 }
1158}
1159
1160static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1161{
1162 struct cbq_class *split = NULL;
1163
1164 if (splitid == 0) {
1165 if ((split = cl->split) == NULL)
1166 return;
1167 splitid = split->classid;
1168 }
1169
1170 if (split == NULL || split->classid != splitid) {
1171 for (split = cl->tparent; split; split = split->tparent)
1172 if (split->classid == splitid)
1173 break;
1174 }
1175
1176 if (split == NULL)
1177 return;
1178
1179 if (cl->split != split) {
1180 cl->defmap = 0;
1181 cbq_sync_defmap(cl);
1182 cl->split = split;
1183 cl->defmap = def&mask;
1184 } else
1185 cl->defmap = (cl->defmap&~mask)|(def&mask);
1186
1187 cbq_sync_defmap(cl);
1188}
1189
1190static void cbq_unlink_class(struct cbq_class *this)
1191{
1192 struct cbq_class *cl, **clp;
1193 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1194
1195 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1196 if (cl == this) {
1197 *clp = cl->next;
1198 cl->next = NULL;
1199 break;
1200 }
1201 }
1202
1203 if (this->tparent) {
1204 clp=&this->sibling;
1205 cl = *clp;
1206 do {
1207 if (cl == this) {
1208 *clp = cl->sibling;
1209 break;
1210 }
1211 clp = &cl->sibling;
1212 } while ((cl = *clp) != this->sibling);
1213
1214 if (this->tparent->children == this) {
1215 this->tparent->children = this->sibling;
1216 if (this->sibling == this)
1217 this->tparent->children = NULL;
1218 }
1219 } else {
1220 BUG_TRAP(this->sibling == this);
1221 }
1222}
1223
1224static void cbq_link_class(struct cbq_class *this)
1225{
1226 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1227 unsigned h = cbq_hash(this->classid);
1228 struct cbq_class *parent = this->tparent;
1229
1230 this->sibling = this;
1231 this->next = q->classes[h];
1232 q->classes[h] = this;
1233
1234 if (parent == NULL)
1235 return;
1236
1237 if (parent->children == NULL) {
1238 parent->children = this;
1239 } else {
1240 this->sibling = parent->children->sibling;
1241 parent->children->sibling = this;
1242 }
1243}
1244
1245static unsigned int cbq_drop(struct Qdisc* sch)
1246{
1247 struct cbq_sched_data *q = qdisc_priv(sch);
1248 struct cbq_class *cl, *cl_head;
1249 int prio;
1250 unsigned int len;
1251
1252 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1253 if ((cl_head = q->active[prio]) == NULL)
1254 continue;
1255
1256 cl = cl_head;
1257 do {
1258 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1259 sch->q.qlen--;
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08001260 if (!cl->q->q.qlen)
1261 cbq_deactivate_class(cl);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001262 return len;
1263 }
1264 } while ((cl = cl->next_alive) != cl_head);
1265 }
1266 return 0;
1267}
1268
1269static void
1270cbq_reset(struct Qdisc* sch)
1271{
1272 struct cbq_sched_data *q = qdisc_priv(sch);
1273 struct cbq_class *cl;
1274 int prio;
1275 unsigned h;
1276
1277 q->activemask = 0;
1278 q->pmask = 0;
1279 q->tx_class = NULL;
1280 q->tx_borrowed = NULL;
Patrick McHardy88a99352007-03-16 01:21:11 -07001281 qdisc_watchdog_cancel(&q->watchdog);
Patrick McHardy1a13cb62007-03-16 01:22:20 -07001282 hrtimer_cancel(&q->delay_timer);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001283 q->toplevel = TC_CBQ_MAXLEVEL;
1284 PSCHED_GET_TIME(q->now);
1285 q->now_rt = q->now;
1286
1287 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1288 q->active[prio] = NULL;
1289
1290 for (h = 0; h < 16; h++) {
1291 for (cl = q->classes[h]; cl; cl = cl->next) {
1292 qdisc_reset(cl->q);
1293
1294 cl->next_alive = NULL;
1295 PSCHED_SET_PASTPERFECT(cl->undertime);
1296 cl->avgidle = cl->maxidle;
1297 cl->deficit = cl->quantum;
1298 cl->cpriority = cl->priority;
1299 }
1300 }
1301 sch->q.qlen = 0;
1302}
1303
1304
1305static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1306{
1307 if (lss->change&TCF_CBQ_LSS_FLAGS) {
1308 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1309 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1310 }
1311 if (lss->change&TCF_CBQ_LSS_EWMA)
1312 cl->ewma_log = lss->ewma_log;
1313 if (lss->change&TCF_CBQ_LSS_AVPKT)
1314 cl->avpkt = lss->avpkt;
1315 if (lss->change&TCF_CBQ_LSS_MINIDLE)
1316 cl->minidle = -(long)lss->minidle;
1317 if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1318 cl->maxidle = lss->maxidle;
1319 cl->avgidle = lss->maxidle;
1320 }
1321 if (lss->change&TCF_CBQ_LSS_OFFTIME)
1322 cl->offtime = lss->offtime;
1323 return 0;
1324}
1325
1326static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1327{
1328 q->nclasses[cl->priority]--;
1329 q->quanta[cl->priority] -= cl->weight;
1330 cbq_normalize_quanta(q, cl->priority);
1331}
1332
1333static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1334{
1335 q->nclasses[cl->priority]++;
1336 q->quanta[cl->priority] += cl->weight;
1337 cbq_normalize_quanta(q, cl->priority);
1338}
1339
1340static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1341{
1342 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1343
1344 if (wrr->allot)
1345 cl->allot = wrr->allot;
1346 if (wrr->weight)
1347 cl->weight = wrr->weight;
1348 if (wrr->priority) {
1349 cl->priority = wrr->priority-1;
1350 cl->cpriority = cl->priority;
1351 if (cl->priority >= cl->priority2)
1352 cl->priority2 = TC_CBQ_MAXPRIO-1;
1353 }
1354
1355 cbq_addprio(q, cl);
1356 return 0;
1357}
1358
1359static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1360{
1361 switch (ovl->strategy) {
1362 case TC_CBQ_OVL_CLASSIC:
1363 cl->overlimit = cbq_ovl_classic;
1364 break;
1365 case TC_CBQ_OVL_DELAY:
1366 cl->overlimit = cbq_ovl_delay;
1367 break;
1368 case TC_CBQ_OVL_LOWPRIO:
1369 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1370 ovl->priority2-1 <= cl->priority)
1371 return -EINVAL;
1372 cl->priority2 = ovl->priority2-1;
1373 cl->overlimit = cbq_ovl_lowprio;
1374 break;
1375 case TC_CBQ_OVL_DROP:
1376 cl->overlimit = cbq_ovl_drop;
1377 break;
1378 case TC_CBQ_OVL_RCLASSIC:
1379 cl->overlimit = cbq_ovl_rclassic;
1380 break;
1381 default:
1382 return -EINVAL;
1383 }
Patrick McHardy1a13cb62007-03-16 01:22:20 -07001384 cl->penalty = ovl->penalty;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001385 return 0;
1386}
1387
1388#ifdef CONFIG_NET_CLS_POLICE
1389static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1390{
1391 cl->police = p->police;
1392
1393 if (cl->q->handle) {
1394 if (p->police == TC_POLICE_RECLASSIFY)
1395 cl->q->reshape_fail = cbq_reshape_fail;
1396 else
1397 cl->q->reshape_fail = NULL;
1398 }
1399 return 0;
1400}
1401#endif
1402
1403static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1404{
1405 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1406 return 0;
1407}
1408
1409static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1410{
1411 struct cbq_sched_data *q = qdisc_priv(sch);
1412 struct rtattr *tb[TCA_CBQ_MAX];
1413 struct tc_ratespec *r;
1414
1415 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1416 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1417 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1418 return -EINVAL;
1419
1420 if (tb[TCA_CBQ_LSSOPT-1] &&
1421 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1422 return -EINVAL;
1423
1424 r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1425
1426 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1427 return -EINVAL;
1428
1429 q->link.refcnt = 1;
1430 q->link.sibling = &q->link;
1431 q->link.classid = sch->handle;
1432 q->link.qdisc = sch;
Patrick McHardy9f9afec2006-11-29 17:35:18 -08001433 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1434 sch->handle)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435 q->link.q = &noop_qdisc;
1436
1437 q->link.priority = TC_CBQ_MAXPRIO-1;
1438 q->link.priority2 = TC_CBQ_MAXPRIO-1;
1439 q->link.cpriority = TC_CBQ_MAXPRIO-1;
1440 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1441 q->link.overlimit = cbq_ovl_classic;
1442 q->link.allot = psched_mtu(sch->dev);
1443 q->link.quantum = q->link.allot;
1444 q->link.weight = q->link.R_tab->rate.rate;
1445
1446 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1447 q->link.avpkt = q->link.allot/2;
1448 q->link.minidle = -0x7FFFFFFF;
1449 q->link.stats_lock = &sch->dev->queue_lock;
1450
Patrick McHardy88a99352007-03-16 01:21:11 -07001451 qdisc_watchdog_init(&q->watchdog, sch);
Patrick McHardy1a13cb62007-03-16 01:22:20 -07001452 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001453 q->delay_timer.function = cbq_undelay;
1454 q->toplevel = TC_CBQ_MAXLEVEL;
1455 PSCHED_GET_TIME(q->now);
1456 q->now_rt = q->now;
1457
1458 cbq_link_class(&q->link);
1459
1460 if (tb[TCA_CBQ_LSSOPT-1])
1461 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1462
1463 cbq_addprio(q, &q->link);
1464 return 0;
1465}
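
/*
   For orientation only: a typical userspace setup that exercises this init
   path, written in the LARTC-era tc(8) syntax.  The option names are quoted
   from memory and should be treated as an assumption, not a reference:

	tc qdisc add dev eth0 root handle 1: cbq bandwidth 100Mbit avpkt 1000
	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 100Mbit \
		rate 6Mbit allot 1514 prio 5 avpkt 1000 bounded

   The qdisc-level command ends up in cbq_init() above; the class-level one
   is handled by cbq_change_class() further down.
 */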
1466
1467static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1468{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001469 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001470
1471 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1472 return skb->len;
1473
1474rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001475 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001476 return -1;
1477}
1478
1479static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1480{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001481 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482 struct tc_cbq_lssopt opt;
1483
1484 opt.flags = 0;
1485 if (cl->borrow == NULL)
1486 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1487 if (cl->share == NULL)
1488 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1489 opt.ewma_log = cl->ewma_log;
1490 opt.level = cl->level;
1491 opt.avpkt = cl->avpkt;
1492 opt.maxidle = cl->maxidle;
1493 opt.minidle = (u32)(-cl->minidle);
1494 opt.offtime = cl->offtime;
1495 opt.change = ~0;
1496 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1497 return skb->len;
1498
1499rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001500 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001501 return -1;
1502}
1503
1504static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1505{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001506 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 struct tc_cbq_wrropt opt;
1508
1509 opt.flags = 0;
1510 opt.allot = cl->allot;
1511 opt.priority = cl->priority+1;
1512 opt.cpriority = cl->cpriority+1;
1513 opt.weight = cl->weight;
1514 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1515 return skb->len;
1516
1517rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001518 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001519 return -1;
1520}
1521
1522static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1523{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001524 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525 struct tc_cbq_ovl opt;
1526
1527 opt.strategy = cl->ovl_strategy;
1528 opt.priority2 = cl->priority2+1;
Patrick McHardy8a470772005-06-28 12:56:45 -07001529 opt.pad = 0;
Patrick McHardy1a13cb62007-03-16 01:22:20 -07001530 opt.penalty = cl->penalty;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001531 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1532 return skb->len;
1533
1534rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001535 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001536 return -1;
1537}
1538
1539static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1540{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001541 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001542 struct tc_cbq_fopt opt;
1543
1544 if (cl->split || cl->defmap) {
1545 opt.split = cl->split ? cl->split->classid : 0;
1546 opt.defmap = cl->defmap;
1547 opt.defchange = ~0;
1548 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1549 }
1550 return skb->len;
1551
1552rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001553 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 return -1;
1555}
1556
1557#ifdef CONFIG_NET_CLS_POLICE
1558static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1559{
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001560 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001561 struct tc_cbq_police opt;
1562
1563 if (cl->police) {
1564 opt.police = cl->police;
Patrick McHardy9ef1d4c2005-06-28 12:55:30 -07001565 opt.__res1 = 0;
1566 opt.__res2 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001567 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1568 }
1569 return skb->len;
1570
1571rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001572 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001573 return -1;
1574}
1575#endif
1576
1577static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1578{
1579 if (cbq_dump_lss(skb, cl) < 0 ||
1580 cbq_dump_rate(skb, cl) < 0 ||
1581 cbq_dump_wrr(skb, cl) < 0 ||
1582 cbq_dump_ovl(skb, cl) < 0 ||
1583#ifdef CONFIG_NET_CLS_POLICE
1584 cbq_dump_police(skb, cl) < 0 ||
1585#endif
1586 cbq_dump_fopt(skb, cl) < 0)
1587 return -1;
1588 return 0;
1589}
1590
1591static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1592{
1593 struct cbq_sched_data *q = qdisc_priv(sch);
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001594 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001595 struct rtattr *rta;
1596
1597 rta = (struct rtattr*)b;
1598 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1599 if (cbq_dump_attr(skb, &q->link) < 0)
1600 goto rtattr_failure;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001601 rta->rta_len = skb_tail_pointer(skb) - b;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001602 return skb->len;
1603
1604rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001605 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001606 return -1;
1607}
1608
1609static int
1610cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1611{
1612 struct cbq_sched_data *q = qdisc_priv(sch);
1613
1614 q->link.xstats.avgidle = q->link.avgidle;
1615 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1616}
1617
1618static int
1619cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1620 struct sk_buff *skb, struct tcmsg *tcm)
1621{
1622 struct cbq_class *cl = (struct cbq_class*)arg;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001623 unsigned char *b = skb_tail_pointer(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001624 struct rtattr *rta;
1625
1626 if (cl->tparent)
1627 tcm->tcm_parent = cl->tparent->classid;
1628 else
1629 tcm->tcm_parent = TC_H_ROOT;
1630 tcm->tcm_handle = cl->classid;
1631 tcm->tcm_info = cl->q->handle;
1632
1633 rta = (struct rtattr*)b;
1634 RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1635 if (cbq_dump_attr(skb, cl) < 0)
1636 goto rtattr_failure;
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001637 rta->rta_len = skb_tail_pointer(skb) - b;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001638 return skb->len;
1639
1640rtattr_failure:
Arnaldo Carvalho de Melodc5fc572007-03-25 23:06:12 -07001641 nlmsg_trim(skb, b);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001642 return -1;
1643}
1644
1645static int
1646cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1647 struct gnet_dump *d)
1648{
1649 struct cbq_sched_data *q = qdisc_priv(sch);
1650 struct cbq_class *cl = (struct cbq_class*)arg;
1651
1652 cl->qstats.qlen = cl->q->q.qlen;
1653 cl->xstats.avgidle = cl->avgidle;
1654 cl->xstats.undertime = 0;
1655
1656 if (!PSCHED_IS_PASTPERFECT(cl->undertime))
1657 cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now);
1658
1659 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1660#ifdef CONFIG_NET_ESTIMATOR
1661 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1662#endif
1663 gnet_stats_copy_queue(d, &cl->qstats) < 0)
1664 return -1;
1665
1666 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1667}
1668
1669static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1670 struct Qdisc **old)
1671{
1672 struct cbq_class *cl = (struct cbq_class*)arg;
1673
1674 if (cl) {
1675 if (new == NULL) {
Patrick McHardy9f9afec2006-11-29 17:35:18 -08001676 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1677 cl->classid)) == NULL)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001678 return -ENOBUFS;
1679 } else {
1680#ifdef CONFIG_NET_CLS_POLICE
1681 if (cl->police == TC_POLICE_RECLASSIFY)
1682 new->reshape_fail = cbq_reshape_fail;
1683#endif
1684 }
1685 sch_tree_lock(sch);
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08001686 *old = xchg(&cl->q, new);
Patrick McHardy5e50da02006-11-29 17:36:20 -08001687 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688 qdisc_reset(*old);
1689 sch_tree_unlock(sch);
1690
1691 return 0;
1692 }
1693 return -ENOENT;
1694}
1695
1696static struct Qdisc *
1697cbq_leaf(struct Qdisc *sch, unsigned long arg)
1698{
1699 struct cbq_class *cl = (struct cbq_class*)arg;
1700
1701 return cl ? cl->q : NULL;
1702}
1703
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08001704static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1705{
1706 struct cbq_class *cl = (struct cbq_class *)arg;
1707
1708 if (cl->q->q.qlen == 0)
1709 cbq_deactivate_class(cl);
1710}
1711
Linus Torvalds1da177e2005-04-16 15:20:36 -07001712static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1713{
1714 struct cbq_sched_data *q = qdisc_priv(sch);
1715 struct cbq_class *cl = cbq_class_lookup(q, classid);
1716
1717 if (cl) {
1718 cl->refcnt++;
1719 return (unsigned long)cl;
1720 }
1721 return 0;
1722}
1723
1724static void cbq_destroy_filters(struct cbq_class *cl)
1725{
1726 struct tcf_proto *tp;
1727
1728 while ((tp = cl->filter_list) != NULL) {
1729 cl->filter_list = tp->next;
1730 tcf_destroy(tp);
1731 }
1732}
1733
1734static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1735{
1736 struct cbq_sched_data *q = qdisc_priv(sch);
1737
1738 BUG_TRAP(!cl->filters);
1739
1740 cbq_destroy_filters(cl);
1741 qdisc_destroy(cl->q);
1742 qdisc_put_rtab(cl->R_tab);
1743#ifdef CONFIG_NET_ESTIMATOR
1744 gen_kill_estimator(&cl->bstats, &cl->rate_est);
1745#endif
1746 if (cl != &q->link)
1747 kfree(cl);
1748}
1749
1750static void
1751cbq_destroy(struct Qdisc* sch)
1752{
1753 struct cbq_sched_data *q = qdisc_priv(sch);
1754 struct cbq_class *cl;
1755 unsigned h;
1756
1757#ifdef CONFIG_NET_CLS_POLICE
1758 q->rx_class = NULL;
1759#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have already been destroyed. --TGR '04
	 */
1765 for (h = 0; h < 16; h++)
1766 for (cl = q->classes[h]; cl; cl = cl->next)
1767 cbq_destroy_filters(cl);
1768
1769 for (h = 0; h < 16; h++) {
1770 struct cbq_class *next;
1771
1772 for (cl = q->classes[h]; cl; cl = next) {
1773 next = cl->next;
1774 cbq_destroy_class(sch, cl);
1775 }
1776 }
1777}
1778
1779static void cbq_put(struct Qdisc *sch, unsigned long arg)
1780{
1781 struct cbq_class *cl = (struct cbq_class*)arg;
1782
1783 if (--cl->refcnt == 0) {
1784#ifdef CONFIG_NET_CLS_POLICE
1785 struct cbq_sched_data *q = qdisc_priv(sch);
1786
1787 spin_lock_bh(&sch->dev->queue_lock);
1788 if (q->rx_class == cl)
1789 q->rx_class = NULL;
1790 spin_unlock_bh(&sch->dev->queue_lock);
1791#endif
1792
1793 cbq_destroy_class(sch, cl);
1794 }
1795}
1796
1797static int
1798cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1799 unsigned long *arg)
1800{
1801 int err;
1802 struct cbq_sched_data *q = qdisc_priv(sch);
1803 struct cbq_class *cl = (struct cbq_class*)*arg;
1804 struct rtattr *opt = tca[TCA_OPTIONS-1];
1805 struct rtattr *tb[TCA_CBQ_MAX];
1806 struct cbq_class *parent;
1807 struct qdisc_rate_table *rtab = NULL;
1808
1809 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1810 return -EINVAL;
1811
1812 if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1813 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1814 return -EINVAL;
1815
1816 if (tb[TCA_CBQ_FOPT-1] &&
1817 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1818 return -EINVAL;
1819
1820 if (tb[TCA_CBQ_RATE-1] &&
1821 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1822 return -EINVAL;
1823
1824 if (tb[TCA_CBQ_LSSOPT-1] &&
1825 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1826 return -EINVAL;
1827
1828 if (tb[TCA_CBQ_WRROPT-1] &&
1829 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1830 return -EINVAL;
1831
1832#ifdef CONFIG_NET_CLS_POLICE
1833 if (tb[TCA_CBQ_POLICE-1] &&
1834 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1835 return -EINVAL;
1836#endif
1837
1838 if (cl) {
1839 /* Check parent */
1840 if (parentid) {
1841 if (cl->tparent && cl->tparent->classid != parentid)
1842 return -EINVAL;
1843 if (!cl->tparent && parentid != TC_H_ROOT)
1844 return -EINVAL;
1845 }
1846
1847 if (tb[TCA_CBQ_RATE-1]) {
1848 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1849 if (rtab == NULL)
1850 return -EINVAL;
1851 }
1852
1853 /* Change class parameters */
1854 sch_tree_lock(sch);
1855
1856 if (cl->next_alive != NULL)
1857 cbq_deactivate_class(cl);
1858
1859 if (rtab) {
1860 rtab = xchg(&cl->R_tab, rtab);
1861 qdisc_put_rtab(rtab);
1862 }
1863
1864 if (tb[TCA_CBQ_LSSOPT-1])
1865 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1866
1867 if (tb[TCA_CBQ_WRROPT-1]) {
1868 cbq_rmprio(q, cl);
1869 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1870 }
1871
1872 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1873 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1874
1875#ifdef CONFIG_NET_CLS_POLICE
1876 if (tb[TCA_CBQ_POLICE-1])
1877 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1878#endif
1879
1880 if (tb[TCA_CBQ_FOPT-1])
1881 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1882
1883 if (cl->q->q.qlen)
1884 cbq_activate_class(cl);
1885
1886 sch_tree_unlock(sch);
1887
1888#ifdef CONFIG_NET_ESTIMATOR
1889 if (tca[TCA_RATE-1])
1890 gen_replace_estimator(&cl->bstats, &cl->rate_est,
1891 cl->stats_lock, tca[TCA_RATE-1]);
1892#endif
1893 return 0;
1894 }
1895
1896 if (parentid == TC_H_ROOT)
1897 return -EINVAL;
1898
1899 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1900 tb[TCA_CBQ_LSSOPT-1] == NULL)
1901 return -EINVAL;
1902
1903 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1904 if (rtab == NULL)
1905 return -EINVAL;
1906
1907 if (classid) {
1908 err = -EINVAL;
1909 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1910 goto failure;
1911 } else {
1912 int i;
1913 classid = TC_H_MAKE(sch->handle,0x8000);
1914
1915 for (i=0; i<0x8000; i++) {
1916 if (++q->hgenerator >= 0x8000)
1917 q->hgenerator = 1;
1918 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1919 break;
1920 }
1921 err = -ENOSR;
1922 if (i >= 0x8000)
1923 goto failure;
1924 classid = classid|q->hgenerator;
1925 }
1926
1927 parent = &q->link;
1928 if (parentid) {
1929 parent = cbq_class_lookup(q, parentid);
1930 err = -EINVAL;
1931 if (parent == NULL)
1932 goto failure;
1933 }
1934
1935 err = -ENOBUFS;
Panagiotis Issaris0da974f2006-07-21 14:51:30 -07001936 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001937 if (cl == NULL)
1938 goto failure;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001939 cl->R_tab = rtab;
1940 rtab = NULL;
1941 cl->refcnt = 1;
Patrick McHardy9f9afec2006-11-29 17:35:18 -08001942 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001943 cl->q = &noop_qdisc;
1944 cl->classid = classid;
1945 cl->tparent = parent;
1946 cl->qdisc = sch;
1947 cl->allot = parent->allot;
1948 cl->quantum = cl->allot;
1949 cl->weight = cl->R_tab->rate.rate;
1950 cl->stats_lock = &sch->dev->queue_lock;
1951
1952 sch_tree_lock(sch);
1953 cbq_link_class(cl);
1954 cl->borrow = cl->tparent;
1955 if (cl->tparent != &q->link)
1956 cl->share = cl->tparent;
1957 cbq_adjust_levels(parent);
1958 cl->minidle = -0x7FFFFFFF;
1959 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1960 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1961 if (cl->ewma_log==0)
1962 cl->ewma_log = q->link.ewma_log;
1963 if (cl->maxidle==0)
1964 cl->maxidle = q->link.maxidle;
1965 if (cl->avpkt==0)
1966 cl->avpkt = q->link.avpkt;
1967 cl->overlimit = cbq_ovl_classic;
1968 if (tb[TCA_CBQ_OVL_STRATEGY-1])
1969 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1970#ifdef CONFIG_NET_CLS_POLICE
1971 if (tb[TCA_CBQ_POLICE-1])
1972 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1973#endif
1974 if (tb[TCA_CBQ_FOPT-1])
1975 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1976 sch_tree_unlock(sch);
1977
1978#ifdef CONFIG_NET_ESTIMATOR
1979 if (tca[TCA_RATE-1])
1980 gen_new_estimator(&cl->bstats, &cl->rate_est,
1981 cl->stats_lock, tca[TCA_RATE-1]);
1982#endif
1983
1984 *arg = (unsigned long)cl;
1985 return 0;
1986
1987failure:
1988 qdisc_put_rtab(rtab);
1989 return err;
1990}
1991
1992static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1993{
1994 struct cbq_sched_data *q = qdisc_priv(sch);
1995 struct cbq_class *cl = (struct cbq_class*)arg;
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08001996 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001997
1998 if (cl->filters || cl->children || cl == &q->link)
1999 return -EBUSY;
2000
2001 sch_tree_lock(sch);
2002
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08002003 qlen = cl->q->q.qlen;
2004 qdisc_reset(cl->q);
2005 qdisc_tree_decrease_qlen(cl->q, qlen);
2006
Linus Torvalds1da177e2005-04-16 15:20:36 -07002007 if (cl->next_alive)
2008 cbq_deactivate_class(cl);
2009
2010 if (q->tx_borrowed == cl)
2011 q->tx_borrowed = q->tx_class;
2012 if (q->tx_class == cl) {
2013 q->tx_class = NULL;
2014 q->tx_borrowed = NULL;
2015 }
2016#ifdef CONFIG_NET_CLS_POLICE
2017 if (q->rx_class == cl)
2018 q->rx_class = NULL;
2019#endif
2020
2021 cbq_unlink_class(cl);
2022 cbq_adjust_levels(cl->tparent);
2023 cl->defmap = 0;
2024 cbq_sync_defmap(cl);
2025
2026 cbq_rmprio(q, cl);
2027 sch_tree_unlock(sch);
2028
2029 if (--cl->refcnt == 0)
2030 cbq_destroy_class(sch, cl);
2031
2032 return 0;
2033}
2034
2035static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
2036{
2037 struct cbq_sched_data *q = qdisc_priv(sch);
2038 struct cbq_class *cl = (struct cbq_class *)arg;
2039
2040 if (cl == NULL)
2041 cl = &q->link;
2042
2043 return &cl->filter_list;
2044}
2045
2046static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
2047 u32 classid)
2048{
2049 struct cbq_sched_data *q = qdisc_priv(sch);
2050 struct cbq_class *p = (struct cbq_class*)parent;
2051 struct cbq_class *cl = cbq_class_lookup(q, classid);
2052
2053 if (cl) {
2054 if (p && p->level <= cl->level)
2055 return 0;
2056 cl->filters++;
2057 return (unsigned long)cl;
2058 }
2059 return 0;
2060}
2061
2062static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
2063{
2064 struct cbq_class *cl = (struct cbq_class*)arg;
2065
2066 cl->filters--;
2067}
2068
2069static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2070{
2071 struct cbq_sched_data *q = qdisc_priv(sch);
2072 unsigned h;
2073
2074 if (arg->stop)
2075 return;
2076
2077 for (h = 0; h < 16; h++) {
2078 struct cbq_class *cl;
2079
2080 for (cl = q->classes[h]; cl; cl = cl->next) {
2081 if (arg->count < arg->skip) {
2082 arg->count++;
2083 continue;
2084 }
2085 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2086 arg->stop = 1;
2087 return;
2088 }
2089 arg->count++;
2090 }
2091 }
2092}
2093
2094static struct Qdisc_class_ops cbq_class_ops = {
2095 .graft = cbq_graft,
2096 .leaf = cbq_leaf,
Jarek Poplawskia37ef2e2006-12-08 00:25:55 -08002097 .qlen_notify = cbq_qlen_notify,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002098 .get = cbq_get,
2099 .put = cbq_put,
2100 .change = cbq_change_class,
2101 .delete = cbq_delete,
2102 .walk = cbq_walk,
2103 .tcf_chain = cbq_find_tcf,
2104 .bind_tcf = cbq_bind_filter,
2105 .unbind_tcf = cbq_unbind_filter,
2106 .dump = cbq_dump_class,
2107 .dump_stats = cbq_dump_class_stats,
2108};
2109
2110static struct Qdisc_ops cbq_qdisc_ops = {
2111 .next = NULL,
2112 .cl_ops = &cbq_class_ops,
2113 .id = "cbq",
2114 .priv_size = sizeof(struct cbq_sched_data),
2115 .enqueue = cbq_enqueue,
2116 .dequeue = cbq_dequeue,
2117 .requeue = cbq_requeue,
2118 .drop = cbq_drop,
2119 .init = cbq_init,
2120 .reset = cbq_reset,
2121 .destroy = cbq_destroy,
2122 .change = NULL,
2123 .dump = cbq_dump,
2124 .dump_stats = cbq_dump_stats,
2125 .owner = THIS_MODULE,
2126};
2127
2128static int __init cbq_module_init(void)
2129{
2130 return register_qdisc(&cbq_qdisc_ops);
2131}
YOSHIFUJI Hideaki10297b92007-02-09 23:25:16 +09002132static void __exit cbq_module_exit(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002133{
2134 unregister_qdisc(&cbq_qdisc_ops);
2135}
2136module_init(cbq_module_init)
2137module_exit(cbq_module_exit)
2138MODULE_LICENSE("GPL");