ip_dummynet.c source code [xnu/bsd/netinet/ip_dummynet.c]

1	/*
2	* Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
30	* Portions Copyright (c) 2000 Akamba Corp.
31	* All rights reserved
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	*
42	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52	* SUCH DAMAGE.
53	*
54	* $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
55	*/
56
57	#define DUMMYNET_DEBUG
58
59	/*
60	* This module implements IP dummynet, a bandwidth limiter/delay emulator
61	* used in conjunction with the ipfw package.
62	* Description of the data structures used is in ip_dummynet.h
63	* Here you mainly find the following blocks of code:
64	* + variable declarations;
65	* + heap management functions;
66	* + scheduler and dummynet functions;
67	* + configuration and initialization.
68	*
69	* NOTA BENE: critical sections are protected by the "dummynet lock".
70	*
71	* Most important Changes:
72	*
73	* 010124: Fixed WF2Q behaviour
74	* 010122: Fixed spl protection.
75	* 000601: WF2Q support
76	* 000106: large rewrite, use heaps to handle very many pipes.
77	* 980513: initial release
78	*
79	* include files marked with XXX are probably not needed
80	*/
81
82	#include <sys/param.h>
83	#include <sys/systm.h>
84	#include <sys/malloc.h>
85	#include <sys/mbuf.h>
86	#include <sys/queue.h> /* XXX */
87	#include <sys/kernel.h>
88	#include <sys/random.h>
89	#include <sys/socket.h>
90	#include <sys/socketvar.h>
91	#include <sys/time.h>
92	#include <sys/sysctl.h>
93	#include <net/if.h>
94	#include <net/route.h>
95	#include <net/kpi_protocol.h>
96	#if DUMMYNET
97	#include <net/kpi_protocol.h>
98	#endif /* DUMMYNET */
99	#include <net/nwk_wq.h>
100	#include <net/pfvar.h>
101	#include <netinet/in.h>
102	#include <netinet/in_systm.h>
103	#include <netinet/in_var.h>
104	#include <netinet/ip.h>
105	#include <netinet/ip_fw.h>
106	#include <netinet/ip_dummynet.h>
107	#include <netinet/ip_var.h>
108
109	#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
110	#include <netinet6/ip6_var.h>
111
112	static struct ip_fw default_rule;
113
114	/*
115	* We keep a private variable for the simulation time, but we could
116	* probably use an existing one ("softticks" in sys/kern/kern_timer.c)
117	*/
118	static dn_key curr_time = `0` ; / current simulation time /
119
120	/ this is for the timer that fires to call dummynet() - we only enable the timer when*
121	there are packets to process, otherwise it's disabled /*
122	static int timer_enabled = `0`;
123
124	static int dn_hash_size = `64` ; / default hash size /
125
126	/ statistics on number of queue searches and search steps /
127	static int searches, search_steps ;
128	static int pipe_expire = `1` ; / expire queue if empty /
129	static int dn_max_ratio = `16` ; / max queues/buckets ratio /
130
131	static int red_lookup_depth = `256`; / RED - default lookup table depth /
132	static int red_avg_pkt_size = `512`; / RED - default medium packet size /
133	static int red_max_pkt_size = `1500`; / RED - default max packet size /
134
135	static int serialize = `0`;
136
137	/*
138	* Three heaps contain queues and pipes that the scheduler handles:
139	*
140	* ready_heap contains all dn_flow_queue related to fixed-rate pipes.
141	*
142	* wfq_ready_heap contains the pipes associated with WF2Q flows
143	*
144	* extract_heap contains pipes associated with delay lines.
145	*
146	*/
147	static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
148
149	static int heap_init(struct dn_heap h, int* size) ;
150	static int heap_insert (struct dn_heap h, dn_key key1, void* *p);
151	static void heap_extract(struct dn_heap h, void* *obj);
152
153
154	static void transmit_event(struct dn_pipe pipe, struct* mbuf **head,
155	struct mbuf **tail);
156	static void ready_event(struct dn_flow_queue q, struct* mbuf **head,
157	struct mbuf **tail);
158	static void ready_event_wfq(struct dn_pipe p, struct* mbuf **head,
159	struct mbuf **tail);
160
161	/*
162	* Packets are retrieved from queues in Dummynet in chains instead of
163	* packet-by-packet. The entire list of packets is first dequeued and
164	* sent out by the following function.
165	*/
166	static void dummynet_send(struct mbuf *m);
167
168	#define HASHSIZE 16
169	#define HASH(num) ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
170	static struct dn_pipe_head pipehash[HASHSIZE]; / all pipes /
171	static struct dn_flow_set_head flowsethash[HASHSIZE]; / all flowsets /
172
#ifdef SYSCTL_NODE
/* net.inet.ip.dummynet.* tunables and read-only counters. */
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size");
SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time,
    CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
    CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
    CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0,
    "Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size");
#endif

#ifdef DUMMYNET_DEBUG
int	dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &dummynet_debug,
    0, "control debugging printfs");
#endif
/*
 * DPRINTF((fmt, args...)) prints only when dummynet_debug is non-zero.
 * Wrapped in do/while(0) so it behaves as one statement and cannot
 * capture an `else' that follows it.
 */
#define DPRINTF(X)	do { if (dummynet_debug) printf X; } while (0)
#else
#define DPRINTF(X)	do { } while (0)
#endif
211
212	/ dummynet lock /
213	static lck_grp_t *dn_mutex_grp;
214	static lck_grp_attr_t *dn_mutex_grp_attr;
215	static lck_attr_t *dn_mutex_attr;
216	decl_lck_mtx_data(static, dn_mutex_data);
217	static lck_mtx_t *dn_mutex = &dn_mutex_data;
218
219	static int config_pipe(struct dn_pipe *p);
220	static int ip_dn_ctl(struct sockopt *sopt);
221
222	static void dummynet(void *);
223	static void dummynet_flush(void);
224	void dummynet_drain(void);
225	static ip_dn_io_t dummynet_io;
226
227	static void cp_flow_set_to_64_user(struct dn_flow_set set, struct* dn_flow_set_64 *fs_bp);
228	static void cp_queue_to_64_user( struct dn_flow_queue q, struct* dn_flow_queue_64 *qp);
229	static char cp_pipe_to_64_user(struct* dn_pipe p, struct* dn_pipe_64 *pipe_bp);
230	static char* dn_copy_set_64(struct dn_flow_set set, char* *bp);
231	static int cp_pipe_from_user_64( struct sockopt sopt, struct* dn_pipe *p );
232
233	static void cp_flow_set_to_32_user(struct dn_flow_set set, struct* dn_flow_set_32 *fs_bp);
234	static void cp_queue_to_32_user( struct dn_flow_queue q, struct* dn_flow_queue_32 *qp);
235	static char cp_pipe_to_32_user(struct* dn_pipe p, struct* dn_pipe_32 *pipe_bp);
236	static char* dn_copy_set_32(struct dn_flow_set set, char* *bp);
237	static int cp_pipe_from_user_32( struct sockopt sopt, struct* dn_pipe *p );
238
239	struct eventhandler_lists_ctxt dummynet_evhdlr_ctxt;
240
/*
 * Return a random value with the top bit cleared, i.e. in the range
 * 0 .. 0x7FFFFFFF.  The raw bits come from read_frandom().
 */
uint32_t
my_random(void)
{
	uint32_t rnd;

	read_frandom(&rnd, sizeof(rnd));

	return (rnd & 0x7FFFFFFF);
}
249
250	/*
251	* Heap management functions.
252	*
253	* In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
254	* Some macros help finding parent/children so we can optimize them.
255	*
256	* heap_init() is called to expand the heap when needed.
257	* Increment size in blocks of 16 entries.
258	* XXX failure to allocate a new element is a pretty bad failure
259	* as we basically stall a whole queue forever!!
260	* Returns 1 on error, 0 on success
261	*/
/* Index arithmetic for a binary heap stored in an array rooted at 0. */
#define HEAP_FATHER(x)		(((x) - 1) / 2)
#define HEAP_LEFT(x)		(2 * (x) + 1)
#define HEAP_IS_LEFT(x)		((x) & 1)
#define HEAP_RIGHT(x)		(2 * (x) + 2)
/*
 * Exchange a and b using buffer as scratch space.  do/while(0) makes
 * the macro a single statement, so it is safe in an unbraced if/else.
 */
#define HEAP_SWAP(a, b, buffer)	\
	do { (buffer) = (a); (a) = (b); (b) = (buffer); } while (0)
/* Mask used by heap_init() to round sizes up to a multiple of 16. */
#define HEAP_INCREMENT		15
268
269
270	int cp_pipe_from_user_32( struct sockopt sopt, struct* dn_pipe *p )
271	{
272	struct dn_pipe_32 user_pipe_32;
273	int error=`0`;
274
275	error = sooptcopyin(sopt, &user_pipe_32, sizeof(struct dn_pipe_32), sizeof(struct dn_pipe_32));
276	if ( !error ){
277	p->pipe_nr = user_pipe_32.pipe_nr;
278	p->bandwidth = user_pipe_32.bandwidth;
279	p->delay = user_pipe_32.delay;
280	p->V = user_pipe_32.V;
281	p->sum = user_pipe_32.sum;
282	p->numbytes = user_pipe_32.numbytes;
283	p->sched_time = user_pipe_32.sched_time;
284	bcopy( user_pipe_32.if_name, p->if_name, IFNAMSIZ);
285	p->ready = user_pipe_32.ready;
286
287	p->fs.fs_nr = user_pipe_32.fs.fs_nr;
288	p->fs.flags_fs = user_pipe_32.fs.flags_fs;
289	p->fs.parent_nr = user_pipe_32.fs.parent_nr;
290	p->fs.weight = user_pipe_32.fs.weight;
291	p->fs.qsize = user_pipe_32.fs.qsize;
292	p->fs.plr = user_pipe_32.fs.plr;
293	p->fs.flow_mask = user_pipe_32.fs.flow_mask;
294	p->fs.rq_size = user_pipe_32.fs.rq_size;
295	p->fs.rq_elements = user_pipe_32.fs.rq_elements;
296	p->fs.last_expired = user_pipe_32.fs.last_expired;
297	p->fs.backlogged = user_pipe_32.fs.backlogged;
298	p->fs.w_q = user_pipe_32.fs.w_q;
299	p->fs.max_th = user_pipe_32.fs.max_th;
300	p->fs.min_th = user_pipe_32.fs.min_th;
301	p->fs.max_p = user_pipe_32.fs.max_p;
302	p->fs.c_1 = user_pipe_32.fs.c_1;
303	p->fs.c_2 = user_pipe_32.fs.c_2;
304	p->fs.c_3 = user_pipe_32.fs.c_3;
305	p->fs.c_4 = user_pipe_32.fs.c_4;
306	p->fs.lookup_depth = user_pipe_32.fs.lookup_depth;
307	p->fs.lookup_step = user_pipe_32.fs.lookup_step;
308	p->fs.lookup_weight = user_pipe_32.fs.lookup_weight;
309	p->fs.avg_pkt_size = user_pipe_32.fs.avg_pkt_size;
310	p->fs.max_pkt_size = user_pipe_32.fs.max_pkt_size;
311	}
312	return error;
313	}
314
315
316	int cp_pipe_from_user_64( struct sockopt sopt, struct* dn_pipe *p )
317	{
318	struct dn_pipe_64 user_pipe_64;
319	int error=`0`;
320
321	error = sooptcopyin(sopt, &user_pipe_64, sizeof(struct dn_pipe_64), sizeof(struct dn_pipe_64));
322	if ( !error ){
323	p->pipe_nr = user_pipe_64.pipe_nr;
324	p->bandwidth = user_pipe_64.bandwidth;
325	p->delay = user_pipe_64.delay;
326	p->V = user_pipe_64.V;
327	p->sum = user_pipe_64.sum;
328	p->numbytes = user_pipe_64.numbytes;
329	p->sched_time = user_pipe_64.sched_time;
330	bcopy( user_pipe_64.if_name, p->if_name, IFNAMSIZ);
331	p->ready = user_pipe_64.ready;
332
333	p->fs.fs_nr = user_pipe_64.fs.fs_nr;
334	p->fs.flags_fs = user_pipe_64.fs.flags_fs;
335	p->fs.parent_nr = user_pipe_64.fs.parent_nr;
336	p->fs.weight = user_pipe_64.fs.weight;
337	p->fs.qsize = user_pipe_64.fs.qsize;
338	p->fs.plr = user_pipe_64.fs.plr;
339	p->fs.flow_mask = user_pipe_64.fs.flow_mask;
340	p->fs.rq_size = user_pipe_64.fs.rq_size;
341	p->fs.rq_elements = user_pipe_64.fs.rq_elements;
342	p->fs.last_expired = user_pipe_64.fs.last_expired;
343	p->fs.backlogged = user_pipe_64.fs.backlogged;
344	p->fs.w_q = user_pipe_64.fs.w_q;
345	p->fs.max_th = user_pipe_64.fs.max_th;
346	p->fs.min_th = user_pipe_64.fs.min_th;
347	p->fs.max_p = user_pipe_64.fs.max_p;
348	p->fs.c_1 = user_pipe_64.fs.c_1;
349	p->fs.c_2 = user_pipe_64.fs.c_2;
350	p->fs.c_3 = user_pipe_64.fs.c_3;
351	p->fs.c_4 = user_pipe_64.fs.c_4;
352	p->fs.lookup_depth = user_pipe_64.fs.lookup_depth;
353	p->fs.lookup_step = user_pipe_64.fs.lookup_step;
354	p->fs.lookup_weight = user_pipe_64.fs.lookup_weight;
355	p->fs.avg_pkt_size = user_pipe_64.fs.avg_pkt_size;
356	p->fs.max_pkt_size = user_pipe_64.fs.max_pkt_size;
357	}
358	return error;
359	}
360
361	static void
362	cp_flow_set_to_32_user(struct dn_flow_set set, struct* dn_flow_set_32 *fs_bp)
363	{
364	fs_bp->fs_nr = set->fs_nr;
365	fs_bp->flags_fs = set->flags_fs ;
366	fs_bp->parent_nr = set->parent_nr ;
367	fs_bp->weight = set->weight ;
368	fs_bp->qsize = set->qsize ;
369	fs_bp->plr = set->plr ;
370	fs_bp->flow_mask = set->flow_mask ;
371	fs_bp->rq_size = set->rq_size ;
372	fs_bp->rq_elements = set->rq_elements ;
373	fs_bp->last_expired = set->last_expired ;
374	fs_bp->backlogged = set->backlogged ;
375	fs_bp->w_q = set->w_q ;
376	fs_bp->max_th = set->max_th ;
377	fs_bp->min_th = set->min_th ;
378	fs_bp->max_p = set->max_p ;
379	fs_bp->c_1 = set->c_1 ;
380	fs_bp->c_2 = set->c_2 ;
381	fs_bp->c_3 = set->c_3 ;
382	fs_bp->c_4 = set->c_4 ;
383	fs_bp->w_q_lookup = CAST_DOWN_EXPLICIT(user32_addr_t, set->w_q_lookup) ;
384	fs_bp->lookup_depth = set->lookup_depth ;
385	fs_bp->lookup_step = set->lookup_step ;
386	fs_bp->lookup_weight = set->lookup_weight ;
387	fs_bp->avg_pkt_size = set->avg_pkt_size ;
388	fs_bp->max_pkt_size = set->max_pkt_size ;
389	}
390
391	static void
392	cp_flow_set_to_64_user(struct dn_flow_set set, struct* dn_flow_set_64 *fs_bp)
393	{
394	fs_bp->fs_nr = set->fs_nr;
395	fs_bp->flags_fs = set->flags_fs ;
396	fs_bp->parent_nr = set->parent_nr ;
397	fs_bp->weight = set->weight ;
398	fs_bp->qsize = set->qsize ;
399	fs_bp->plr = set->plr ;
400	fs_bp->flow_mask = set->flow_mask ;
401	fs_bp->rq_size = set->rq_size ;
402	fs_bp->rq_elements = set->rq_elements ;
403	fs_bp->last_expired = set->last_expired ;
404	fs_bp->backlogged = set->backlogged ;
405	fs_bp->w_q = set->w_q ;
406	fs_bp->max_th = set->max_th ;
407	fs_bp->min_th = set->min_th ;
408	fs_bp->max_p = set->max_p ;
409	fs_bp->c_1 = set->c_1 ;
410	fs_bp->c_2 = set->c_2 ;
411	fs_bp->c_3 = set->c_3 ;
412	fs_bp->c_4 = set->c_4 ;
413	fs_bp->w_q_lookup = CAST_DOWN(user64_addr_t, set->w_q_lookup) ;
414	fs_bp->lookup_depth = set->lookup_depth ;
415	fs_bp->lookup_step = set->lookup_step ;
416	fs_bp->lookup_weight = set->lookup_weight ;
417	fs_bp->avg_pkt_size = set->avg_pkt_size ;
418	fs_bp->max_pkt_size = set->max_pkt_size ;
419	}
420
421	static
422	void cp_queue_to_32_user( struct dn_flow_queue q, struct* dn_flow_queue_32 *qp)
423	{
424	qp->id = q->id;
425	qp->len = q->len;
426	qp->len_bytes = q->len_bytes;
427	qp->numbytes = q->numbytes;
428	qp->tot_pkts = q->tot_pkts;
429	qp->tot_bytes = q->tot_bytes;
430	qp->drops = q->drops;
431	qp->hash_slot = q->hash_slot;
432	qp->avg = q->avg;
433	qp->count = q->count;
434	qp->random = q->random;
435	qp->q_time = q->q_time;
436	qp->heap_pos = q->heap_pos;
437	qp->sched_time = q->sched_time;
438	qp->S = q->S;
439	qp->F = q->F;
440	}
441
442	static
443	void cp_queue_to_64_user( struct dn_flow_queue q, struct* dn_flow_queue_64 *qp)
444	{
445	qp->id = q->id;
446	qp->len = q->len;
447	qp->len_bytes = q->len_bytes;
448	qp->numbytes = q->numbytes;
449	qp->tot_pkts = q->tot_pkts;
450	qp->tot_bytes = q->tot_bytes;
451	qp->drops = q->drops;
452	qp->hash_slot = q->hash_slot;
453	qp->avg = q->avg;
454	qp->count = q->count;
455	qp->random = q->random;
456	qp->q_time = q->q_time;
457	qp->heap_pos = q->heap_pos;
458	qp->sched_time = q->sched_time;
459	qp->S = q->S;
460	qp->F = q->F;
461	}
462
463	static
464	char cp_pipe_to_32_user(struct* dn_pipe p, struct* dn_pipe_32 *pipe_bp)
465	{
466	char *bp;
467
468	pipe_bp->pipe_nr = p->pipe_nr;
469	pipe_bp->bandwidth = p->bandwidth;
470	pipe_bp->delay = p->delay;
471	bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_32));
472	pipe_bp->scheduler_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->scheduler_heap.p);
473	bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_32));
474	pipe_bp->not_eligible_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->not_eligible_heap.p);
475	bcopy( &(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_32));
476	pipe_bp->idle_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->idle_heap.p);
477	pipe_bp->V = p->V;
478	pipe_bp->sum = p->sum;
479	pipe_bp->numbytes = p->numbytes;
480	pipe_bp->sched_time = p->sched_time;
481	bcopy( p->if_name, pipe_bp->if_name, IFNAMSIZ);
482	pipe_bp->ifp = CAST_DOWN_EXPLICIT(user32_addr_t, p->ifp);
483	pipe_bp->ready = p->ready;
484
485	cp_flow_set_to_32_user( &(p->fs), &(pipe_bp->fs));
486
487	pipe_bp->delay = (pipe_bp->delay * `1000`) / (hz*`10`) ;
488	/*
489	* XXX the following is a hack based on ->next being the
490	* first field in dn_pipe and dn_flow_set. The correct
491	* solution would be to move the dn_flow_set to the beginning
492	* of struct dn_pipe.
493	*/
494	pipe_bp->next = CAST_DOWN_EXPLICIT( user32_addr_t, DN_IS_PIPE );
495	/ clean pointers /
496	pipe_bp->head = pipe_bp->tail = (user32_addr_t) `0` ;
497	pipe_bp->fs.next = (user32_addr_t)`0` ;
498	pipe_bp->fs.pipe = (user32_addr_t)`0` ;
499	pipe_bp->fs.rq = (user32_addr_t)`0` ;
500	bp = ((char )pipe_bp) + sizeof(struct* dn_pipe_32);
501	return( dn_copy_set_32( &(p->fs), bp) );
502	}
503
504	static
505	char cp_pipe_to_64_user(struct* dn_pipe p, struct* dn_pipe_64 *pipe_bp)
506	{
507	char *bp;
508
509	pipe_bp->pipe_nr = p->pipe_nr;
510	pipe_bp->bandwidth = p->bandwidth;
511	pipe_bp->delay = p->delay;
512	bcopy( &(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_64));
513	pipe_bp->scheduler_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->scheduler_heap.p);
514	bcopy( &(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_64));
515	pipe_bp->not_eligible_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->not_eligible_heap.p);
516	bcopy( &(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_64));
517	pipe_bp->idle_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->idle_heap.p);
518	pipe_bp->V = p->V;
519	pipe_bp->sum = p->sum;
520	pipe_bp->numbytes = p->numbytes;
521	pipe_bp->sched_time = p->sched_time;
522	bcopy( p->if_name, pipe_bp->if_name, IFNAMSIZ);
523	pipe_bp->ifp = CAST_DOWN(user64_addr_t, p->ifp);
524	pipe_bp->ready = p->ready;
525
526	cp_flow_set_to_64_user( &(p->fs), &(pipe_bp->fs));
527
528	pipe_bp->delay = (pipe_bp->delay * `1000`) / (hz*`10`) ;
529	/*
530	* XXX the following is a hack based on ->next being the
531	* first field in dn_pipe and dn_flow_set. The correct
532	* solution would be to move the dn_flow_set to the beginning
533	* of struct dn_pipe.
534	*/
535	pipe_bp->next = CAST_DOWN( user64_addr_t, DN_IS_PIPE );
536	/ clean pointers /
537	pipe_bp->head = pipe_bp->tail = USER_ADDR_NULL ;
538	pipe_bp->fs.next = USER_ADDR_NULL ;
539	pipe_bp->fs.pipe = USER_ADDR_NULL ;
540	pipe_bp->fs.rq = USER_ADDR_NULL ;
541	bp = ((char )pipe_bp) + sizeof(struct* dn_pipe_64);
542	return( dn_copy_set_64( &(p->fs), bp) );
543	}
544
545	static int
546	heap_init(struct dn_heap h, int* new_size)
547	{
548	struct dn_heap_entry *p;
549
550	if (h->size >= new_size ) {
551	printf("dummynet: heap_init, Bogus call, have %d want %d\n",
552	h->size, new_size);
553	return `0` ;
554	}
555	new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
556	p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
557	if (p == NULL) {
558	printf("dummynet: heap_init, resize %d failed\n", new_size );
559	return `1` ; / error /
560	}
561	if (h->size > `0`) {
562	bcopy(h->p, p, h->size * sizeof(*p) );
563	FREE(h->p, M_DUMMYNET);
564	}
565	h->p = p ;
566	h->size = new_size ;
567	return `0` ;
568	}
569
570	/*
571	* Insert element in heap. Normally, p != NULL, we insert p in
572	* a new position and bubble up. If p == NULL, then the element is
573	* already in place, and key is the position where to start the
574	* bubble-up.
575	* Returns 1 on failure (cannot allocate new heap entry)
576	*
577	* If offset > 0 the position (index, int) of the element in the heap is
578	* also stored in the element itself at the given offset in bytes.
579	*/
#define SET_OFFSET(heap, node)						\
	do {								\
		if ((heap)->offset > 0)					\
			*((int *)((char *)((heap)->p[node].object) +	\
			    (heap)->offset)) = (node);			\
	} while (0)
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node)					\
	do {								\
		if ((heap)->offset > 0)					\
			*((int *)((char *)((heap)->p[node].object) +	\
			    (heap)->offset)) = -1;			\
	} while (0)
589	static int
590	heap_insert(struct dn_heap h, dn_key key1, void* *p)
591	{
592	int son = h->elements ;
593
594	if (p == NULL) / data already there, set starting point /
595	son = key1 ;
596	else { / insert new element at the end, possibly resize /
597	son = h->elements ;
598	if (son == h->size) / need resize... /
599	if (heap_init(h, h->elements+`1`) )
600	return `1` ; / failure... /
601	h->p[son].object = p ;
602	h->p[son].key = key1 ;
603	h->elements++ ;
604	}
605	while (son > `0`) { / bubble up /
606	int father = HEAP_FATHER(son) ;
607	struct dn_heap_entry tmp ;
608
609	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
610	break ; / found right position /
611	/ son smaller than father, swap and repeat /
612	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
613	SET_OFFSET(h, son);
614	son = father ;
615	}
616	SET_OFFSET(h, son);
617	return `0` ;
618	}
619
620	/*
621	* remove top element from heap, or obj if obj != NULL
622	*/
623	static void
624	heap_extract(struct dn_heap h, void* *obj)
625	{
626	int child, father, maxelt = h->elements - `1` ;
627
628	if (maxelt < `0`) {
629	printf("dummynet: warning, extract from empty heap 0x%llx\n",
630	(uint64_t)VM_KERNEL_ADDRPERM(h));
631	return ;
632	}
633	father = `0` ; / default: move up smallest child /
634	if (obj != NULL) { / extract specific element, index is at offset /
635	if (h->offset <= `0`)
636	panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
637	father = ((int* )((char* *)obj + h->offset)) ;
638	if (father < `0` \|\| father >= h->elements) {
639	printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
640	father, h->elements);
641	panic("dummynet: heap_extract");
642	}
643	}
644	RESET_OFFSET(h, father);
645	child = HEAP_LEFT(father) ; / left child /
646	while (child <= maxelt) { / valid entry /
647	if (child != maxelt && DN_KEY_LT(h->p[child+`1`].key, h->p[child].key) )
648	child = child+`1` ; / take right child, otherwise left /
649	h->p[father] = h->p[child] ;
650	SET_OFFSET(h, father);
651	father = child ;
652	child = HEAP_LEFT(child) ; / left child for next loop /
653	}
654	h->elements-- ;
655	if (father != maxelt) {
656	/*
657	* Fill hole with last entry and bubble up, reusing the insert code
658	*/
659	h->p[father] = h->p[maxelt] ;
660	heap_insert(h, father, NULL); / this one cannot fail /
661	}
662	}
663
664	/*
665	* heapify() will reorganize data inside an array to maintain the
666	* heap property. It is needed when we delete a bunch of entries.
667	*/
668	static void
669	heapify(struct dn_heap *h)
670	{
671	int i ;
672
673	for (i = `0` ; i < h->elements ; i++ )
674	heap_insert(h, i , NULL) ;
675	}
676
677	/*
678	* cleanup the heap and free data structure
679	*/
680	static void
681	heap_free(struct dn_heap *h)
682	{
683	if (h->size >`0` )
684	FREE(h->p, M_DUMMYNET);
685	bzero(h, sizeof(*h));
686	}
687
688	/*
689	* --- end of heap management functions ---
690	*/
691
692	/*
693	* Return the mbuf tag holding the dummynet state. As an optimization
694	* this is assumed to be the first tag on the list. If this turns out
695	* wrong we'll need to search the list.
696	*/
697	static struct dn_pkt_tag *
698	dn_tag_get(struct mbuf *m)
699	{
700	struct m_tag *mtag = m_tag_first(m);
701
702	if (!(mtag != NULL &&
703	mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
704	mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET))
705	panic("packet on dummynet queue w/o dummynet tag: 0x%llx",
706	(uint64_t)VM_KERNEL_ADDRPERM(m));
707
708	return (struct dn_pkt_tag *)(mtag+`1`);
709	}
710
711	/*
712	* Scheduler functions:
713	*
714	* transmit_event() is called when the delay-line needs to enter
715	* the scheduler, either because of existing pkts getting ready,
716	* or new packets entering the queue. The event handled is the delivery
717	* time of the packet.
718	*
719	* ready_event() does something similar with fixed-rate queues, and the
720	* event handled is the finish time of the head pkt.
721	*
722	* wfq_ready_event() does something similar with WF2Q queues, and the
723	* event handled is the start time of the head pkt.
724	*
725	* In all cases, we make sure that the data structures are consistent
726	* before passing pkts out, because this might trigger recursive
727	* invocations of the procedures.
728	*/
729	static void
730	transmit_event(struct dn_pipe pipe, struct* mbuf head, struct mbuf tail)
731	{
732	struct mbuf *m ;
733	struct dn_pkt_tag *pkt = NULL;
734	u_int64_t schedule_time;
735
736	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
737	ASSERT(serialize >= `0`);
738	if (serialize == `0`) {
739	while ((m = pipe->head) != NULL) {
740	pkt = dn_tag_get(m);
741	if (!DN_KEY_LEQ(pkt->dn_output_time, curr_time))
742	break;
743
744	pipe->head = m->m_nextpkt;
745	if (*tail != NULL)
746	(*tail)->m_nextpkt = m;
747	else
748	*head = m;
749	*tail = m;
750	}
751
752	if (*tail != NULL)
753	(*tail)->m_nextpkt = NULL;
754	}
755
756	schedule_time = pkt == NULL \|\| DN_KEY_LEQ(pkt->dn_output_time, curr_time) ?
757	curr_time + `1` : pkt->dn_output_time;
758
759	/ if there are leftover packets, put the pipe into the heap for next ready event /
760	if ((m = pipe->head) != NULL) {
761	pkt = dn_tag_get(m);
762	/ XXX should check errors on heap_insert, by draining the*
763	* whole pipe p and hoping in the future we are more successful
764	*/
765	heap_insert(&extract_heap, schedule_time, pipe);
766	}
767	}
768
769	/*
770	* the following macro computes how many ticks we have to wait
771	* before being able to transmit a packet. The credit is taken from
772	* either a pipe (WF2Q) or a flow_queue (per-flow queueing)
773	*/
774
/*
 * hz is 100, which gives a granularity of 10ms in the old timer.
 * The timer has been changed to fire every 1ms, so the use of
 * hz has been modified here. All instances of hz have been left
 * in place but adjusted by a factor of 10 so that hz is functionally
 * equal to 1000.
 */
/*
 * The expansion is one fully parenthesised expression with no trailing
 * semicolon, so it can be embedded in larger expressions; the division
 * rounds up.
 */
#define SET_TICKS(_m, q, p)	\
	(((_m)->m_pkthdr.len * 8 * (hz * 10) - (q)->numbytes +	\
	    (p)->bandwidth - 1) / (p)->bandwidth)
784
785	/*
786	* extract pkt from queue, compute output time (could be now)
787	* and put into delay line (p_queue)
788	*/
789	static void
790	move_pkt(struct mbuf pkt, struct* dn_flow_queue *q,
791	struct dn_pipe p, int* len)
792	{
793	struct dn_pkt_tag *dt = dn_tag_get(pkt);
794
795	q->head = pkt->m_nextpkt ;
796	q->len-- ;
797	q->len_bytes -= len ;
798
799	dt->dn_output_time = curr_time + p->delay ;
800
801	if (p->head == NULL)
802	p->head = pkt;
803	else
804	p->tail->m_nextpkt = pkt;
805	p->tail = pkt;
806	p->tail->m_nextpkt = NULL;
807	}
808
809	/*
810	* ready_event() is invoked every time the queue must enter the
811	* scheduler, either because the first packet arrives, or because
812	* a previously scheduled event fired.
813	* On invokation, drain as many pkts as possible (could be 0) and then
814	* if there are leftover packets reinsert the pkt in the scheduler.
815	*/
816	static void
817	ready_event(struct dn_flow_queue q, struct* mbuf head, struct mbuf tail)
818	{
819	struct mbuf *pkt;
820	struct dn_pipe *p = q->fs->pipe ;
821	int p_was_empty ;
822
823	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
824
825	if (p == NULL) {
826	printf("dummynet: ready_event pipe is gone\n");
827	return ;
828	}
829	p_was_empty = (p->head == NULL) ;
830
831	/*
832	* schedule fixed-rate queues linked to this pipe:
833	* Account for the bw accumulated since last scheduling, then
834	* drain as many pkts as allowed by q->numbytes and move to
835	* the delay line (in p) computing output time.
836	* bandwidth==0 (no limit) means we can drain the whole queue,
837	* setting len_scaled = 0 does the job.
838	*/
839	q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
840	while ( (pkt = q->head) != NULL ) {
841	int len = pkt->m_pkthdr.len;
842	int len_scaled = p->bandwidth ? len`8`(hz*`10`) : `0` ;
843	if (len_scaled > q->numbytes )
844	break ;
845	q->numbytes -= len_scaled ;
846	move_pkt(pkt, q, p, len);
847	}
848	/*
849	* If we have more packets queued, schedule next ready event
850	* (can only occur when bandwidth != 0, otherwise we would have
851	* flushed the whole queue in the previous loop).
852	* To this purpose we record the current time and compute how many
853	* ticks to go for the finish time of the packet.
854	*/
855	if ( (pkt = q->head) != NULL ) { / this implies bandwidth != 0 /
856	dn_key t = SET_TICKS(pkt, q, p); / ticks i have to wait /
857	q->sched_time = curr_time ;
858	heap_insert(&ready_heap, curr_time + t, (void *)q );
859	/ XXX should check errors on heap_insert, and drain the whole*
860	* queue on error hoping next time we are luckier.
861	*/
862	} else { / RED needs to know when the queue becomes empty /
863	q->q_time = curr_time;
864	q->numbytes = `0`;
865	}
866	/*
867	* If the delay line was empty call transmit_event(p) now.
868	* Otherwise, the scheduler will take care of it.
869	*/
870	if (p_was_empty)
871	transmit_event(p, head, tail);
872	}
873
874	/*
875	* Called when we can transmit packets on WF2Q queues. Take pkts out of
876	* the queues at their start time, and enqueue into the delay line.
877	* Packets are drained until p->numbytes < 0. As long as
878	* len_scaled >= p->numbytes, the packet goes into the delay line
879	* with a deadline p->delay. For the last packet, if p->numbytes<0,
880	* there is an additional delay.
881	*/
882	static void
883	ready_event_wfq(struct dn_pipe p, struct* mbuf head, struct mbuf tail)
884	{
885	int p_was_empty = (p->head == NULL) ;
886	struct dn_heap *sch = &(p->scheduler_heap);
887	struct dn_heap *neh = &(p->not_eligible_heap) ;
888	int64_t p_numbytes = p->numbytes;
889
890	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
891
892	if (p->if_name[`0`] == `0`) / tx clock is simulated /
893	p_numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
894	else { / tx clock is for real, the ifq must be empty or this is a NOP /
895	if (p->ifp && !IFCQ_IS_EMPTY(&p->ifp->if_snd))
896	return ;
897	else {
898	DPRINTF(("dummynet: pipe %d ready from %s --\n",
899	p->pipe_nr, p->if_name));
900	}
901	}
902
903	/*
904	* While we have backlogged traffic AND credit, we need to do
905	* something on the queue.
906	*/
907	while ( p_numbytes >=`0` && (sch->elements>`0` \|\| neh->elements >`0`) ) {
908	if (sch->elements > `0`) { / have some eligible pkts to send out /
909	struct dn_flow_queue *q = sch->p[`0`].object ;
910	struct mbuf *pkt = q->head;
911	struct dn_flow_set *fs = q->fs;
912	u_int64_t len = pkt->m_pkthdr.len;
913	int len_scaled = p->bandwidth ? len`8`(hz*`10`) : `0` ;
914
915	heap_extract(sch, NULL); / remove queue from heap /
916	p_numbytes -= len_scaled ;
917	move_pkt(pkt, q, p, len);
918
919	p->V += (len<<MY_M) / p->sum ; / update V /
920	q->S = q->F ; / update start time /
921	if (q->len == `0`) { / Flow not backlogged any more /
922	fs->backlogged-- ;
923	heap_insert(&(p->idle_heap), q->F, q);
924	} else { / still backlogged /
925	/*
926	* update F and position in backlogged queue, then
927	* put flow in not_eligible_heap (we will fix this later).
928	*/
929	len = (q->head)->m_pkthdr.len;
930	q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
931	if (DN_KEY_LEQ(q->S, p->V))
932	heap_insert(neh, q->S, q);
933	else
934	heap_insert(sch, q->F, q);
935	}
936	}
937	/*
938	* now compute V = max(V, min(S_i)). Remember that all elements in sch
939	* have by definition S_i <= V so if sch is not empty, V is surely
940	* the max and we must not update it. Conversely, if sch is empty
941	* we only need to look at neh.
942	*/
943	if (sch->elements == `0` && neh->elements > `0`)
944	p->V = MAX64 ( p->V, neh->p[`0`].key );
945	/ move from neh to sch any packets that have become eligible /
946	while (neh->elements > `0` && DN_KEY_LEQ(neh->p[`0`].key, p->V) ) {
947	struct dn_flow_queue *q = neh->p[`0`].object ;
948	heap_extract(neh, NULL);
949	heap_insert(sch, q->F, q);
950	}
951
952	if (p->if_name[`0`] != `'\0'`) {/ tx clock is from a real thing /
953	p_numbytes = -`1` ; / mark not ready for I/O /
954	break ;
955	}
956	}
957	if (sch->elements == `0` && neh->elements == `0` && p_numbytes >= `0`
958	&& p->idle_heap.elements > `0`) {
959	/*
960	* no traffic and no events scheduled. We can get rid of idle-heap.
961	*/
962	int i ;
963
964	for (i = `0` ; i < p->idle_heap.elements ; i++) {
965	struct dn_flow_queue *q = p->idle_heap.p[i].object ;
966
967	q->F = `0` ;
968	q->S = q->F + `1` ;
969	}
970	p->sum = `0` ;
971	p->V = `0` ;
972	p->idle_heap.elements = `0` ;
973	}
974	/*
975	* If we are getting clocks from dummynet (not a real interface) and
976	* If we are under credit, schedule the next ready event.
977	* Also fix the delivery time of the last packet.
978	*/
979	if (p->if_name[`0`]==`0` && p_numbytes < `0`) { / this implies bandwidth >0 /
980	dn_key t=`0` ; / number of ticks i have to wait /
981
982	if (p->bandwidth > `0`)
983	t = ( p->bandwidth -`1` - p_numbytes) / p->bandwidth ;
984	dn_tag_get(p->tail)->dn_output_time += t ;
985	p->sched_time = curr_time ;
986	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
987	/ XXX should check errors on heap_insert, and drain the whole*
988	* queue on error hoping next time we are luckier.
989	*/
990	}
991
992	/ Fit (adjust if necessary) 64bit result into 32bit variable. /
993	if (p_numbytes > INT_MAX)
994	p->numbytes = INT_MAX;
995	else if (p_numbytes < INT_MIN)
996	p->numbytes = INT_MIN;
997	else
998	p->numbytes = p_numbytes;
999
1000	/*
1001	* If the delay line was empty call transmit_event(p) now.
1002	* Otherwise, the scheduler will take care of it.
1003	*/
1004	if (p_was_empty)
1005	transmit_event(p, head, tail);
1006
1007	}
1008
1009	/*
1010	* This is called every 1ms. It is used to
1011	* increment the current tick counter and schedule expired events.
1012	*/
1013	static void
1014	dummynet(__unused void * unused)
1015	{
1016	void p ; /* generic parameter to handler /
1017	struct dn_heap *h ;
1018	struct dn_heap *heaps[`3`];
1019	struct mbuf head = NULL, tail = NULL;
1020	int i;
1021	struct dn_pipe *pe ;
1022	struct timespec ts;
1023	struct timeval tv;
1024
1025	heaps[`0`] = &ready_heap ; / fixed-rate queues /
1026	heaps[`1`] = &wfq_ready_heap ; / wfq queues /
1027	heaps[`2`] = &extract_heap ; / delay line /
1028
1029	lck_mtx_lock(dn_mutex);
1030
1031	/ make all time measurements in milliseconds (ms) -*
1032	* here we convert secs and usecs to msecs (just divide the
1033	* usecs and take the closest whole number).
1034	*/
1035	microuptime(&tv);
1036	curr_time = (tv.tv_sec * `1000`) + (tv.tv_usec / `1000`);
1037
1038	for (i=`0`; i < `3` ; i++) {
1039	h = heaps[i];
1040	while (h->elements > `0` && DN_KEY_LEQ(h->p[`0`].key, curr_time) ) {
1041	if (h->p[`0`].key > curr_time)
1042	printf("dummynet: warning, heap %d is %d ticks late\n",
1043	i, (int)(curr_time - h->p[`0`].key));
1044	p = h->p[`0`].object ; / store a copy before heap_extract /
1045	heap_extract(h, NULL); / need to extract before processing /
1046	if (i == `0`)
1047	ready_event(p, &head, &tail) ;
1048	else if (i == `1`) {
1049	struct dn_pipe *pipe = p;
1050	if (pipe->if_name[`0`] != `'\0'`)
1051	printf("dummynet: bad ready_event_wfq for pipe %s\n",
1052	pipe->if_name);
1053	else
1054	ready_event_wfq(p, &head, &tail) ;
1055	} else {
1056	transmit_event(p, &head, &tail);
1057	}
1058	}
1059	}
1060	/ sweep pipes trying to expire idle flow_queues /
1061	for (i = `0`; i < HASHSIZE; i++)
1062	SLIST_FOREACH(pe, &pipehash[i], next)
1063	if (pe->idle_heap.elements > `0` &&
1064	DN_KEY_LT(pe->idle_heap.p[`0`].key, pe->V) ) {
1065	struct dn_flow_queue *q = pe->idle_heap.p[`0`].object ;
1066
1067	heap_extract(&(pe->idle_heap), NULL);
1068	q->S = q->F + `1` ; / mark timestamp as invalid /
1069	pe->sum -= q->fs->weight ;
1070	}
1071
1072	/ check the heaps to see if there's still stuff in there, and*
1073	* only set the timer if there are packets to process
1074	*/
1075	timer_enabled = `0`;
1076	for (i=`0`; i < `3` ; i++) {
1077	h = heaps[i];
1078	if (h->elements > `0`) { // set the timer
1079	ts.tv_sec = `0`;
1080	ts.tv_nsec = `1` * `1000000`; // 1ms
1081	timer_enabled = `1`;
1082	bsd_timeout(dummynet, NULL, &ts);
1083	break;
1084	}
1085	}
1086
1087	if (head != NULL)
1088	serialize++;
1089
1090	lck_mtx_unlock(dn_mutex);
1091
1092	/ Send out the de-queued list of ready-to-send packets /
1093	if (head != NULL) {
1094	dummynet_send(head);
1095	lck_mtx_lock(dn_mutex);
1096	serialize--;
1097	lck_mtx_unlock(dn_mutex);
1098	}
1099	}
1100
1101
1102	static void
1103	dummynet_send(struct mbuf *m)
1104	{
1105	struct dn_pkt_tag *pkt;
1106	struct mbuf *n;
1107
1108	for (; m != NULL; m = n) {
1109	n = m->m_nextpkt;
1110	m->m_nextpkt = NULL;
1111	pkt = dn_tag_get(m);
1112
1113	DPRINTF(("dummynet_send m: 0x%llx dn_dir: %d dn_flags: 0x%x\n",
1114	(uint64_t)VM_KERNEL_ADDRPERM(m), pkt->dn_dir,
1115	pkt->dn_flags));
1116
1117	switch (pkt->dn_dir) {
1118	case DN_TO_IP_OUT: {
1119	struct route tmp_rt;
1120
1121	/ route is already in the packet's dn_ro /
1122	bzero(&tmp_rt, sizeof (tmp_rt));
1123
1124	/ Force IP_RAWOUTPUT as the IP header is fully formed /
1125	pkt->dn_flags \|= IP_RAWOUTPUT \| IP_FORWARDING;
1126	(void)ip_output(m, NULL, &tmp_rt, pkt->dn_flags, NULL, NULL);
1127	ROUTE_RELEASE(&tmp_rt);
1128	break ;
1129	}
1130	case DN_TO_IP_IN :
1131	proto_inject(PF_INET, m);
1132	break ;
1133	#ifdef INET6
1134	case DN_TO_IP6_OUT: {
1135	/ routes already in the packet's dn_{ro6,pmtu} /
1136	ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
1137	break;
1138	}
1139	case DN_TO_IP6_IN:
1140	proto_inject(PF_INET6, m);
1141	break;
1142	#endif /* INET6 */
1143	default:
1144	printf("dummynet: bad switch %d!\n", pkt->dn_dir);
1145	m_freem(m);
1146	break ;
1147	}
1148	}
1149	}
1150
1151	/*
1152	* Unconditionally expire empty queues in case of shortage.
1153	* Returns the number of queues freed.
1154	*/
1155	static int
1156	expire_queues(struct dn_flow_set *fs)
1157	{
1158	struct dn_flow_queue q, prev ;
1159	int i, initial_elements = fs->rq_elements ;
1160	struct timeval timenow;
1161
1162	/ reviewed for getmicrotime usage /
1163	getmicrotime(&timenow);
1164
1165	if (fs->last_expired == timenow.tv_sec)
1166	return `0` ;
1167	fs->last_expired = timenow.tv_sec ;
1168	for (i = `0` ; i <= fs->rq_size ; i++) / last one is overflow /
1169	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
1170	if (q->head != NULL \|\| q->S != q->F+`1`) {
1171	prev = q ;
1172	q = q->next ;
1173	} else { / entry is idle, expire it /
1174	struct dn_flow_queue *old_q = q ;
1175
1176	if (prev != NULL)
1177	prev->next = q = q->next ;
1178	else
1179	fs->rq[i] = q = q->next ;
1180	fs->rq_elements-- ;
1181	FREE(old_q, M_DUMMYNET);
1182	}
1183	return initial_elements - fs->rq_elements ;
1184	}
1185
1186	/*
1187	* If room, create a new queue and put at head of slot i;
1188	* otherwise, create or use the default queue.
1189	*/
1190	static struct dn_flow_queue *
1191	create_queue(struct dn_flow_set fs, int* i)
1192	{
1193	struct dn_flow_queue *q ;
1194
1195	if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
1196	expire_queues(fs) == `0`) {
1197	/*
1198	* No way to get room, use or create overflow queue.
1199	*/
1200	i = fs->rq_size ;
1201	if ( fs->rq[i] != NULL )
1202	return fs->rq[i] ;
1203	}
1204	q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT \| M_ZERO);
1205	if (q == NULL) {
1206	printf("dummynet: sorry, cannot allocate queue for new flow\n");
1207	return NULL ;
1208	}
1209	q->fs = fs ;
1210	q->hash_slot = i ;
1211	q->next = fs->rq[i] ;
1212	q->S = q->F + `1`; / hack - mark timestamp as invalid /
1213	fs->rq[i] = q ;
1214	fs->rq_elements++ ;
1215	return q ;
1216	}
1217
1218	/*
1219	* Given a flow_set and a pkt in last_pkt, find a matching queue
1220	* after appropriate masking. The queue is moved to front
1221	* so that further searches take less time.
1222	*/
1223	static struct dn_flow_queue *
1224	find_queue(struct dn_flow_set fs, struct* ip_flow_id *id)
1225	{
1226	int i = `0` ; / we need i and q for new allocations /
1227	struct dn_flow_queue q, prev;
1228	int is_v6 = IS_IP6_FLOW_ID(id);
1229
1230	if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
1231	q = fs->rq[`0`] ;
1232	else {
1233	/ first, do the masking, then hash /
1234	id->dst_port &= fs->flow_mask.dst_port ;
1235	id->src_port &= fs->flow_mask.src_port ;
1236	id->proto &= fs->flow_mask.proto ;
1237	id->flags = `0` ; / we don't care about this one /
1238	if (is_v6) {
1239	APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
1240	APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
1241	id->flow_id6 &= fs->flow_mask.flow_id6;
1242
1243	i = ((id->dst_ip6.__u6_addr.__u6_addr32[`0`]) & `0xffff`)^
1244	((id->dst_ip6.__u6_addr.__u6_addr32[`1`]) & `0xffff`)^
1245	((id->dst_ip6.__u6_addr.__u6_addr32[`2`]) & `0xffff`)^
1246	((id->dst_ip6.__u6_addr.__u6_addr32[`3`]) & `0xffff`)^
1247
1248	((id->dst_ip6.__u6_addr.__u6_addr32[`0`] >> `15`) & `0xffff`)^
1249	((id->dst_ip6.__u6_addr.__u6_addr32[`1`] >> `15`) & `0xffff`)^
1250	((id->dst_ip6.__u6_addr.__u6_addr32[`2`] >> `15`) & `0xffff`)^
1251	((id->dst_ip6.__u6_addr.__u6_addr32[`3`] >> `15`) & `0xffff`)^
1252
1253	((id->src_ip6.__u6_addr.__u6_addr32[`0`] << `1`) & `0xfffff`)^
1254	((id->src_ip6.__u6_addr.__u6_addr32[`1`] << `1`) & `0xfffff`)^
1255	((id->src_ip6.__u6_addr.__u6_addr32[`2`] << `1`) & `0xfffff`)^
1256	((id->src_ip6.__u6_addr.__u6_addr32[`3`] << `1`) & `0xfffff`)^
1257
1258	((id->src_ip6.__u6_addr.__u6_addr32[`0`] >> `16`) & `0xffff`)^
1259	((id->src_ip6.__u6_addr.__u6_addr32[`1`] >> `16`) & `0xffff`)^
1260	((id->src_ip6.__u6_addr.__u6_addr32[`2`] >> `16`) & `0xffff`)^
1261	((id->src_ip6.__u6_addr.__u6_addr32[`3`] >> `16`) & `0xffff`)^
1262
1263	(id->dst_port << `1`) ^ (id->src_port) ^
1264	(id->proto ) ^
1265	(id->flow_id6);
1266	} else {
1267	id->dst_ip &= fs->flow_mask.dst_ip ;
1268	id->src_ip &= fs->flow_mask.src_ip ;
1269
1270	i = ( (id->dst_ip) & `0xffff` ) ^
1271	( (id->dst_ip >> `15`) & `0xffff` ) ^
1272	( (id->src_ip << `1`) & `0xffff` ) ^
1273	( (id->src_ip >> `16` ) & `0xffff` ) ^
1274	(id->dst_port << `1`) ^ (id->src_port) ^
1275	(id->proto );
1276	}
1277	i = i % fs->rq_size ;
1278	/ finally, scan the current list for a match /
1279	searches++ ;
1280	for (prev=NULL, q = fs->rq[i] ; q ; ) {
1281	search_steps++;
1282	if (is_v6 &&
1283	IN6_ARE_ADDR_EQUAL(&id->dst_ip6,&q->id.dst_ip6) &&
1284	IN6_ARE_ADDR_EQUAL(&id->src_ip6,&q->id.src_ip6) &&
1285	id->dst_port == q->id.dst_port &&
1286	id->src_port == q->id.src_port &&
1287	id->proto == q->id.proto &&
1288	id->flags == q->id.flags &&
1289	id->flow_id6 == q->id.flow_id6)
1290	break ; / found /
1291
1292	if (!is_v6 && id->dst_ip == q->id.dst_ip &&
1293	id->src_ip == q->id.src_ip &&
1294	id->dst_port == q->id.dst_port &&
1295	id->src_port == q->id.src_port &&
1296	id->proto == q->id.proto &&
1297	id->flags == q->id.flags)
1298	break ; / found /
1299
1300	/ No match. Check if we can expire the entry /
1301	if (pipe_expire && q->head == NULL && q->S == q->F+`1` ) {
1302	/ entry is idle and not in any heap, expire it /
1303	struct dn_flow_queue *old_q = q ;
1304
1305	if (prev != NULL)
1306	prev->next = q = q->next ;
1307	else
1308	fs->rq[i] = q = q->next ;
1309	fs->rq_elements-- ;
1310	FREE(old_q, M_DUMMYNET);
1311	continue ;
1312	}
1313	prev = q ;
1314	q = q->next ;
1315	}
1316	if (q && prev != NULL) { / found and not in front /
1317	prev->next = q->next ;
1318	q->next = fs->rq[i] ;
1319	fs->rq[i] = q ;
1320	}
1321	}
1322	if (q == NULL) { / no match, need to allocate a new entry /
1323	q = create_queue(fs, i);
1324	if (q != NULL)
1325	q->id = *id ;
1326	}
1327	return q ;
1328	}
1329
1330	static int
1331	red_drops(struct dn_flow_set fs, struct* dn_flow_queue q, int* len)
1332	{
1333	/*
1334	* RED algorithm
1335	*
1336	* RED calculates the average queue size (avg) using a low-pass filter
1337	* with an exponential weighted (w_q) moving average:
1338	* avg <- (1-w_q) * avg + w_q * q_size
1339	* where q_size is the queue length (measured in bytes or * packets).
1340	*
1341	* If q_size == 0, we compute the idle time for the link, and set
1342	* avg = (1 - w_q)^(idle/s)
1343	* where s is the time needed for transmitting a medium-sized packet.
1344	*
1345	* Now, if avg < min_th the packet is enqueued.
1346	* If avg > max_th the packet is dropped. Otherwise, the packet is
1347	* dropped with probability P function of avg.
1348	*
1349	*/
1350
1351	int64_t p_b = `0`;
1352	/ queue in bytes or packets ? /
1353	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
1354
1355	DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
1356
1357	/ average queue size estimation /
1358	if (q_size != `0`) {
1359	/*
1360	* queue is not empty, avg <- avg + (q_size - avg) * w_q
1361	*/
1362	int diff = SCALE(q_size) - q->avg;
1363	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
1364
1365	q->avg += (int) v;
1366	} else {
1367	/*
1368	* queue is empty, find for how long the queue has been
1369	* empty and use a lookup table for computing
1370	* (1 - * w_q)^(idle_time/s) where s is the time to send a
1371	* (small) packet.
1372	* XXX check wraps...
1373	*/
1374	if (q->avg) {
1375	u_int t = (curr_time - q->q_time) / fs->lookup_step;
1376
1377	q->avg = (t < fs->lookup_depth) ?
1378	SCALE_MUL(q->avg, fs->w_q_lookup[t]) : `0`;
1379	}
1380	}
1381	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1382
1383	/ should i drop ? /
1384
1385	if (q->avg < fs->min_th) {
1386	q->count = -`1`;
1387	return `0`; / accept packet ; /
1388	}
1389	if (q->avg >= fs->max_th) { / average queue >= max threshold /
1390	if (fs->flags_fs & DN_IS_GENTLE_RED) {
1391	/*
1392	* According to Gentle-RED, if avg is greater than max_th the
1393	* packet is dropped with a probability
1394	* p_b = c_3 * avg - c_4
1395	* where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1396	*/
1397	p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1398	} else {
1399	q->count = -`1`;
1400	DPRINTF(("dummynet: - drop"));
1401	return `1` ;
1402	}
1403	} else if (q->avg > fs->min_th) {
1404	/*
1405	* we compute p_b using the linear dropping function p_b = c_1 *
1406	* avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1407	* max_p * min_th / (max_th - min_th)
1408	*/
1409	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1410	}
1411	if (fs->flags_fs & DN_QSIZE_IS_BYTES)
1412	p_b = (p_b * len) / fs->max_pkt_size;
1413	if (++q->count == `0`)
1414	q->random = (my_random() & `0xffff`);
1415	else {
1416	/*
1417	* q->count counts packets arrived since last drop, so a greater
1418	* value of q->count means a greater packet drop probability.
1419	*/
1420	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1421	q->count = `0`;
1422	DPRINTF(("dummynet: - red drop"));
1423	/ after a drop we calculate a new random value /
1424	q->random = (my_random() & `0xffff`);
1425	return `1`; / drop /
1426	}
1427	}
1428	/ end of RED algorithm /
1429	return `0` ; / accept /
1430	}
1431
1432	static __inline
1433	struct dn_flow_set *
1434	locate_flowset(int fs_nr)
1435	{
1436	struct dn_flow_set *fs;
1437	SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
1438	if (fs->fs_nr == fs_nr)
1439	return fs ;
1440
1441	return (NULL);
1442	}
1443
1444	static __inline struct dn_pipe *
1445	locate_pipe(int pipe_nr)
1446	{
1447	struct dn_pipe *pipe;
1448
1449	SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
1450	if (pipe->pipe_nr == pipe_nr)
1451	return (pipe);
1452
1453	return (NULL);
1454	}
1455
1456
1457
1458	/*
1459	* dummynet hook for packets. Below 'pipe' is a pipe or a queue
1460	* depending on whether WF2Q or fixed bw is used.
1461	*
1462	* pipe_nr pipe or queue the packet is destined for.
1463	* dir where shall we send the packet after dummynet.
1464	* m the mbuf with the packet
1465	* ifp the 'ifp' parameter from the caller.
1466	* NULL in ip_input, destination interface in ip_output,
1467	* real_dst in bdg_forward
1468	* ro route parameter (only used in ip_output, NULL otherwise)
1469	* dst destination address, only used by ip_output
1470	* rule matching rule, in case of multiple passes
1471	* flags flags from the caller, only used in ip_output
1472	*
1473	*/
1474	static int
1475	dummynet_io(struct mbuf m, int* pipe_nr, int dir, struct ip_fw_args fwa, int* client)
1476	{
1477	struct mbuf head = NULL, tail = NULL;
1478	struct dn_pkt_tag *pkt;
1479	struct m_tag *mtag;
1480	struct dn_flow_set *fs = NULL;
1481	struct dn_pipe *pipe ;
1482	u_int64_t len = m->m_pkthdr.len ;
1483	struct dn_flow_queue *q = NULL ;
1484	int is_pipe = `0`;
1485	struct timespec ts;
1486	struct timeval tv;
1487
1488	DPRINTF(("dummynet_io m: 0x%llx pipe: %d dir: %d client: %d\n",
1489	(uint64_t)VM_KERNEL_ADDRPERM(m), pipe_nr, dir, client));
1490
1491	#if IPFIREWALL
1492	#if IPFW2
1493	if (client == DN_CLIENT_IPFW) {
1494	ipfw_insn *cmd = fwa->fwa_ipfw_rule->cmd + fwa->fwa_ipfw_rule->act_ofs;
1495
1496	if (cmd->opcode == O_LOG)
1497	cmd += F_LEN(cmd);
1498	is_pipe = (cmd->opcode == O_PIPE);
1499	}
1500	#else
1501	if (client == DN_CLIENT_IPFW)
1502	is_pipe = (fwa->fwa_ipfw_rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
1503	#endif
1504	#endif /* IPFIREWALL */
1505
1506	#if DUMMYNET
1507	if (client == DN_CLIENT_PF)
1508	is_pipe = fwa->fwa_flags == DN_IS_PIPE ? `1` : `0`;
1509	#endif /* DUMMYNET */
1510
1511	pipe_nr &= `0xffff` ;
1512
1513	lck_mtx_lock(dn_mutex);
1514
1515	/ make all time measurements in milliseconds (ms) -*
1516	* here we convert secs and usecs to msecs (just divide the
1517	* usecs and take the closest whole number).
1518	*/
1519	microuptime(&tv);
1520	curr_time = (tv.tv_sec * `1000`) + (tv.tv_usec / `1000`);
1521
1522	/*
1523	* This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1524	*/
1525	if (is_pipe) {
1526	pipe = locate_pipe(pipe_nr);
1527	if (pipe != NULL)
1528	fs = &(pipe->fs);
1529	} else
1530	fs = locate_flowset(pipe_nr);
1531
1532
1533	if (fs == NULL){
1534	goto dropit ; / this queue/pipe does not exist! /
1535	}
1536	pipe = fs->pipe ;
1537	if (pipe == NULL) { / must be a queue, try find a matching pipe /
1538	pipe = locate_pipe(fs->parent_nr);
1539
1540	if (pipe != NULL)
1541	fs->pipe = pipe ;
1542	else {
1543	printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1544	fs->parent_nr, fs->fs_nr);
1545	goto dropit ;
1546	}
1547	}
1548	q = find_queue(fs, &(fwa->fwa_id));
1549	if ( q == NULL )
1550	goto dropit ; / cannot allocate queue /
1551	/*
1552	* update statistics, then check reasons to drop pkt
1553	*/
1554	q->tot_bytes += len ;
1555	q->tot_pkts++ ;
1556	if ( fs->plr && (my_random() < fs->plr))
1557	goto dropit ; / random pkt drop /
1558	if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1559	if (q->len_bytes > fs->qsize)
1560	goto dropit ; / queue size overflow /
1561	} else {
1562	if (q->len >= fs->qsize)
1563	goto dropit ; / queue count overflow /
1564	}
1565	if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1566	goto dropit ;
1567
1568	/ XXX expensive to zero, see if we can remove it/
1569	mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1570	sizeof(struct dn_pkt_tag), M_NOWAIT, m);
1571	if ( mtag == NULL )
1572	goto dropit ; / cannot allocate packet header /
1573	m_tag_prepend(m, mtag); / attach to mbuf chain /
1574
1575	pkt = (struct dn_pkt_tag *)(mtag+`1`);
1576	bzero(pkt, sizeof(struct dn_pkt_tag));
1577	/ ok, i can handle the pkt now... /
1578	/ build and enqueue packet + parameters /
1579	/*
1580	* PF is checked before ipfw so remember ipfw rule only when
1581	* the caller is ipfw. When the caller is PF, fwa_ipfw_rule
1582	* is a fake rule just used for convenience
1583	*/
1584	if (client == DN_CLIENT_IPFW)
1585	pkt->dn_ipfw_rule = fwa->fwa_ipfw_rule;
1586	pkt->dn_pf_rule = fwa->fwa_pf_rule;
1587	pkt->dn_dir = dir ;
1588	pkt->dn_client = client;
1589
1590	pkt->dn_ifp = fwa->fwa_oif;
1591	if (dir == DN_TO_IP_OUT) {
1592	/*
1593	* We need to copy *ro because for ICMP pkts (and maybe others)
1594	* the caller passed a pointer into the stack; dst might also be
1595	* a pointer into *ro so it needs to be updated.
1596	*/
1597	if (fwa->fwa_ro) {
1598	route_copyout(&pkt->dn_ro, fwa->fwa_ro, sizeof (pkt->dn_ro));
1599	}
1600	if (fwa->fwa_dst) {
1601	if (fwa->fwa_dst == (struct sockaddr_in )&fwa->fwa_ro->ro_dst) /* dst points into ro /
1602	fwa->fwa_dst = (struct sockaddr_in *)&(pkt->dn_ro.ro_dst) ;
1603
1604	bcopy (fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst));
1605	}
1606	} else if (dir == DN_TO_IP6_OUT) {
1607	if (fwa->fwa_ro6) {
1608	route_copyout((struct route *)&pkt->dn_ro6,
1609	(struct route )fwa->fwa_ro6, sizeof* (pkt->dn_ro6));
1610	}
1611	if (fwa->fwa_ro6_pmtu) {
1612	route_copyout((struct route *)&pkt->dn_ro6_pmtu,
1613	(struct route )fwa->fwa_ro6_pmtu, sizeof* (pkt->dn_ro6_pmtu));
1614	}
1615	if (fwa->fwa_dst6) {
1616	if (fwa->fwa_dst6 == (struct sockaddr_in6 )&fwa->fwa_ro6->ro_dst) /* dst points into ro /
1617	fwa->fwa_dst6 = (struct sockaddr_in6 *)&(pkt->dn_ro6.ro_dst) ;
1618
1619	bcopy (fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6));
1620	}
1621	pkt->dn_origifp = fwa->fwa_origifp;
1622	pkt->dn_mtu = fwa->fwa_mtu;
1623	pkt->dn_alwaysfrag = fwa->fwa_alwaysfrag;
1624	pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen;
1625	if (fwa->fwa_exthdrs) {
1626	bcopy (fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs));
1627	/*
1628	* Need to zero out the source structure so the mbufs
1629	* won't be freed by ip6_output()
1630	*/
1631	bzero(fwa->fwa_exthdrs, sizeof(struct ip6_exthdrs));
1632	}
1633	}
1634	if (dir == DN_TO_IP_OUT \|\| dir == DN_TO_IP6_OUT) {
1635	pkt->dn_flags = fwa->fwa_oflags;
1636	if (fwa->fwa_ipoa != NULL)
1637	pkt->dn_ipoa = *(fwa->fwa_ipoa);
1638	}
1639	if (q->head == NULL)
1640	q->head = m;
1641	else
1642	q->tail->m_nextpkt = m;
1643	q->tail = m;
1644	q->len++;
1645	q->len_bytes += len ;
1646
1647	if ( q->head != m ) / flow was not idle, we are done /
1648	goto done;
1649	/*
1650	* If we reach this point the flow was previously idle, so we need
1651	* to schedule it. This involves different actions for fixed-rate or
1652	* WF2Q queues.
1653	*/
1654	if (is_pipe) {
1655	/*
1656	* Fixed-rate queue: just insert into the ready_heap.
1657	*/
1658	dn_key t = `0` ;
1659	if (pipe->bandwidth)
1660	t = SET_TICKS(m, q, pipe);
1661	q->sched_time = curr_time ;
1662	if (t == `0`) / must process it now /
1663	ready_event( q , &head, &tail );
1664	else
1665	heap_insert(&ready_heap, curr_time + t , q );
1666	} else {
1667	/*
1668	* WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1669	* set S to the virtual time V for the controlling pipe, and update
1670	* the sum of weights for the pipe; otherwise, remove flow from
1671	* idle_heap and set S to max(F,V).
1672	* Second, compute finish time F = S + len/weight.
1673	* Third, if pipe was idle, update V=max(S, V).
1674	* Fourth, count one more backlogged flow.
1675	*/
1676	if (DN_KEY_GT(q->S, q->F)) { / means timestamps are invalid /
1677	q->S = pipe->V ;
1678	pipe->sum += fs->weight ; / add weight of new queue /
1679	} else {
1680	heap_extract(&(pipe->idle_heap), q);
1681	q->S = MAX64(q->F, pipe->V ) ;
1682	}
1683	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1684
1685	if (pipe->not_eligible_heap.elements == `0` &&
1686	pipe->scheduler_heap.elements == `0`)
1687	pipe->V = MAX64 ( q->S, pipe->V );
1688	fs->backlogged++ ;
1689	/*
1690	* Look at eligibility. A flow is not eligibile if S>V (when
1691	* this happens, it means that there is some other flow already
1692	* scheduled for the same pipe, so the scheduler_heap cannot be
1693	* empty). If the flow is not eligible we just store it in the
1694	* not_eligible_heap. Otherwise, we store in the scheduler_heap
1695	* and possibly invoke ready_event_wfq() right now if there is
1696	* leftover credit.
1697	* Note that for all flows in scheduler_heap (SCH), S_i <= V,
1698	* and for all flows in not_eligible_heap (NEH), S_i > V .
1699	* So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1700	* we only need to look into NEH.
1701	*/
1702	if (DN_KEY_GT(q->S, pipe->V) ) { / not eligible /
1703	if (pipe->scheduler_heap.elements == `0`)
1704	printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1705	heap_insert(&(pipe->not_eligible_heap), q->S, q);
1706	} else {
1707	heap_insert(&(pipe->scheduler_heap), q->F, q);
1708	if (pipe->numbytes >= `0`) { / pipe is idle /
1709	if (pipe->scheduler_heap.elements != `1`)
1710	printf("dummynet: OUCH! pipe should have been idle!\n");
1711	DPRINTF(("dummynet: waking up pipe %d at %d\n",
1712	pipe->pipe_nr, (int)(q->F >> MY_M)));
1713	pipe->sched_time = curr_time ;
1714	ready_event_wfq(pipe, &head, &tail);
1715	}
1716	}
1717	}
1718	done:
1719	/ start the timer and set global if not already set /
1720	if (!timer_enabled) {
1721	ts.tv_sec = `0`;
1722	ts.tv_nsec = `1` * `1000000`; // 1ms
1723	timer_enabled = `1`;
1724	bsd_timeout(dummynet, NULL, &ts);
1725	}
1726
1727	lck_mtx_unlock(dn_mutex);
1728
1729	if (head != NULL) {
1730	dummynet_send(head);
1731	}
1732
1733	return `0`;
1734
1735	dropit:
1736	if (q)
1737	q->drops++ ;
1738	lck_mtx_unlock(dn_mutex);
1739	m_freem(m);
1740	return ( (fs && (fs->flags_fs & DN_NOERROR)) ? `0` : ENOBUFS);
1741	}
1742
/*
 * Free a packet that carries a dummynet tag: release the route cached
 * in the tag, detach the tag, then free the mbuf.
 *
 * Below, the ROUTE_RELEASE is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
 * Doing this would probably save us the initial bzero of dn_pkt
 *
 * The argument is evaluated exactly once and every local carries a
 * dn_free_ prefix, so the macro no longer depends on the caller's
 * variable being called `m' and cannot capture a caller's `tag' or `n'.
 * The tag is only detached when one was actually found.
 */
#define DN_FREE_PKT(_m) do {						\
	struct mbuf *dn_free_m_ = (_m);					\
	struct m_tag *dn_free_tag_ = m_tag_locate(dn_free_m_,		\
	    KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL);	\
	if (dn_free_tag_ != NULL) {					\
		struct dn_pkt_tag *dn_free_pkt_ =			\
		    (struct dn_pkt_tag *)(dn_free_tag_ + 1);		\
		ROUTE_RELEASE(&dn_free_pkt_->dn_ro);			\
		m_tag_delete(dn_free_m_, dn_free_tag_);			\
	}								\
	m_freem(dn_free_m_);						\
} while (0)
1756
1757	/*
1758	* Dispose all packets and flow_queues on a flow_set.
1759	* If all=1, also remove red lookup table and other storage,
1760	* including the descriptor itself.
1761	* For the one in dn_pipe MUST also cleanup ready_heap...
1762	*/
1763	static void
1764	purge_flow_set(struct dn_flow_set fs, int* all)
1765	{
1766	struct dn_flow_queue q, qn ;
1767	int i ;
1768
1769	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
1770
1771	for (i = `0` ; i <= fs->rq_size ; i++ ) {
1772	for (q = fs->rq[i] ; q ; q = qn ) {
1773	struct mbuf m, mnext;
1774
1775	mnext = q->head;
1776	while ((m = mnext) != NULL) {
1777	mnext = m->m_nextpkt;
1778	DN_FREE_PKT(m);
1779	}
1780	qn = q->next ;
1781	FREE(q, M_DUMMYNET);
1782	}
1783	fs->rq[i] = NULL ;
1784	}
1785	fs->rq_elements = `0` ;
1786	if (all) {
1787	/ RED - free lookup table /
1788	if (fs->w_q_lookup)
1789	FREE(fs->w_q_lookup, M_DUMMYNET);
1790	if (fs->rq)
1791	FREE(fs->rq, M_DUMMYNET);
1792	/ if this fs is not part of a pipe, free it /
1793	if (fs->pipe && fs != &(fs->pipe->fs) )
1794	FREE(fs, M_DUMMYNET);
1795	}
1796	}
1797
1798	/*
1799	* Dispose all packets queued on a pipe (not a flow_set).
1800	* Also free all resources associated to a pipe, which is about
1801	* to be deleted.
1802	*/
1803	static void
1804	purge_pipe(struct dn_pipe *pipe)
1805	{
1806	struct mbuf m, mnext;
1807
1808	purge_flow_set( &(pipe->fs), `1` );
1809
1810	mnext = pipe->head;
1811	while ((m = mnext) != NULL) {
1812	mnext = m->m_nextpkt;
1813	DN_FREE_PKT(m);
1814	}
1815
1816	heap_free( &(pipe->scheduler_heap) );
1817	heap_free( &(pipe->not_eligible_heap) );
1818	heap_free( &(pipe->idle_heap) );
1819	}
1820
1821	/*
1822	* Delete all pipes and heaps returning memory. Must also
1823	* remove references from all ipfw rules to all pipes.
1824	*/
1825	static void
1826	dummynet_flush(void)
1827	{
1828	struct dn_pipe pipe, pipe1;
1829	struct dn_flow_set fs, fs1;
1830	int i;
1831
1832	lck_mtx_lock(dn_mutex);
1833
1834	#if IPFW2
1835	/ remove all references to pipes .../
1836	flush_pipe_ptrs(NULL);
1837	#endif /* IPFW2 */
1838
1839	/ Free heaps so we don't have unwanted events. /
1840	heap_free(&ready_heap);
1841	heap_free(&wfq_ready_heap);
1842	heap_free(&extract_heap);
1843
1844	/*
1845	* Now purge all queued pkts and delete all pipes.
1846	*
1847	* XXXGL: can we merge the for(;;) cycles into one or not?
1848	*/
1849	for (i = `0`; i < HASHSIZE; i++)
1850	SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
1851	SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
1852	purge_flow_set(fs, `1`);
1853	}
1854	for (i = `0`; i < HASHSIZE; i++)
1855	SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
1856	SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
1857	purge_pipe(pipe);
1858	FREE(pipe, M_DUMMYNET);
1859	}
1860	lck_mtx_unlock(dn_mutex);
1861	}
1862
1863
1864	static void
1865	dn_ipfw_rule_delete_fs(struct dn_flow_set fs, void* *r)
1866	{
1867	int i ;
1868	struct dn_flow_queue *q ;
1869	struct mbuf *m ;
1870
1871	for (i = `0` ; i <= fs->rq_size ; i++) / last one is ovflow /
1872	for (q = fs->rq[i] ; q ; q = q->next )
1873	for (m = q->head ; m ; m = m->m_nextpkt ) {
1874	struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1875	if (pkt->dn_ipfw_rule == r)
1876	pkt->dn_ipfw_rule = &default_rule ;
1877	}
1878	}
1879	/*
1880	* when a firewall rule is deleted, scan all queues and remove the flow-id
1881	* from packets matching this rule.
1882	*/
1883	void
1884	dn_ipfw_rule_delete(void *r)
1885	{
1886	struct dn_pipe *p ;
1887	struct dn_flow_set *fs ;
1888	struct dn_pkt_tag *pkt ;
1889	struct mbuf *m ;
1890	int i;
1891
1892	lck_mtx_lock(dn_mutex);
1893
1894	/*
1895	* If the rule references a queue (dn_flow_set), then scan
1896	* the flow set, otherwise scan pipes. Should do either, but doing
1897	* both does not harm.
1898	*/
1899	for (i = `0`; i < HASHSIZE; i++)
1900	SLIST_FOREACH(fs, &flowsethash[i], next)
1901	dn_ipfw_rule_delete_fs(fs, r);
1902
1903	for (i = `0`; i < HASHSIZE; i++)
1904	SLIST_FOREACH(p, &pipehash[i], next) {
1905	fs = &(p->fs);
1906	dn_ipfw_rule_delete_fs(fs, r);
1907	for (m = p->head ; m ; m = m->m_nextpkt ) {
1908	pkt = dn_tag_get(m);
1909	if (pkt->dn_ipfw_rule == r)
1910	pkt->dn_ipfw_rule = &default_rule;
1911	}
1912	}
1913	lck_mtx_unlock(dn_mutex);
1914	}
1915
1916	/*
1917	* setup RED parameters
1918	*/
1919	static int
1920	config_red(struct dn_flow_set p, struct* dn_flow_set * x)
1921	{
1922	int i;
1923
1924	x->w_q = p->w_q;
1925	x->min_th = SCALE(p->min_th);
1926	x->max_th = SCALE(p->max_th);
1927	x->max_p = p->max_p;
1928
1929	x->c_1 = p->max_p / (p->max_th - p->min_th);
1930	x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1931	if (x->flags_fs & DN_IS_GENTLE_RED) {
1932	x->c_3 = (SCALE(`1`) - p->max_p) / p->max_th;
1933	x->c_4 = (SCALE(`1`) - `2` * p->max_p);
1934	}
1935
1936	/ if the lookup table already exist, free and create it again /
1937	if (x->w_q_lookup) {
1938	FREE(x->w_q_lookup, M_DUMMYNET);
1939	x->w_q_lookup = NULL ;
1940	}
1941	if (red_lookup_depth == `0`) {
1942	printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1943	FREE(x, M_DUMMYNET);
1944	return EINVAL;
1945	}
1946	x->lookup_depth = red_lookup_depth;
1947	x->w_q_lookup = (u_int ) _MALLOC(x->lookup_depth sizeof(int),
1948	M_DUMMYNET, M_DONTWAIT);
1949	if (x->w_q_lookup == NULL) {
1950	printf("dummynet: sorry, cannot allocate red lookup table\n");
1951	FREE(x, M_DUMMYNET);
1952	return ENOSPC;
1953	}
1954
1955	/ fill the lookup table with (1 - w_q)^x /
1956	x->lookup_step = p->lookup_step ;
1957	x->lookup_weight = p->lookup_weight ;
1958	x->w_q_lookup[`0`] = SCALE(`1`) - x->w_q;
1959	for (i = `1`; i < x->lookup_depth; i++)
1960	x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - `1`], x->lookup_weight);
1961	if (red_avg_pkt_size < `1`)
1962	red_avg_pkt_size = `512` ;
1963	x->avg_pkt_size = red_avg_pkt_size ;
1964	if (red_max_pkt_size < `1`)
1965	red_max_pkt_size = `1500` ;
1966	x->max_pkt_size = red_max_pkt_size ;
1967	return `0` ;
1968	}
1969
1970	static int
1971	alloc_hash(struct dn_flow_set x, struct* dn_flow_set *pfs)
1972	{
1973	if (x->flags_fs & DN_HAVE_FLOW_MASK) { / allocate some slots /
1974	int l = pfs->rq_size;
1975
1976	if (l == `0`)
1977	l = dn_hash_size;
1978	if (l < `4`)
1979	l = `4`;
1980	else if (l > DN_MAX_HASH_SIZE)
1981	l = DN_MAX_HASH_SIZE;
1982	x->rq_size = l;
1983	} else / one is enough for null mask /
1984	x->rq_size = `1`;
1985	x->rq = _MALLOC((`1` + x->rq_size) * sizeof(struct dn_flow_queue *),
1986	M_DUMMYNET, M_DONTWAIT \| M_ZERO);
1987	if (x->rq == NULL) {
1988	printf("dummynet: sorry, cannot allocate queue\n");
1989	return ENOSPC;
1990	}
1991	x->rq_elements = `0`;
1992	return `0` ;
1993	}
1994
1995	static void
1996	set_fs_parms(struct dn_flow_set x, struct* dn_flow_set *src)
1997	{
1998	x->flags_fs = src->flags_fs;
1999	x->qsize = src->qsize;
2000	x->plr = src->plr;
2001	x->flow_mask = src->flow_mask;
2002	if (x->flags_fs & DN_QSIZE_IS_BYTES) {
2003	if (x->qsize > `1024`*`1024`)
2004	x->qsize = `1024`*`1024` ;
2005	} else {
2006	if (x->qsize == `0`)
2007	x->qsize = `50` ;
2008	if (x->qsize > `100`)
2009	x->qsize = `50` ;
2010	}
2011	/ configuring RED /
2012	if ( x->flags_fs & DN_IS_RED )
2013	config_red(src, x) ; / XXX should check errors /
2014	}
2015
2016	/*
2017	* setup pipe or queue parameters.
2018	*/
2019	static int
2020	config_pipe(struct dn_pipe *p)
2021	{
2022	int i, r;
2023	struct dn_flow_set *pfs = &(p->fs);
2024	struct dn_flow_queue *q;
2025
2026	/*
2027	* The config program passes parameters as follows:
2028	* bw = bits/second (0 means no limits),
2029	* delay = ms, must be translated into ticks.
2030	* qsize = slots/bytes
2031	*/
2032	p->delay = ( p->delay * (hz*`10`) ) / `1000` ;
2033	/ We need either a pipe number or a flow_set number /
2034	if (p->pipe_nr == `0` && pfs->fs_nr == `0`)
2035	return EINVAL ;
2036	if (p->pipe_nr != `0` && pfs->fs_nr != `0`)
2037	return EINVAL ;
2038	if (p->pipe_nr != `0`) { / this is a pipe /
2039	struct dn_pipe x, b;
2040	struct dummynet_event dn_event;
2041	lck_mtx_lock(dn_mutex);
2042
2043	/ locate pipe /
2044	b = locate_pipe(p->pipe_nr);
2045
2046	if (b == NULL \|\| b->pipe_nr != p->pipe_nr) { / new pipe /
2047	x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT \| M_ZERO) ;
2048	if (x == NULL) {
2049	lck_mtx_unlock(dn_mutex);
2050	printf("dummynet: no memory for new pipe\n");
2051	return ENOSPC;
2052	}
2053	x->pipe_nr = p->pipe_nr;
2054	x->fs.pipe = x ;
2055	/ idle_heap is the only one from which we extract from the middle.*
2056	*/
2057	x->idle_heap.size = x->idle_heap.elements = `0` ;
2058	x->idle_heap.offset=offsetof(struct dn_flow_queue, heap_pos);
2059	} else {
2060	x = b;
2061	/ Flush accumulated credit for all queues /
2062	for (i = `0`; i <= x->fs.rq_size; i++)
2063	for (q = x->fs.rq[i]; q; q = q->next)
2064	q->numbytes = `0`;
2065	}
2066
2067	x->bandwidth = p->bandwidth ;
2068	x->numbytes = `0`; / just in case... /
2069	bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
2070	x->ifp = NULL ; / reset interface ptr /
2071	x->delay = p->delay ;
2072	set_fs_parms(&(x->fs), pfs);
2073
2074
2075	if ( x->fs.rq == NULL ) { / a new pipe /
2076	r = alloc_hash(&(x->fs), pfs) ;
2077	if (r) {
2078	lck_mtx_unlock(dn_mutex);
2079	FREE(x, M_DUMMYNET);
2080	return r ;
2081	}
2082	SLIST_INSERT_HEAD(&pipehash[HASH(x->pipe_nr)],
2083	x, next);
2084	}
2085	lck_mtx_unlock(dn_mutex);
2086
2087	bzero(&dn_event, sizeof(dn_event));
2088	dn_event.dn_event_code = DUMMYNET_PIPE_CONFIG;
2089	dn_event.dn_event_pipe_config.bandwidth = p->bandwidth;
2090	dn_event.dn_event_pipe_config.delay = p->delay;
2091	dn_event.dn_event_pipe_config.plr = pfs->plr;
2092
2093	dummynet_event_enqueue_nwk_wq_entry(&dn_event);
2094	} else { / config queue /
2095	struct dn_flow_set x, b ;
2096
2097	lck_mtx_lock(dn_mutex);
2098	/ locate flow_set /
2099	b = locate_flowset(pfs->fs_nr);
2100
2101	if (b == NULL \|\| b->fs_nr != pfs->fs_nr) { / new /
2102	if (pfs->parent_nr == `0`) { / need link to a pipe /
2103	lck_mtx_unlock(dn_mutex);
2104	return EINVAL ;
2105	}
2106	x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT \| M_ZERO);
2107	if (x == NULL) {
2108	lck_mtx_unlock(dn_mutex);
2109	printf("dummynet: no memory for new flow_set\n");
2110	return ENOSPC;
2111	}
2112	x->fs_nr = pfs->fs_nr;
2113	x->parent_nr = pfs->parent_nr;
2114	x->weight = pfs->weight ;
2115	if (x->weight == `0`)
2116	x->weight = `1` ;
2117	else if (x->weight > `100`)
2118	x->weight = `100` ;
2119	} else {
2120	/ Change parent pipe not allowed; must delete and recreate /
2121	if (pfs->parent_nr != `0` && b->parent_nr != pfs->parent_nr) {
2122	lck_mtx_unlock(dn_mutex);
2123	return EINVAL ;
2124	}
2125	x = b;
2126	}
2127	set_fs_parms(x, pfs);
2128
2129	if ( x->rq == NULL ) { / a new flow_set /
2130	r = alloc_hash(x, pfs) ;
2131	if (r) {
2132	lck_mtx_unlock(dn_mutex);
2133	FREE(x, M_DUMMYNET);
2134	return r ;
2135	}
2136	SLIST_INSERT_HEAD(&flowsethash[HASH(x->fs_nr)],
2137	x, next);
2138	}
2139	lck_mtx_unlock(dn_mutex);
2140	}
2141	return `0` ;
2142	}
2143
2144	/*
2145	* Helper function to remove from a heap queues which are linked to
2146	* a flow_set about to be deleted.
2147	*/
2148	static void
2149	fs_remove_from_heap(struct dn_heap h, struct* dn_flow_set *fs)
2150	{
2151	int i = `0`, found = `0` ;
2152	for (; i < h->elements ;)
2153	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
2154	h->elements-- ;
2155	h->p[i] = h->p[h->elements] ;
2156	found++ ;
2157	} else
2158	i++ ;
2159	if (found)
2160	heapify(h);
2161	}
2162
2163	/*
2164	* helper function to remove a pipe from a heap (can be there at most once)
2165	*/
2166	static void
2167	pipe_remove_from_heap(struct dn_heap h, struct* dn_pipe *p)
2168	{
2169	if (h->elements > `0`) {
2170	int i = `0` ;
2171	for (i=`0`; i < h->elements ; i++ ) {
2172	if (h->p[i].object == p) { / found it /
2173	h->elements-- ;
2174	h->p[i] = h->p[h->elements] ;
2175	heapify(h);
2176	break ;
2177	}
2178	}
2179	}
2180	}
2181
2182	/*
2183	* drain all queues. Called in case of severe mbuf shortage.
2184	*/
2185	void
2186	dummynet_drain(void)
2187	{
2188	struct dn_flow_set *fs;
2189	struct dn_pipe *p;
2190	struct mbuf m, mnext;
2191	int i;
2192
2193	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2194
2195	heap_free(&ready_heap);
2196	heap_free(&wfq_ready_heap);
2197	heap_free(&extract_heap);
2198	/ remove all references to this pipe from flow_sets /
2199	for (i = `0`; i < HASHSIZE; i++)
2200	SLIST_FOREACH(fs, &flowsethash[i], next)
2201	purge_flow_set(fs, `0`);
2202
2203	for (i = `0`; i < HASHSIZE; i++)
2204	SLIST_FOREACH(p, &pipehash[i], next) {
2205	purge_flow_set(&(p->fs), `0`);
2206
2207	mnext = p->head;
2208	while ((m = mnext) != NULL) {
2209	mnext = m->m_nextpkt;
2210	DN_FREE_PKT(m);
2211	}
2212	p->head = p->tail = NULL ;
2213	}
2214	}
2215
2216	/*
2217	* Fully delete a pipe or a queue, cleaning up associated info.
2218	*/
2219	static int
2220	delete_pipe(struct dn_pipe *p)
2221	{
2222	if (p->pipe_nr == `0` && p->fs.fs_nr == `0`)
2223	return EINVAL ;
2224	if (p->pipe_nr != `0` && p->fs.fs_nr != `0`)
2225	return EINVAL ;
2226	if (p->pipe_nr != `0`) { / this is an old-style pipe /
2227	struct dn_pipe *b;
2228	struct dn_flow_set *fs;
2229	int i;
2230
2231	lck_mtx_lock(dn_mutex);
2232	/ locate pipe /
2233	b = locate_pipe(p->pipe_nr);
2234	if(b == NULL){
2235	lck_mtx_unlock(dn_mutex);
2236	return EINVAL ; / not found /
2237	}
2238
2239	/ Unlink from list of pipes. /
2240	SLIST_REMOVE(&pipehash[HASH(b->pipe_nr)], b, dn_pipe, next);
2241
2242	#if IPFW2
2243	/ remove references to this pipe from the ip_fw rules. /
2244	flush_pipe_ptrs(&(b->fs));
2245	#endif /* IPFW2 */
2246
2247	/ Remove all references to this pipe from flow_sets. /
2248	for (i = `0`; i < HASHSIZE; i++)
2249	SLIST_FOREACH(fs, &flowsethash[i], next)
2250	if (fs->pipe == b) {
2251	printf("dummynet: ++ ref to pipe %d from fs %d\n",
2252	p->pipe_nr, fs->fs_nr);
2253	fs->pipe = NULL ;
2254	purge_flow_set(fs, `0`);
2255	}
2256	fs_remove_from_heap(&ready_heap, &(b->fs));
2257
2258	purge_pipe(b); / remove all data associated to this pipe /
2259	/ remove reference to here from extract_heap and wfq_ready_heap /
2260	pipe_remove_from_heap(&extract_heap, b);
2261	pipe_remove_from_heap(&wfq_ready_heap, b);
2262	lck_mtx_unlock(dn_mutex);
2263
2264	FREE(b, M_DUMMYNET);
2265	} else { / this is a WF2Q queue (dn_flow_set) /
2266	struct dn_flow_set *b;
2267
2268	lck_mtx_lock(dn_mutex);
2269	/ locate set /
2270	b = locate_flowset(p->fs.fs_nr);
2271	if (b == NULL) {
2272	lck_mtx_unlock(dn_mutex);
2273	return EINVAL ; / not found /
2274	}
2275
2276	#if IPFW2
2277	/ remove references to this flow_set from the ip_fw rules. /
2278	flush_pipe_ptrs(b);
2279	#endif /* IPFW2 */
2280
2281	/ Unlink from list of flowsets. /
2282	SLIST_REMOVE( &flowsethash[HASH(b->fs_nr)], b, dn_flow_set, next);
2283
2284	if (b->pipe != NULL) {
2285	/ Update total weight on parent pipe and cleanup parent heaps /
2286	b->pipe->sum -= b->weight * b->backlogged ;
2287	fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
2288	fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
2289	#if 1 /* XXX should i remove from idle_heap as well ? */
2290	fs_remove_from_heap(&(b->pipe->idle_heap), b);
2291	#endif
2292	}
2293	purge_flow_set(b, `1`);
2294	lck_mtx_unlock(dn_mutex);
2295	}
2296	return `0` ;
2297	}
2298
2299	/*
2300	* helper function used to copy data from kernel in DUMMYNET_GET
2301	*/
2302	static
2303	char* dn_copy_set_32(struct dn_flow_set set, char* *bp)
2304	{
2305	int i, copied = `0` ;
2306	struct dn_flow_queue *q;
2307	struct dn_flow_queue_32 qp = (struct* dn_flow_queue_32 *)bp;
2308
2309	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2310
2311	for (i = `0` ; i <= set->rq_size ; i++)
2312	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
2313	if (q->hash_slot != i)
2314	printf("dummynet: ++ at %d: wrong slot (have %d, "
2315	"should be %d)\n", copied, q->hash_slot, i);
2316	if (q->fs != set)
2317	printf("dummynet: ++ at %d: wrong fs ptr "
2318	"(have 0x%llx, should be 0x%llx)\n", i,
2319	(uint64_t)VM_KERNEL_ADDRPERM(q->fs),
2320	(uint64_t)VM_KERNEL_ADDRPERM(set));
2321	copied++ ;
2322	cp_queue_to_32_user( q, qp );
2323	/ cleanup pointers /
2324	qp->next = (user32_addr_t)`0` ;
2325	qp->head = qp->tail = (user32_addr_t)`0` ;
2326	qp->fs = (user32_addr_t)`0` ;
2327	}
2328	if (copied != set->rq_elements)
2329	printf("dummynet: ++ wrong count, have %d should be %d\n",
2330	copied, set->rq_elements);
2331	return (char *)qp ;
2332	}
2333
2334	static
2335	char* dn_copy_set_64(struct dn_flow_set set, char* *bp)
2336	{
2337	int i, copied = `0` ;
2338	struct dn_flow_queue *q;
2339	struct dn_flow_queue_64 qp = (struct* dn_flow_queue_64 *)bp;
2340
2341	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2342
2343	for (i = `0` ; i <= set->rq_size ; i++)
2344	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
2345	if (q->hash_slot != i)
2346	printf("dummynet: ++ at %d: wrong slot (have %d, "
2347	"should be %d)\n", copied, q->hash_slot, i);
2348	if (q->fs != set)
2349	printf("dummynet: ++ at %d: wrong fs ptr "
2350	"(have 0x%llx, should be 0x%llx)\n", i,
2351	(uint64_t)VM_KERNEL_ADDRPERM(q->fs),
2352	(uint64_t)VM_KERNEL_ADDRPERM(set));
2353	copied++ ;
2354	//bcopy(q, qp, sizeof(q));*
2355	cp_queue_to_64_user( q, qp );
2356	/ cleanup pointers /
2357	qp->next = USER_ADDR_NULL ;
2358	qp->head = qp->tail = USER_ADDR_NULL ;
2359	qp->fs = USER_ADDR_NULL ;
2360	}
2361	if (copied != set->rq_elements)
2362	printf("dummynet: ++ wrong count, have %d should be %d\n",
2363	copied, set->rq_elements);
2364	return (char *)qp ;
2365	}
2366
2367	static size_t
2368	dn_calc_size(int is64user)
2369	{
2370	struct dn_flow_set *set ;
2371	struct dn_pipe *p ;
2372	size_t size = `0` ;
2373	size_t pipesize;
2374	size_t queuesize;
2375	size_t setsize;
2376	int i;
2377
2378	LCK_MTX_ASSERT(dn_mutex, LCK_MTX_ASSERT_OWNED);
2379	if ( is64user ){
2380	pipesize = sizeof(struct dn_pipe_64);
2381	queuesize = sizeof(struct dn_flow_queue_64);
2382	setsize = sizeof(struct dn_flow_set_64);
2383	}
2384	else {
2385	pipesize = sizeof(struct dn_pipe_32);
2386	queuesize = sizeof( struct dn_flow_queue_32 );
2387	setsize = sizeof(struct dn_flow_set_32);
2388	}
2389	/*
2390	* compute size of data structures: list of pipes and flow_sets.
2391	*/
2392	for (i = `0`; i < HASHSIZE; i++) {
2393	SLIST_FOREACH(p, &pipehash[i], next)
2394	size += sizeof(*p) +
2395	p->fs.rq_elements * sizeof(struct dn_flow_queue);
2396	SLIST_FOREACH(set, &flowsethash[i], next)
2397	size += sizeof (*set) +
2398	set->rq_elements * sizeof(struct dn_flow_queue);
2399	}
2400	return size;
2401	}
2402
2403	static int
2404	dummynet_get(struct sockopt *sopt)
2405	{
2406	char buf = NULL, bp = NULL; / bp is the "copy-pointer" /
2407	size_t size = `0`;
2408	struct dn_flow_set *set;
2409	struct dn_pipe *p;
2410	int error = `0`, i;
2411	int is64user = `0`;
2412
2413	/ XXX lock held too long /
2414	lck_mtx_lock(dn_mutex);
2415	/*
2416	* XXX: Ugly, but we need to allocate memory with M_WAITOK flag
2417	* and we cannot use this flag while holding a mutex.
2418	*/
2419	if (proc_is64bit(sopt->sopt_p))
2420	is64user = `1`;
2421	for (i = `0`; i < `10`; i++) {
2422	size = dn_calc_size(is64user);
2423	lck_mtx_unlock(dn_mutex);
2424	buf = _MALLOC(size, M_TEMP, M_WAITOK \| M_ZERO);
2425	if (buf == NULL)
2426	return(ENOBUFS);
2427	lck_mtx_lock(dn_mutex);
2428	if (size == dn_calc_size(is64user))
2429	break;
2430	FREE(buf, M_TEMP);
2431	buf = NULL;
2432	}
2433	if (buf == NULL) {
2434	lck_mtx_unlock(dn_mutex);
2435	return(ENOBUFS);
2436	}
2437
2438	bp = buf;
2439	for (i = `0`; i < HASHSIZE; i++) {
2440	SLIST_FOREACH(p, &pipehash[i], next) {
2441	/*
2442	* copy pipe descriptor into *bp, convert delay
2443	* back to ms, then copy the flow_set descriptor(s)
2444	* one at a time. After each flow_set, copy the
2445	* queue descriptor it owns.
2446	*/
2447	if ( is64user ) {
2448	bp = cp_pipe_to_64_user(p,
2449	(struct dn_pipe_64 *)bp);
2450	} else {
2451	bp = cp_pipe_to_32_user(p,
2452	(struct dn_pipe_32 *)bp);
2453	}
2454	}
2455	}
2456	for (i = `0`; i < HASHSIZE; i++) {
2457	SLIST_FOREACH(set, &flowsethash[i], next) {
2458	struct dn_flow_set_64 *fs_bp =
2459	(struct dn_flow_set_64 *)bp ;
2460	cp_flow_set_to_64_user(set, fs_bp);
2461	/ XXX same hack as above /
2462	fs_bp->next = CAST_DOWN(user64_addr_t,
2463	DN_IS_QUEUE);
2464	fs_bp->pipe = USER_ADDR_NULL;
2465	fs_bp->rq = USER_ADDR_NULL ;
2466	bp += sizeof(struct dn_flow_set_64);
2467	bp = dn_copy_set_64( set, bp );
2468	}
2469	}
2470	lck_mtx_unlock(dn_mutex);
2471	error = sooptcopyout(sopt, buf, size);
2472	FREE(buf, M_TEMP);
2473	return(error);
2474	}
2475
2476	/*
2477	* Handler for the various dummynet socket options (get, flush, config, del)
2478	*/
2479	static int
2480	ip_dn_ctl(struct sockopt *sopt)
2481	{
2482	int error = `0` ;
2483	struct dn_pipe *p, tmp_pipe;
2484
2485	/ Disallow sets in really-really secure mode. /
2486	if (sopt->sopt_dir == SOPT_SET && securelevel >= `3`)
2487	return (EPERM);
2488
2489	switch (sopt->sopt_name) {
2490	default :
2491	printf("dummynet: -- unknown option %d", sopt->sopt_name);
2492	return EINVAL ;
2493
2494	case IP_DUMMYNET_GET :
2495	error = dummynet_get(sopt);
2496	break ;
2497
2498	case IP_DUMMYNET_FLUSH :
2499	dummynet_flush() ;
2500	break ;
2501
2502	case IP_DUMMYNET_CONFIGURE :
2503	p = &tmp_pipe ;
2504	if (proc_is64bit(sopt->sopt_p))
2505	error = cp_pipe_from_user_64( sopt, p );
2506	else
2507	error = cp_pipe_from_user_32( sopt, p );
2508
2509	if (error)
2510	break ;
2511	error = config_pipe(p);
2512	break ;
2513
2514	case IP_DUMMYNET_DEL : / remove a pipe or queue /
2515	p = &tmp_pipe ;
2516	if (proc_is64bit(sopt->sopt_p))
2517	error = cp_pipe_from_user_64( sopt, p );
2518	else
2519	error = cp_pipe_from_user_32( sopt, p );
2520	if (error)
2521	break ;
2522
2523	error = delete_pipe(p);
2524	break ;
2525	}
2526	return error ;
2527	}
2528
2529	void
2530	dummynet_init(void)
2531	{
2532	eventhandler_lists_ctxt_init(&dummynet_evhdlr_ctxt);
2533	}
2534
2535	void
2536	ip_dn_init(void)
2537	{
2538	/ setup locks /
2539	dn_mutex_grp_attr = lck_grp_attr_alloc_init();
2540	dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
2541	dn_mutex_attr = lck_attr_alloc_init();
2542	lck_mtx_init(dn_mutex, dn_mutex_grp, dn_mutex_attr);
2543
2544	ready_heap.size = ready_heap.elements = `0` ;
2545	ready_heap.offset = `0` ;
2546
2547	wfq_ready_heap.size = wfq_ready_heap.elements = `0` ;
2548	wfq_ready_heap.offset = `0` ;
2549
2550	extract_heap.size = extract_heap.elements = `0` ;
2551	extract_heap.offset = `0` ;
2552	ip_dn_ctl_ptr = ip_dn_ctl;
2553	ip_dn_io_ptr = dummynet_io;
2554
2555	bzero(&default_rule, sizeof default_rule);
2556	#if IPFIREWALL
2557	default_rule.act_ofs = `0`;
2558	default_rule.rulenum = IPFW_DEFAULT_RULE;
2559	default_rule.cmd_len = `1`;
2560	default_rule.set = RESVD_SET;
2561
2562	default_rule.cmd[`0`].len = `1`;
2563	default_rule.cmd[`0`].opcode =
2564	#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
2565	(`1`) ? O_ACCEPT :
2566	#endif
2567	O_DENY;
2568	#endif
2569	}
2570
2571	struct dn_event_nwk_wq_entry
2572	{
2573	struct nwk_wq_entry nwk_wqe;
2574	struct dummynet_event dn_ev_arg;
2575	};
2576
2577	static void
2578	dummynet_event_callback(void *arg)
2579	{
2580	struct dummynet_event p_dn_ev = (struct* dummynet_event *)arg;
2581
2582	EVENTHANDLER_INVOKE(&dummynet_evhdlr_ctxt, dummynet_event, p_dn_ev);
2583	return;
2584	}
2585
2586	void
2587	dummynet_event_enqueue_nwk_wq_entry(struct dummynet_event *p_dn_event)
2588	{
2589	struct dn_event_nwk_wq_entry *p_dn_ev = NULL;
2590
2591	MALLOC(p_dn_ev, struct dn_event_nwk_wq_entry *,
2592	sizeof(struct dn_event_nwk_wq_entry),
2593	M_NWKWQ, M_WAITOK \| M_ZERO);
2594
2595	p_dn_ev->nwk_wqe.func = dummynet_event_callback;
2596	p_dn_ev->nwk_wqe.is_arg_managed = TRUE;
2597	p_dn_ev->nwk_wqe.arg = &p_dn_ev->dn_ev_arg;
2598
2599	bcopy(p_dn_event, &(p_dn_ev->dn_ev_arg),
2600	sizeof(struct dummynet_event));
2601	nwk_wq_enqueue((struct nwk_wq_entry*)p_dn_ev);
2602	}
2603

Browse the source code of xnu/bsd/netinet/ip_dummynet.c