ip_dummynet.c source code [xnu/bsd/netinet/ip_dummynet.c]

1	/*
2	* Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
30	* Portions Copyright (c) 2000 Akamba Corp.
31	* All rights reserved
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	*
42	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52	* SUCH DAMAGE.
53	*
54	* $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
55	*/
56
57	#define DUMMYNET_DEBUG
58
59	/*
60	* This module implements IP dummynet, a bandwidth limiter/delay emulator
61	* Description of the data structures used is in ip_dummynet.h
62	* Here you mainly find the following blocks of code:
63	* + variable declarations;
64	* + heap management functions;
65	* + scheduler and dummynet functions;
66	* + configuration and initialization.
67	*
68	* NOTA BENE: critical sections are protected by the "dummynet lock".
69	*
70	* Most important Changes:
71	*
72	* 010124: Fixed WF2Q behaviour
73	* 010122: Fixed spl protection.
74	* 000601: WF2Q support
75	* 000106: large rewrite, use heaps to handle very many pipes.
76	* 980513: initial release
77	*
78	* include files marked with XXX are probably not needed
79	*/
80
81	#include <sys/param.h>
82	#include <sys/systm.h>
83	#include <sys/malloc.h>
84	#include <sys/mbuf.h>
85	#include <sys/queue.h> /* XXX */
86	#include <sys/kernel.h>
87	#include <sys/random.h>
88	#include <sys/socket.h>
89	#include <sys/socketvar.h>
90	#include <sys/time.h>
91	#include <sys/sysctl.h>
92	#include <net/if.h>
93	#include <net/route.h>
94	#include <net/kpi_protocol.h>
95	#if DUMMYNET
96	#include <net/kpi_protocol.h>
97	#endif /* DUMMYNET */
98	#include <net/nwk_wq.h>
99	#include <net/pfvar.h>
100	#include <netinet/in.h>
101	#include <netinet/in_systm.h>
102	#include <netinet/in_var.h>
103	#include <netinet/ip.h>
104	#include <netinet/ip_dummynet.h>
105	#include <netinet/ip_var.h>
106
107	#include <netinet/ip6.h> /* for ip6_input, ip6_output prototypes */
108	#include <netinet6/ip6_var.h>
109
110	#include <stdbool.h>
111	#include <net/sockaddr_utils.h>
112
113	/*
114	* We keep a private variable for the simulation time, but we could
115	* probably use an existing one ("softticks" in sys/kern/kern_timer.c)
116	*/
117	static dn_key curr_time = `0`; / current simulation time /
118
119	/ this is for the timer that fires to call dummynet() - we only enable the timer when*
120	* there are packets to process, otherwise it's disabled */
121	static int timer_enabled = `0`;
122
123	static int dn_hash_size = `64`; / default hash size /
124
125	/ statistics on number of queue searches and search steps /
126	static int searches, search_steps;
127	static int pipe_expire = `1`; / expire queue if empty /
128	static int dn_max_ratio = `16`; / max queues/buckets ratio /
129
130	static int red_lookup_depth = `256`; / RED - default lookup table depth /
131	static int red_avg_pkt_size = `512`; / RED - default medium packet size /
132	static int red_max_pkt_size = `1500`; / RED - default max packet size /
133
134	static int serialize = `0`;
135
136	/*
137	* Three heaps contain queues and pipes that the scheduler handles:
138	*
139	* ready_heap contains all dn_flow_queue related to fixed-rate pipes.
140	*
141	* wfq_ready_heap contains the pipes associated with WF2Q flows
142	*
143	* extract_heap contains pipes associated with delay lines.
144	*
145	*/
146	static struct dn_heap ready_heap, extract_heap, wfq_ready_heap;
147
148	static int heap_init(struct dn_heap h, int* size);
149	static int heap_insert(struct dn_heap h, dn_key key1, void* *p);
150	static void heap_extract(struct dn_heap h, void* *obj);
151
152
153	static void transmit_event(struct dn_pipe pipe, struct* mbuf **head,
154	struct mbuf **tail);
155	static void ready_event(struct dn_flow_queue q, struct* mbuf **head,
156	struct mbuf **tail);
157	static void ready_event_wfq(struct dn_pipe p, struct* mbuf **head,
158	struct mbuf **tail);
159
160	/*
161	* Packets are retrieved from queues in Dummynet in chains instead of
162	* packet-by-packet. The entire list of packets is first dequeued and
163	* sent out by the following function.
164	*/
165	static void dummynet_send(struct mbuf *m);
166
/*
 * Number of buckets in the pipe and flowset hash tables.
 * Must be a power of two: HASH() masks with (HASHSIZE - 1).
 */
#define HASHSIZE        16
/*
 * Fold a pipe/flowset number into a bucket index in [0, HASHSIZE).
 * The mask is derived from HASHSIZE so the two cannot drift apart.
 */
#define HASH(num)       ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & (HASHSIZE - 1))
169	static struct dn_pipe_head pipehash[HASHSIZE]; / all pipes /
170	static struct dn_flow_set_head flowsethash[HASHSIZE]; / all flowsets /
171
#ifdef SYSCTL_NODE
/*
 * net.inet.ip.dummynet.* sysctl tree. Each entry exposes one of the
 * file-scope tunables or counters; CTLFLAG_RD entries are read-only.
 */
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size");
SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time,
    CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
    CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
    CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty");
/* Note: the OID is named max_chain_len but it is backed by dn_max_ratio. */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0,
    "Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size");
#endif
199
#ifdef DUMMYNET_DEBUG
/* Runtime switch for the debugging printfs issued through DPRINTF(). */
int     dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED, &dummynet_debug,
    0, "control debugging printfs");
#endif
/*
 * DPRINTF((fmt, args...)) - note the double parentheses: X is a complete,
 * parenthesized printf argument list. Wrapped in do { } while (0) so the
 * macro behaves as a single statement and cannot capture a following else.
 */
#define DPRINTF(X)      do { if (dummynet_debug) printf X; } while (0)
#else
#define DPRINTF(X)
#endif
210
211	/ dummynet lock /
212	static LCK_GRP_DECLARE(dn_mutex_grp, "dn");
213	static LCK_MTX_DECLARE(dn_mutex, &dn_mutex_grp);
214
215	static int config_pipe(struct dn_pipe *p);
216	static int ip_dn_ctl(struct sockopt *sopt);
217
218	static void dummynet(void *);
219	static void dummynet_flush(void);
220	void dummynet_drain(void);
221	static ip_dn_io_t dummynet_io;
222
223	static void cp_flow_set_to_64_user(struct dn_flow_set set, struct* dn_flow_set_64 *fs_bp);
224	static void cp_queue_to_64_user( struct dn_flow_queue q, struct* dn_flow_queue_64 *qp);
225	static char cp_pipe_to_64_user(struct* dn_pipe p, struct* dn_pipe_64 *pipe_bp);
226	static char* dn_copy_set_64(struct dn_flow_set set, char* *bp);
227	static int cp_pipe_from_user_64( struct sockopt sopt, struct* dn_pipe *p );
228
229	static void cp_flow_set_to_32_user(struct dn_flow_set set, struct* dn_flow_set_32 *fs_bp);
230	static void cp_queue_to_32_user( struct dn_flow_queue q, struct* dn_flow_queue_32 *qp);
231	static char cp_pipe_to_32_user(struct* dn_pipe p, struct* dn_pipe_32 *pipe_bp);
232	static char* dn_copy_set_32(struct dn_flow_set set, char* *bp);
233	static int cp_pipe_from_user_32( struct sockopt sopt, struct* dn_pipe *p );
234
235	static struct m_tag * m_tag_kalloc_dummynet(u_int32_t id, u_int16_t type, uint16_t len, int wait);
236	static void m_tag_kfree_dummynet(struct m_tag *tag);
237
238	struct eventhandler_lists_ctxt dummynet_evhdlr_ctxt;
239
/*
 * Return a non-negative 31-bit value: a 32-bit word is filled by
 * read_frandom() and its top bit is cleared, so the result is always
 * in the range 0 .. 0x7FFFFFFF.
 */
uint32_t
my_random(void)
{
	uint32_t val;

	read_frandom(&val, sizeof(val));
	val &= 0x7FFFFFFF;

	return val;
}
249
250	/*
251	* Heap management functions.
252	*
253	* In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
254	* Some macros help finding parent/children so we can optimize them.
255	*
256	* heap_init() is called to expand the heap when needed.
257	* Increment size in blocks of 16 entries.
258	* XXX failure to allocate a new element is a pretty bad failure
259	* as we basically stall a whole queue forever!!
260	* Returns 1 on error, 0 on success
261	*/
/* Index of the parent of node x (x > 0). */
#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
/* Index of the left child of node x. */
#define HEAP_LEFT(x) ( 2*(x) + 1 )
/* Non-zero when x is an odd index, i.e. a left child. */
#define HEAP_IS_LEFT(x) ( (x) & 1 )
/* Index of the right child of node x. */
#define HEAP_RIGHT(x) ( 2*(x) + 2 )
/*
 * Exchange lvalues a and b using buffer as scratch storage.
 * Wrapped in do { } while (0) so it acts as one statement even in an
 * unbraced if/else; a and b are evaluated twice each.
 */
#define HEAP_SWAP(a, b, buffer) do { (buffer) = (a) ; (a) = (b) ; (b) = (buffer) ; } while (0)
/*
 * Used as a mask: heap_init() rounds sizes up with
 * (n + HEAP_INCREMENT) & ~HEAP_INCREMENT, i.e. to a multiple of 16.
 */
#define HEAP_INCREMENT 15
268
269
270	int
271	cp_pipe_from_user_32( struct sockopt sopt, struct* dn_pipe *p )
272	{
273	struct dn_pipe_32 user_pipe_32;
274	int error = `0`;
275
276	error = sooptcopyin(sopt, &user_pipe_32, len: sizeof(struct dn_pipe_32), minlen: sizeof(struct dn_pipe_32));
277	if (!error) {
278	p->pipe_nr = user_pipe_32.pipe_nr;
279	p->bandwidth = user_pipe_32.bandwidth;
280	p->delay = user_pipe_32.delay;
281	p->V = user_pipe_32.V;
282	p->sum = user_pipe_32.sum;
283	p->numbytes = user_pipe_32.numbytes;
284	p->sched_time = user_pipe_32.sched_time;
285	bcopy( src: user_pipe_32.if_name, dst: p->if_name, IFNAMSIZ);
286	p->if_name[IFNAMSIZ - `1`] = `'\0'`;
287	p->ready = user_pipe_32.ready;
288
289	p->fs.fs_nr = user_pipe_32.fs.fs_nr;
290	p->fs.flags_fs = user_pipe_32.fs.flags_fs;
291	p->fs.parent_nr = user_pipe_32.fs.parent_nr;
292	p->fs.weight = user_pipe_32.fs.weight;
293	p->fs.qsize = user_pipe_32.fs.qsize;
294	p->fs.plr = user_pipe_32.fs.plr;
295	p->fs.flow_mask = user_pipe_32.fs.flow_mask;
296	p->fs.rq_size = user_pipe_32.fs.rq_size;
297	p->fs.rq_elements = user_pipe_32.fs.rq_elements;
298	p->fs.last_expired = user_pipe_32.fs.last_expired;
299	p->fs.backlogged = user_pipe_32.fs.backlogged;
300	p->fs.w_q = user_pipe_32.fs.w_q;
301	p->fs.max_th = user_pipe_32.fs.max_th;
302	p->fs.min_th = user_pipe_32.fs.min_th;
303	p->fs.max_p = user_pipe_32.fs.max_p;
304	p->fs.c_1 = user_pipe_32.fs.c_1;
305	p->fs.c_2 = user_pipe_32.fs.c_2;
306	p->fs.c_3 = user_pipe_32.fs.c_3;
307	p->fs.c_4 = user_pipe_32.fs.c_4;
308	p->fs.lookup_depth = user_pipe_32.fs.lookup_depth;
309	p->fs.lookup_step = user_pipe_32.fs.lookup_step;
310	p->fs.lookup_weight = user_pipe_32.fs.lookup_weight;
311	p->fs.avg_pkt_size = user_pipe_32.fs.avg_pkt_size;
312	p->fs.max_pkt_size = user_pipe_32.fs.max_pkt_size;
313	}
314	return error;
315	}
316
317
318	int
319	cp_pipe_from_user_64( struct sockopt sopt, struct* dn_pipe *p )
320	{
321	struct dn_pipe_64 user_pipe_64;
322	int error = `0`;
323
324	error = sooptcopyin(sopt, &user_pipe_64, len: sizeof(struct dn_pipe_64), minlen: sizeof(struct dn_pipe_64));
325	if (!error) {
326	p->pipe_nr = user_pipe_64.pipe_nr;
327	p->bandwidth = user_pipe_64.bandwidth;
328	p->delay = user_pipe_64.delay;
329	p->V = user_pipe_64.V;
330	p->sum = user_pipe_64.sum;
331	p->numbytes = user_pipe_64.numbytes;
332	p->sched_time = user_pipe_64.sched_time;
333	bcopy( src: user_pipe_64.if_name, dst: p->if_name, IFNAMSIZ);
334	p->if_name[IFNAMSIZ - `1`] = `'\0'`;
335	p->ready = user_pipe_64.ready;
336
337	p->fs.fs_nr = user_pipe_64.fs.fs_nr;
338	p->fs.flags_fs = user_pipe_64.fs.flags_fs;
339	p->fs.parent_nr = user_pipe_64.fs.parent_nr;
340	p->fs.weight = user_pipe_64.fs.weight;
341	p->fs.qsize = user_pipe_64.fs.qsize;
342	p->fs.plr = user_pipe_64.fs.plr;
343	p->fs.flow_mask = user_pipe_64.fs.flow_mask;
344	p->fs.rq_size = user_pipe_64.fs.rq_size;
345	p->fs.rq_elements = user_pipe_64.fs.rq_elements;
346	p->fs.last_expired = user_pipe_64.fs.last_expired;
347	p->fs.backlogged = user_pipe_64.fs.backlogged;
348	p->fs.w_q = user_pipe_64.fs.w_q;
349	p->fs.max_th = user_pipe_64.fs.max_th;
350	p->fs.min_th = user_pipe_64.fs.min_th;
351	p->fs.max_p = user_pipe_64.fs.max_p;
352	p->fs.c_1 = user_pipe_64.fs.c_1;
353	p->fs.c_2 = user_pipe_64.fs.c_2;
354	p->fs.c_3 = user_pipe_64.fs.c_3;
355	p->fs.c_4 = user_pipe_64.fs.c_4;
356	p->fs.lookup_depth = user_pipe_64.fs.lookup_depth;
357	p->fs.lookup_step = user_pipe_64.fs.lookup_step;
358	p->fs.lookup_weight = user_pipe_64.fs.lookup_weight;
359	p->fs.avg_pkt_size = user_pipe_64.fs.avg_pkt_size;
360	p->fs.max_pkt_size = user_pipe_64.fs.max_pkt_size;
361	}
362	return error;
363	}
364
365	static void
366	cp_flow_set_to_32_user(struct dn_flow_set set, struct* dn_flow_set_32 *fs_bp)
367	{
368	fs_bp->fs_nr = set->fs_nr;
369	fs_bp->flags_fs = set->flags_fs;
370	fs_bp->parent_nr = set->parent_nr;
371	fs_bp->weight = set->weight;
372	fs_bp->qsize = set->qsize;
373	fs_bp->plr = set->plr;
374	fs_bp->flow_mask = set->flow_mask;
375	fs_bp->rq_size = set->rq_size;
376	fs_bp->rq_elements = set->rq_elements;
377	fs_bp->last_expired = set->last_expired;
378	fs_bp->backlogged = set->backlogged;
379	fs_bp->w_q = set->w_q;
380	fs_bp->max_th = set->max_th;
381	fs_bp->min_th = set->min_th;
382	fs_bp->max_p = set->max_p;
383	fs_bp->c_1 = set->c_1;
384	fs_bp->c_2 = set->c_2;
385	fs_bp->c_3 = set->c_3;
386	fs_bp->c_4 = set->c_4;
387	fs_bp->w_q_lookup = CAST_DOWN_EXPLICIT(user32_addr_t, VM_KERNEL_ADDRHIDE(set->w_q_lookup));
388	fs_bp->lookup_depth = set->lookup_depth;
389	fs_bp->lookup_step = set->lookup_step;
390	fs_bp->lookup_weight = set->lookup_weight;
391	fs_bp->avg_pkt_size = set->avg_pkt_size;
392	fs_bp->max_pkt_size = set->max_pkt_size;
393	}
394
395	static void
396	cp_flow_set_to_64_user(struct dn_flow_set set, struct* dn_flow_set_64 *fs_bp)
397	{
398	fs_bp->fs_nr = set->fs_nr;
399	fs_bp->flags_fs = set->flags_fs;
400	fs_bp->parent_nr = set->parent_nr;
401	fs_bp->weight = set->weight;
402	fs_bp->qsize = set->qsize;
403	fs_bp->plr = set->plr;
404	fs_bp->flow_mask = set->flow_mask;
405	fs_bp->rq_size = set->rq_size;
406	fs_bp->rq_elements = set->rq_elements;
407	fs_bp->last_expired = set->last_expired;
408	fs_bp->backlogged = set->backlogged;
409	fs_bp->w_q = set->w_q;
410	fs_bp->max_th = set->max_th;
411	fs_bp->min_th = set->min_th;
412	fs_bp->max_p = set->max_p;
413	fs_bp->c_1 = set->c_1;
414	fs_bp->c_2 = set->c_2;
415	fs_bp->c_3 = set->c_3;
416	fs_bp->c_4 = set->c_4;
417	fs_bp->w_q_lookup = CAST_DOWN(user64_addr_t, VM_KERNEL_ADDRHIDE(set->w_q_lookup));
418	fs_bp->lookup_depth = set->lookup_depth;
419	fs_bp->lookup_step = set->lookup_step;
420	fs_bp->lookup_weight = set->lookup_weight;
421	fs_bp->avg_pkt_size = set->avg_pkt_size;
422	fs_bp->max_pkt_size = set->max_pkt_size;
423	}
424
425	static
426	void
427	cp_queue_to_32_user( struct dn_flow_queue q, struct* dn_flow_queue_32 *qp)
428	{
429	qp->id = q->id;
430	qp->len = q->len;
431	qp->len_bytes = q->len_bytes;
432	qp->numbytes = q->numbytes;
433	qp->tot_pkts = q->tot_pkts;
434	qp->tot_bytes = q->tot_bytes;
435	qp->drops = q->drops;
436	qp->hash_slot = q->hash_slot;
437	qp->avg = q->avg;
438	qp->count = q->count;
439	qp->random = q->random;
440	qp->q_time = (u_int32_t)q->q_time;
441	qp->heap_pos = q->heap_pos;
442	qp->sched_time = q->sched_time;
443	qp->S = q->S;
444	qp->F = q->F;
445	}
446
447	static
448	void
449	cp_queue_to_64_user( struct dn_flow_queue q, struct* dn_flow_queue_64 *qp)
450	{
451	qp->id = q->id;
452	qp->len = q->len;
453	qp->len_bytes = q->len_bytes;
454	qp->numbytes = q->numbytes;
455	qp->tot_pkts = q->tot_pkts;
456	qp->tot_bytes = q->tot_bytes;
457	qp->drops = q->drops;
458	qp->hash_slot = q->hash_slot;
459	qp->avg = q->avg;
460	qp->count = q->count;
461	qp->random = q->random;
462	qp->q_time = (u_int32_t)q->q_time;
463	qp->heap_pos = q->heap_pos;
464	qp->sched_time = q->sched_time;
465	qp->S = q->S;
466	qp->F = q->F;
467	}
468
469	static
470	char *
471	cp_pipe_to_32_user(struct dn_pipe p, struct* dn_pipe_32 *pipe_bp)
472	{
473	char *bp;
474
475	pipe_bp->pipe_nr = p->pipe_nr;
476	pipe_bp->bandwidth = p->bandwidth;
477	pipe_bp->delay = p->delay;
478	bcopy( src: &(p->scheduler_heap), dst: &(pipe_bp->scheduler_heap), n: sizeof(struct dn_heap_32));
479	pipe_bp->scheduler_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, VM_KERNEL_ADDRHIDE(pipe_bp->scheduler_heap.p));
480	bcopy( src: &(p->not_eligible_heap), dst: &(pipe_bp->not_eligible_heap), n: sizeof(struct dn_heap_32));
481	pipe_bp->not_eligible_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, VM_KERNEL_ADDRHIDE(pipe_bp->not_eligible_heap.p));
482	bcopy( src: &(p->idle_heap), dst: &(pipe_bp->idle_heap), n: sizeof(struct dn_heap_32));
483	pipe_bp->idle_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, VM_KERNEL_ADDRHIDE(pipe_bp->idle_heap.p));
484	pipe_bp->V = p->V;
485	pipe_bp->sum = p->sum;
486	pipe_bp->numbytes = p->numbytes;
487	pipe_bp->sched_time = p->sched_time;
488	bcopy( src: p->if_name, dst: pipe_bp->if_name, IFNAMSIZ);
489	pipe_bp->ifp = CAST_DOWN_EXPLICIT(user32_addr_t, VM_KERNEL_ADDRHIDE(p->ifp));
490	pipe_bp->ready = p->ready;
491
492	cp_flow_set_to_32_user( set: &(p->fs), fs_bp: &(pipe_bp->fs));
493
494	pipe_bp->delay = (pipe_bp->delay * `1000`) / (hz * `10`);
495	/*
496	* XXX the following is a hack based on ->next being the
497	* first field in dn_pipe and dn_flow_set. The correct
498	* solution would be to move the dn_flow_set to the beginning
499	* of struct dn_pipe.
500	*/
501	pipe_bp->next = CAST_DOWN_EXPLICIT( user32_addr_t, DN_IS_PIPE );
502	/ clean pointers /
503	pipe_bp->head = pipe_bp->tail = (user32_addr_t) `0`;
504	pipe_bp->fs.next = (user32_addr_t)`0`;
505	pipe_bp->fs.pipe = (user32_addr_t)`0`;
506	pipe_bp->fs.rq = (user32_addr_t)`0`;
507	bp = ((char )pipe_bp) + sizeof(struct* dn_pipe_32);
508	return dn_copy_set_32( set: &(p->fs), bp);
509	}
510
511	static
512	char *
513	cp_pipe_to_64_user(struct dn_pipe p, struct* dn_pipe_64 *pipe_bp)
514	{
515	char *bp;
516
517	pipe_bp->pipe_nr = p->pipe_nr;
518	pipe_bp->bandwidth = p->bandwidth;
519	pipe_bp->delay = p->delay;
520	bcopy( src: &(p->scheduler_heap), dst: &(pipe_bp->scheduler_heap), n: sizeof(struct dn_heap_64));
521	pipe_bp->scheduler_heap.p = CAST_DOWN(user64_addr_t, VM_KERNEL_ADDRHIDE(pipe_bp->scheduler_heap.p));
522	bcopy( src: &(p->not_eligible_heap), dst: &(pipe_bp->not_eligible_heap), n: sizeof(struct dn_heap_64));
523	pipe_bp->not_eligible_heap.p = CAST_DOWN(user64_addr_t, VM_KERNEL_ADDRHIDE(pipe_bp->not_eligible_heap.p));
524	bcopy( src: &(p->idle_heap), dst: &(pipe_bp->idle_heap), n: sizeof(struct dn_heap_64));
525	pipe_bp->idle_heap.p = CAST_DOWN(user64_addr_t, VM_KERNEL_ADDRHIDE(pipe_bp->idle_heap.p));
526	pipe_bp->V = p->V;
527	pipe_bp->sum = p->sum;
528	pipe_bp->numbytes = p->numbytes;
529	pipe_bp->sched_time = p->sched_time;
530	bcopy( src: p->if_name, dst: pipe_bp->if_name, IFNAMSIZ);
531	pipe_bp->ifp = CAST_DOWN(user64_addr_t, VM_KERNEL_ADDRHIDE(p->ifp));
532	pipe_bp->ready = p->ready;
533
534	cp_flow_set_to_64_user( set: &(p->fs), fs_bp: &(pipe_bp->fs));
535
536	pipe_bp->delay = (pipe_bp->delay * `1000`) / (hz * `10`);
537	/*
538	* XXX the following is a hack based on ->next being the
539	* first field in dn_pipe and dn_flow_set. The correct
540	* solution would be to move the dn_flow_set to the beginning
541	* of struct dn_pipe.
542	*/
543	pipe_bp->next = CAST_DOWN( user64_addr_t, DN_IS_PIPE );
544	/ clean pointers /
545	pipe_bp->head = pipe_bp->tail = USER_ADDR_NULL;
546	pipe_bp->fs.next = USER_ADDR_NULL;
547	pipe_bp->fs.pipe = USER_ADDR_NULL;
548	pipe_bp->fs.rq = USER_ADDR_NULL;
549	bp = ((char )pipe_bp) + sizeof(struct* dn_pipe_64);
550	return dn_copy_set_64( set: &(p->fs), bp);
551	}
552
553	static int
554	heap_init(struct dn_heap h, int* new_size)
555	{
556	struct dn_heap_entry *p;
557
558	if (h->size >= new_size) {
559	printf("dummynet: heap_init, Bogus call, have %d want %d\n",
560	h->size, new_size);
561	return `0`;
562	}
563	new_size = (new_size + HEAP_INCREMENT) & ~HEAP_INCREMENT;
564	p = krealloc_type(struct dn_heap_entry, h->size, new_size,
565	h->p, Z_NOWAIT \| Z_ZERO);
566	if (p == NULL) {
567	printf("dummynet: heap_init, resize %d failed\n", new_size );
568	return `1`; / error /
569	}
570	h->p = p;
571	h->size = new_size;
572	return `0`;
573	}
574
575	/*
576	* Insert element in heap. Normally, p != NULL, we insert p in
577	* a new position and bubble up. If p == NULL, then the element is
578	* already in place, and key is the position where to start the
579	* bubble-up.
580	* Returns 1 on failure (cannot allocate new heap entry)
581	*
582	* If offset > 0 the position (index, int) of the element in the heap is
583	* also stored in the element itself at the given offset in bytes.
584	*/
/*
 * When the heap tracks positions (offset > 0), store index `node` as an
 * int inside the object held at that heap slot, `offset` bytes from its
 * start. Wrapped in do { } while (0) so the internal `if` cannot capture
 * a caller's `else`; `heap` and `node` are evaluated more than once.
 */
#define SET_OFFSET(heap, node) \
	do { \
		if ((heap)->offset > 0) { \
			*((int *)(void *)((char *)((heap)->p[(node)].object) + (heap)->offset)) = (node); \
		} \
	} while (0)
588	/*
589	* RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
590	*/
/*
 * When the heap tracks positions (offset > 0), overwrite the index stored
 * inside the object at heap slot `node` with -1, an invalid position.
 * Wrapped in do { } while (0) so it acts as a single statement.
 */
#define RESET_OFFSET(heap, node) \
	do { \
		if ((heap)->offset > 0) { \
			*((int *)(void *)((char *)((heap)->p[(node)].object) + (heap)->offset)) = -1; \
		} \
	} while (0)
594	static int
595	heap_insert(struct dn_heap h, dn_key key1, void* *p)
596	{
597	int son = h->elements;
598
599	if (p == NULL) { / data already there, set starting point /
600	VERIFY(key1 < INT_MAX);
601	son = (int)key1;
602	} else { / insert new element at the end, possibly resize /
603	son = h->elements;
604	if (son == h->size) { / need resize... /
605	if (heap_init(h, new_size: h->elements + `1`)) {
606	return `1`; / failure... /
607	}
608	}
609	h->p[son].object = p;
610	h->p[son].key = key1;
611	h->elements++;
612	}
613	while (son > `0`) { / bubble up /
614	int father = HEAP_FATHER(son);
615	struct dn_heap_entry tmp;
616
617	if (DN_KEY_LT( h->p[father].key, h->p[son].key )) {
618	break; / found right position /
619	}
620	/ son smaller than father, swap and repeat /
621	HEAP_SWAP(h->p[son], h->p[father], tmp);
622	SET_OFFSET(h, son);
623	son = father;
624	}
625	SET_OFFSET(h, son);
626	return `0`;
627	}
628
629	/*
630	* remove top element from heap, or obj if obj != NULL
631	*/
632	static void
633	heap_extract(struct dn_heap h, void* *obj)
634	{
635	int child, father, maxelt = h->elements - `1`;
636
637	if (maxelt < `0`) {
638	printf("dummynet: warning, extract from empty heap 0x%llx\n",
639	(uint64_t)VM_KERNEL_ADDRPERM(h));
640	return;
641	}
642	father = `0`; / default: move up smallest child /
643	if (obj != NULL) { / extract specific element, index is at offset /
644	if (h->offset <= `0`) {
645	panic("dummynet: heap_extract from middle not supported on this heap!!!");
646	}
647	father = ((int* )(void* )((char* *)obj + h->offset));
648	if (father < `0` \|\| father >= h->elements) {
649	printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
650	father, h->elements);
651	panic("dummynet: heap_extract");
652	}
653	}
654	RESET_OFFSET(h, father);
655	child = HEAP_LEFT(father); / left child /
656	while (child <= maxelt) { / valid entry /
657	if (child != maxelt && DN_KEY_LT(h->p[child + `1`].key, h->p[child].key)) {
658	child = child + `1`; / take right child, otherwise left /
659	}
660	h->p[father] = h->p[child];
661	SET_OFFSET(h, father);
662	father = child;
663	child = HEAP_LEFT(child); / left child for next loop /
664	}
665	h->elements--;
666	if (father != maxelt) {
667	/*
668	* Fill hole with last entry and bubble up, reusing the insert code
669	*/
670	h->p[father] = h->p[maxelt];
671	heap_insert(h, key1: father, NULL); / this one cannot fail /
672	}
673	}
674
675	/*
676	* heapify() will reorganize data inside an array to maintain the
677	* heap property. It is needed when we delete a bunch of entries.
678	*/
679	static void
680	heapify(struct dn_heap *h)
681	{
682	int i;
683
684	for (i = `0`; i < h->elements; i++) {
685	heap_insert(h, key1: i, NULL);
686	}
687	}
688
689	/*
690	* cleanup the heap and free data structure
691	*/
692	static void
693	heap_free(struct dn_heap *h)
694	{
695	kfree_type(struct dn_heap_entry, h->size, h->p);
696	bzero(s: h, n: sizeof(*h));
697	}
698
699	/*
700	* --- end of heap management functions ---
701	*/
702
703	/*
704	* Return the mbuf tag holding the dummynet state. As an optimization
705	* this is assumed to be the first tag on the list. If this turns out
706	* wrong we'll need to search the list.
707	*/
708	static struct dn_pkt_tag *
709	dn_tag_get(struct mbuf *m)
710	{
711	struct m_tag *mtag = m_tag_first(m);
712
713	if (!(mtag != NULL &&
714	mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
715	mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET)) {
716	panic("packet on dummynet queue w/o dummynet tag: 0x%llx",
717	(uint64_t)VM_KERNEL_ADDRPERM(m));
718	}
719
720	return (struct dn_pkt_tag *)(mtag->m_tag_data);
721	}
722
723	/*
724	* Scheduler functions:
725	*
726	* transmit_event() is called when the delay-line needs to enter
727	* the scheduler, either because of existing pkts getting ready,
728	* or new packets entering the queue. The event handled is the delivery
729	* time of the packet.
730	*
731	* ready_event() does something similar with fixed-rate queues, and the
732	* event handled is the finish time of the head pkt.
733	*
734	* wfq_ready_event() does something similar with WF2Q queues, and the
735	* event handled is the start time of the head pkt.
736	*
737	* In all cases, we make sure that the data structures are consistent
738	* before passing pkts out, because this might trigger recursive
739	* invocations of the procedures.
740	*/
741	static void
742	transmit_event(struct dn_pipe pipe, struct* mbuf head, struct mbuf tail)
743	{
744	struct mbuf *m;
745	struct dn_pkt_tag *pkt = NULL;
746	u_int64_t schedule_time;
747
748	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
749	ASSERT(serialize >= `0`);
750	if (serialize == `0`) {
751	while ((m = pipe->head) != NULL) {
752	pkt = dn_tag_get(m);
753	if (!DN_KEY_LEQ(pkt->dn_output_time, curr_time)) {
754	break;
755	}
756
757	pipe->head = m->m_nextpkt;
758	if (*tail != NULL) {
759	(*tail)->m_nextpkt = m;
760	} else {
761	*head = m;
762	}
763	*tail = m;
764	}
765
766	if (*tail != NULL) {
767	(*tail)->m_nextpkt = NULL;
768	}
769	}
770
771	schedule_time = pkt == NULL \|\| DN_KEY_LEQ(pkt->dn_output_time, curr_time) ?
772	curr_time + `1` : pkt->dn_output_time;
773
774	/ if there are leftover packets, put the pipe into the heap for next ready event /
775	if ((m = pipe->head) != NULL) {
776	pkt = dn_tag_get(m);
777	/ XXX should check errors on heap_insert, by draining the*
778	* whole pipe p and hoping in the future we are more successful
779	*/
780	heap_insert(h: &extract_heap, key1: schedule_time, p: pipe);
781	}
782	}
783
784	/*
785	* the following macro computes how many ticks we have to wait
786	* before being able to transmit a packet. The credit is taken from
787	* either a pipe (WF2Q) or a flow_queue (per-flow queueing)
788	*/
789
/*
 * hz is 100, which gives a granularity of 10ms in the old timer.
 * The timer has been changed to fire every 1ms, so the use of
 * hz has been modified here. All instances of hz have been left
 * in place but adjusted by a factor of 10 so that hz is functionally
 * equal to 1000.
 *
 * SET_TICKS(_m, q, p): packet length in bits scaled by (hz * 10), minus
 * the credit already in (q)->numbytes, divided by (p)->bandwidth and
 * rounded up. (p)->bandwidth must be non-zero. The trailing ';' is kept
 * from the original definition, so the macro can only be used as the
 * right-hand side of a complete statement.
 */
#define SET_TICKS(_m, q, p)     \
    (((_m)->m_pkthdr.len * 8 * (hz * 10) - (q)->numbytes + (p)->bandwidth - 1) / \
    (p)->bandwidth) ;
799
800	/*
801	* extract pkt from queue, compute output time (could be now)
802	* and put into delay line (p_queue)
803	*/
804	static void
805	move_pkt(struct mbuf pkt, struct* dn_flow_queue *q,
806	struct dn_pipe p, int* len)
807	{
808	struct dn_pkt_tag *dt = dn_tag_get(m: pkt);
809
810	q->head = pkt->m_nextpkt;
811	q->len--;
812	q->len_bytes -= len;
813
814	dt->dn_output_time = curr_time + p->delay;
815
816	if (p->head == NULL) {
817	p->head = pkt;
818	} else {
819	p->tail->m_nextpkt = pkt;
820	}
821	p->tail = pkt;
822	p->tail->m_nextpkt = NULL;
823	}
824
825	/*
826	* ready_event() is invoked every time the queue must enter the
827	* scheduler, either because the first packet arrives, or because
828	* a previously scheduled event fired.
829	* On invokation, drain as many pkts as possible (could be 0) and then
830	* if there are leftover packets reinsert the pkt in the scheduler.
831	*/
832	static void
833	ready_event(struct dn_flow_queue q, struct* mbuf head, struct mbuf tail)
834	{
835	struct mbuf *pkt;
836	struct dn_pipe *p = q->fs->pipe;
837	int p_was_empty;
838
839	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
840
841	if (p == NULL) {
842	printf("dummynet: ready_event pipe is gone\n");
843	return;
844	}
845	p_was_empty = (p->head == NULL);
846
847	/*
848	* schedule fixed-rate queues linked to this pipe:
849	* Account for the bw accumulated since last scheduling, then
850	* drain as many pkts as allowed by q->numbytes and move to
851	* the delay line (in p) computing output time.
852	* bandwidth==0 (no limit) means we can drain the whole queue,
853	* setting len_scaled = 0 does the job.
854	*/
855	q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
856	while ((pkt = q->head) != NULL) {
857	int len = pkt->m_pkthdr.len;
858	int len_scaled = p->bandwidth ? len * `8` * (hz * `10`) : `0`;
859	if (len_scaled > q->numbytes) {
860	break;
861	}
862	q->numbytes -= len_scaled;
863	move_pkt(pkt, q, p, len);
864	}
865	/*
866	* If we have more packets queued, schedule next ready event
867	* (can only occur when bandwidth != 0, otherwise we would have
868	* flushed the whole queue in the previous loop).
869	* To this purpose we record the current time and compute how many
870	* ticks to go for the finish time of the packet.
871	*/
872	if ((pkt = q->head) != NULL) { / this implies bandwidth != 0 /
873	dn_key t = SET_TICKS(pkt, q, p); / ticks i have to wait /
874	q->sched_time = curr_time;
875	heap_insert(h: &ready_heap, key1: curr_time + t, p: (void *)q );
876	/ XXX should check errors on heap_insert, and drain the whole*
877	* queue on error hoping next time we are luckier.
878	*/
879	} else { / RED needs to know when the queue becomes empty /
880	q->q_time = curr_time;
881	q->numbytes = `0`;
882	}
883	/*
884	* If the delay line was empty call transmit_event(p) now.
885	* Otherwise, the scheduler will take care of it.
886	*/
887	if (p_was_empty) {
888	transmit_event(pipe: p, head, tail);
889	}
890	}
891
892	/*
893	* Called when we can transmit packets on WF2Q queues. Take pkts out of
894	* the queues at their start time, and enqueue into the delay line.
895	* Packets are drained until p->numbytes < 0. As long as
896	* len_scaled >= p->numbytes, the packet goes into the delay line
897	* with a deadline p->delay. For the last packet, if p->numbytes<0,
898	* there is an additional delay.
899	*/
900	static void
901	ready_event_wfq(struct dn_pipe p, struct* mbuf head, struct mbuf tail)
902	{
903	int p_was_empty = (p->head == NULL);
904	struct dn_heap *sch = &(p->scheduler_heap);
905	struct dn_heap *neh = &(p->not_eligible_heap);
906	int64_t p_numbytes = p->numbytes;
907
908	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
909
910	if (p->if_name[`0`] == `0`) { / tx clock is simulated /
911	p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
912	} else { / tx clock is for real, the ifq must be empty or this is a NOP /
913	if (p->ifp && !IFCQ_IS_EMPTY(p->ifp->if_snd)) {
914	return;
915	} else {
916	DPRINTF(("dummynet: pipe %d ready from %s --\n",
917	p->pipe_nr, p->if_name));
918	}
919	}
920
921	/*
922	* While we have backlogged traffic AND credit, we need to do
923	* something on the queue.
924	*/
925	while (p_numbytes >= `0` && (sch->elements > `0` \|\| neh->elements > `0`)) {
926	if (sch->elements > `0`) { / have some eligible pkts to send out /
927	struct dn_flow_queue *q = sch->p[`0`].object;
928	struct mbuf *pkt = q->head;
929	struct dn_flow_set *fs = q->fs;
930	u_int32_t len = pkt->m_pkthdr.len;
931	u_int64_t len_scaled = p->bandwidth ? len * `8` * (hz * `10`) : `0`;
932
933	heap_extract(h: sch, NULL); / remove queue from heap /
934	p_numbytes -= len_scaled;
935	move_pkt(pkt, q, p, len);
936
937	p->V += (len << MY_M) / p->sum; / update V /
938	q->S = q->F; / update start time /
939	if (q->len == `0`) { / Flow not backlogged any more /
940	fs->backlogged--;
941	heap_insert(h: &(p->idle_heap), key1: q->F, p: q);
942	} else { / still backlogged /
943	/*
944	* update F and position in backlogged queue, then
945	* put flow in not_eligible_heap (we will fix this later).
946	*/
947	len = (q->head)->m_pkthdr.len;
948	q->F += (len << MY_M) / (u_int64_t) fs->weight;
949	if (DN_KEY_LEQ(q->S, p->V)) {
950	heap_insert(h: neh, key1: q->S, p: q);
951	} else {
952	heap_insert(h: sch, key1: q->F, p: q);
953	}
954	}
955	}
956	/*
957	* now compute V = max(V, min(S_i)). Remember that all elements in sch
958	* have by definition S_i <= V so if sch is not empty, V is surely
959	* the max and we must not update it. Conversely, if sch is empty
960	* we only need to look at neh.
961	*/
962	if (sch->elements == `0` && neh->elements > `0`) {
963	p->V = MAX64( p->V, neh->p[`0`].key );
964	}
965	/ move from neh to sch any packets that have become eligible /
966	while (neh->elements > `0` && DN_KEY_LEQ(neh->p[`0`].key, p->V)) {
967	struct dn_flow_queue *q = neh->p[`0`].object;
968	heap_extract(h: neh, NULL);
969	heap_insert(h: sch, key1: q->F, p: q);
970	}
971
972	if (p->if_name[`0`] != `'\0'`) {/ tx clock is from a real thing /
973	p_numbytes = -`1`; / mark not ready for I/O /
974	break;
975	}
976	}
977	if (sch->elements == `0` && neh->elements == `0` && p_numbytes >= `0`
978	&& p->idle_heap.elements > `0`) {
979	/*
980	* no traffic and no events scheduled. We can get rid of idle-heap.
981	*/
982	int i;
983
984	for (i = `0`; i < p->idle_heap.elements; i++) {
985	struct dn_flow_queue *q = p->idle_heap.p[i].object;
986
987	q->F = `0`;
988	q->S = q->F + `1`;
989	}
990	p->sum = `0`;
991	p->V = `0`;
992	p->idle_heap.elements = `0`;
993	}
994	/*
995	* If we are getting clocks from dummynet (not a real interface) and
996	* If we are under credit, schedule the next ready event.
997	* Also fix the delivery time of the last packet.
998	*/
999	if (p->if_name[`0`] == `0` && p_numbytes < `0`) { / this implies bandwidth >0 /
1000	dn_key t = `0`; / number of ticks i have to wait /
1001
1002	if (p->bandwidth > `0`) {
1003	t = (p->bandwidth - `1` - p_numbytes) / p->bandwidth;
1004	}
1005	dn_tag_get(m: p->tail)->dn_output_time += t;
1006	p->sched_time = curr_time;
1007	heap_insert(h: &wfq_ready_heap, key1: curr_time + t, p: (void *)p);
1008	/ XXX should check errors on heap_insert, and drain the whole*
1009	* queue on error hoping next time we are luckier.
1010	*/
1011	}
1012
1013	/ Fit (adjust if necessary) 64bit result into 32bit variable. /
1014	if (p_numbytes > INT_MAX) {
1015	p->numbytes = INT_MAX;
1016	} else if (p_numbytes < INT_MIN) {
1017	p->numbytes = INT_MIN;
1018	} else {
1019	p->numbytes = (int)p_numbytes;
1020	}
1021
1022	/*
1023	* If the delay line was empty call transmit_event(p) now.
1024	* Otherwise, the scheduler will take care of it.
1025	*/
1026	if (p_was_empty) {
1027	transmit_event(pipe: p, head, tail);
1028	}
1029	}
1030
1031	/*
1032	* This is called every 1ms. It is used to
1033	* increment the current tick counter and schedule expired events.
1034	*/
1035	static void
1036	dummynet(__unused void * unused)
1037	{
1038	void p; /* generic parameter to handler /
1039	struct dn_heap *h;
1040	struct dn_heap *heaps[`3`];
1041	struct mbuf head = NULL, tail = NULL;
1042	int i;
1043	struct dn_pipe *pe;
1044	struct timespec ts;
1045	struct timeval tv;
1046
1047	heaps[`0`] = &ready_heap; / fixed-rate queues /
1048	heaps[`1`] = &wfq_ready_heap; / wfq queues /
1049	heaps[`2`] = &extract_heap; / delay line /
1050
1051	lck_mtx_lock(lck: &dn_mutex);
1052
1053	/ make all time measurements in milliseconds (ms) -*
1054	* here we convert secs and usecs to msecs (just divide the
1055	* usecs and take the closest whole number).
1056	*/
1057	microuptime(tv: &tv);
1058	curr_time = (tv.tv_sec * `1000`) + (tv.tv_usec / `1000`);
1059
1060	for (i = `0`; i < `3`; i++) {
1061	h = heaps[i];
1062	while (h->elements > `0` && DN_KEY_LEQ(h->p[`0`].key, curr_time)) {
1063	if (h->p[`0`].key > curr_time) {
1064	printf("dummynet: warning, heap %d is %d ticks late\n",
1065	i, (int)(curr_time - h->p[`0`].key));
1066	}
1067	p = h->p[`0`].object; / store a copy before heap_extract /
1068	heap_extract(h, NULL); / need to extract before processing /
1069	if (i == `0`) {
1070	ready_event(q: p, head: &head, tail: &tail);
1071	} else if (i == `1`) {
1072	struct dn_pipe *pipe = p;
1073	if (pipe->if_name[`0`] != `'\0'`) {
1074	printf("dummynet: bad ready_event_wfq for pipe %s\n",
1075	pipe->if_name);
1076	} else {
1077	ready_event_wfq(p, head: &head, tail: &tail);
1078	}
1079	} else {
1080	transmit_event(pipe: p, head: &head, tail: &tail);
1081	}
1082	}
1083	}
1084	/ sweep pipes trying to expire idle flow_queues /
1085	for (i = `0`; i < HASHSIZE; i++) {
1086	SLIST_FOREACH(pe, &pipehash[i], next) {
1087	if (pe->idle_heap.elements > `0` &&
1088	DN_KEY_LT(pe->idle_heap.p[`0`].key, pe->V)) {
1089	struct dn_flow_queue *q = pe->idle_heap.p[`0`].object;
1090
1091	heap_extract(h: &(pe->idle_heap), NULL);
1092	q->S = q->F + `1`; / mark timestamp as invalid /
1093	pe->sum -= q->fs->weight;
1094	}
1095	}
1096	}
1097
1098	/ check the heaps to see if there's still stuff in there, and*
1099	* only set the timer if there are packets to process
1100	*/
1101	timer_enabled = `0`;
1102	for (i = `0`; i < `3`; i++) {
1103	h = heaps[i];
1104	if (h->elements > `0`) { // set the timer
1105	ts.tv_sec = `0`;
1106	ts.tv_nsec = `1` * `1000000`; // 1ms
1107	timer_enabled = `1`;
1108	bsd_timeout(dummynet, NULL, ts: &ts);
1109	break;
1110	}
1111	}
1112
1113	if (head != NULL) {
1114	serialize++;
1115	}
1116
1117	lck_mtx_unlock(lck: &dn_mutex);
1118
1119	/ Send out the de-queued list of ready-to-send packets /
1120	if (head != NULL) {
1121	dummynet_send(m: head);
1122	lck_mtx_lock(lck: &dn_mutex);
1123	serialize--;
1124	lck_mtx_unlock(lck: &dn_mutex);
1125	}
1126	}
1127
1128
1129	static void
1130	dummynet_send(struct mbuf *m)
1131	{
1132	struct dn_pkt_tag *pkt;
1133	struct mbuf *n;
1134
1135	for (; m != NULL; m = n) {
1136	n = m->m_nextpkt;
1137	m->m_nextpkt = NULL;
1138	pkt = dn_tag_get(m);
1139
1140	DPRINTF(("dummynet_send m: 0x%llx dn_dir: %d dn_flags: 0x%x\n",
1141	(uint64_t)VM_KERNEL_ADDRPERM(m), pkt->dn_dir,
1142	pkt->dn_flags));
1143
1144	switch (pkt->dn_dir) {
1145	case DN_TO_IP_OUT: {
1146	struct route tmp_rt;
1147
1148	/ route is already in the packet's dn_ro /
1149	bzero(s: &tmp_rt, n: sizeof(tmp_rt));
1150
1151	/ Force IP_RAWOUTPUT as the IP header is fully formed /
1152	pkt->dn_flags \|= IP_RAWOUTPUT \| IP_FORWARDING;
1153	(void)ip_output(m, NULL, &tmp_rt, pkt->dn_flags, NULL, NULL);
1154	ROUTE_RELEASE(&tmp_rt);
1155	break;
1156	}
1157	case DN_TO_IP_IN:
1158	proto_inject(PF_INET, packet: m);
1159	break;
1160	case DN_TO_IP6_OUT: {
1161	/ routes already in the packet's dn_{ro6,pmtu} /
1162	if (pkt->dn_origifp != NULL) {
1163	ip6_output_setsrcifscope(m, pkt->dn_origifp->if_index, NULL);
1164	ip6_output_setdstifscope(m, pkt->dn_origifp->if_index, NULL);
1165	} else {
1166	ip6_output_setsrcifscope(m, IFSCOPE_UNKNOWN, NULL);
1167	ip6_output_setdstifscope(m, IFSCOPE_UNKNOWN, NULL);
1168	}
1169
1170	ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
1171	break;
1172	}
1173	case DN_TO_IP6_IN:
1174	proto_inject(PF_INET6, packet: m);
1175	break;
1176	default:
1177	printf("dummynet: bad switch %d!\n", pkt->dn_dir);
1178	m_freem(m);
1179	break;
1180	}
1181	}
1182	}
1183
1184	/*
1185	* Unconditionally expire empty queues in case of shortage.
1186	* Returns the number of queues freed.
1187	*/
1188	static int
1189	expire_queues(struct dn_flow_set *fs)
1190	{
1191	struct dn_flow_queue q, prev;
1192	int i, initial_elements = fs->rq_elements;
1193	struct timeval timenow;
1194
1195	/ reviewed for getmicrotime usage /
1196	getmicrotime(&timenow);
1197
1198	if (fs->last_expired == timenow.tv_sec) {
1199	return `0`;
1200	}
1201	fs->last_expired = (int)timenow.tv_sec;
1202	for (i = `0`; i <= fs->rq_size; i++) { / last one is overflow /
1203	for (prev = NULL, q = fs->rq[i]; q != NULL;) {
1204	if (q->head != NULL \|\| q->S != q->F + `1`) {
1205	prev = q;
1206	q = q->next;
1207	} else { / entry is idle, expire it /
1208	struct dn_flow_queue *old_q = q;
1209
1210	if (prev != NULL) {
1211	prev->next = q = q->next;
1212	} else {
1213	fs->rq[i] = q = q->next;
1214	}
1215	fs->rq_elements--;
1216	kfree_type(struct dn_flow_queue, old_q);
1217	}
1218	}
1219	}
1220	return initial_elements - fs->rq_elements;
1221	}
1222
1223	/*
1224	* If room, create a new queue and put at head of slot i;
1225	* otherwise, create or use the default queue.
1226	*/
1227	static struct dn_flow_queue *
1228	create_queue(struct dn_flow_set fs, int* i)
1229	{
1230	struct dn_flow_queue *q;
1231
1232	if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
1233	expire_queues(fs) == `0`) {
1234	/*
1235	* No way to get room, use or create overflow queue.
1236	*/
1237	i = fs->rq_size;
1238	if (fs->rq[i] != NULL) {
1239	return fs->rq[i];
1240	}
1241	}
1242	q = kalloc_type(struct dn_flow_queue, Z_NOWAIT \| Z_ZERO);
1243	if (q == NULL) {
1244	printf("dummynet: sorry, cannot allocate queue for new flow\n");
1245	return NULL;
1246	}
1247	q->fs = fs;
1248	q->hash_slot = i;
1249	q->next = fs->rq[i];
1250	q->S = q->F + `1`; / hack - mark timestamp as invalid /
1251	fs->rq[i] = q;
1252	fs->rq_elements++;
1253	return q;
1254	}
1255
1256	/*
1257	* Given a flow_set and a pkt in last_pkt, find a matching queue
1258	* after appropriate masking. The queue is moved to front
1259	* so that further searches take less time.
1260	*/
1261	static struct dn_flow_queue *
1262	find_queue(struct dn_flow_set fs, struct* ip_flow_id *id)
1263	{
1264	int i = `0`; / we need i and q for new allocations /
1265	struct dn_flow_queue q, prev;
1266	int is_v6 = IS_IP6_FLOW_ID(id);
1267
1268	if (!(fs->flags_fs & DN_HAVE_FLOW_MASK)) {
1269	q = fs->rq[`0`];
1270	} else {
1271	/ first, do the masking, then hash /
1272	id->dst_port &= fs->flow_mask.dst_port;
1273	id->src_port &= fs->flow_mask.src_port;
1274	id->proto &= fs->flow_mask.proto;
1275	id->flags = `0`; / we don't care about this one /
1276	if (is_v6) {
1277	APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
1278	APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
1279	id->flow_id6 &= fs->flow_mask.flow_id6;
1280
1281	i = ((id->dst_ip6.__u6_addr.__u6_addr32[`0`]) & `0xffff`) ^
1282	((id->dst_ip6.__u6_addr.__u6_addr32[`1`]) & `0xffff`) ^
1283	((id->dst_ip6.__u6_addr.__u6_addr32[`2`]) & `0xffff`) ^
1284	((id->dst_ip6.__u6_addr.__u6_addr32[`3`]) & `0xffff`) ^
1285
1286	((id->dst_ip6.__u6_addr.__u6_addr32[`0`] >> `15`) & `0xffff`) ^
1287	((id->dst_ip6.__u6_addr.__u6_addr32[`1`] >> `15`) & `0xffff`) ^
1288	((id->dst_ip6.__u6_addr.__u6_addr32[`2`] >> `15`) & `0xffff`) ^
1289	((id->dst_ip6.__u6_addr.__u6_addr32[`3`] >> `15`) & `0xffff`) ^
1290
1291	((id->src_ip6.__u6_addr.__u6_addr32[`0`] << `1`) & `0xfffff`) ^
1292	((id->src_ip6.__u6_addr.__u6_addr32[`1`] << `1`) & `0xfffff`) ^
1293	((id->src_ip6.__u6_addr.__u6_addr32[`2`] << `1`) & `0xfffff`) ^
1294	((id->src_ip6.__u6_addr.__u6_addr32[`3`] << `1`) & `0xfffff`) ^
1295
1296	((id->src_ip6.__u6_addr.__u6_addr32[`0`] >> `16`) & `0xffff`) ^
1297	((id->src_ip6.__u6_addr.__u6_addr32[`1`] >> `16`) & `0xffff`) ^
1298	((id->src_ip6.__u6_addr.__u6_addr32[`2`] >> `16`) & `0xffff`) ^
1299	((id->src_ip6.__u6_addr.__u6_addr32[`3`] >> `16`) & `0xffff`) ^
1300
1301	(id->dst_port << `1`) ^ (id->src_port) ^
1302	(id->proto) ^
1303	(id->flow_id6);
1304	} else {
1305	id->dst_ip &= fs->flow_mask.dst_ip;
1306	id->src_ip &= fs->flow_mask.src_ip;
1307
1308	i = ((id->dst_ip) & `0xffff`) ^
1309	((id->dst_ip >> `15`) & `0xffff`) ^
1310	((id->src_ip << `1`) & `0xffff`) ^
1311	((id->src_ip >> `16`) & `0xffff`) ^
1312	(id->dst_port << `1`) ^ (id->src_port) ^
1313	(id->proto);
1314	}
1315	i = i % fs->rq_size;
1316	/ finally, scan the current list for a match /
1317	searches++;
1318	for (prev = NULL, q = fs->rq[i]; q;) {
1319	search_steps++;
1320	if (is_v6 &&
1321	IN6_ARE_ADDR_EQUAL(&id->dst_ip6, &q->id.dst_ip6) &&
1322	IN6_ARE_ADDR_EQUAL(&id->src_ip6, &q->id.src_ip6) &&
1323	id->dst_port == q->id.dst_port &&
1324	id->src_port == q->id.src_port &&
1325	id->proto == q->id.proto &&
1326	id->flags == q->id.flags &&
1327	id->flow_id6 == q->id.flow_id6) {
1328	break; / found /
1329	}
1330	if (!is_v6 && id->dst_ip == q->id.dst_ip &&
1331	id->src_ip == q->id.src_ip &&
1332	id->dst_port == q->id.dst_port &&
1333	id->src_port == q->id.src_port &&
1334	id->proto == q->id.proto &&
1335	id->flags == q->id.flags) {
1336	break; / found /
1337	}
1338	/ No match. Check if we can expire the entry /
1339	if (pipe_expire && q->head == NULL && q->S == q->F + `1`) {
1340	/ entry is idle and not in any heap, expire it /
1341	struct dn_flow_queue *old_q = q;
1342
1343	if (prev != NULL) {
1344	prev->next = q = q->next;
1345	} else {
1346	fs->rq[i] = q = q->next;
1347	}
1348	fs->rq_elements--;
1349	kfree_type(struct dn_flow_queue, old_q);
1350	continue;
1351	}
1352	prev = q;
1353	q = q->next;
1354	}
1355	if (q && prev != NULL) { / found and not in front /
1356	prev->next = q->next;
1357	q->next = fs->rq[i];
1358	fs->rq[i] = q;
1359	}
1360	}
1361	if (q == NULL) { / no match, need to allocate a new entry /
1362	q = create_queue(fs, i);
1363	if (q != NULL) {
1364	q->id = *id;
1365	}
1366	}
1367	return q;
1368	}
1369
1370	static int
1371	red_drops(struct dn_flow_set fs, struct* dn_flow_queue q, int* len)
1372	{
1373	/*
1374	* RED algorithm
1375	*
1376	* RED calculates the average queue size (avg) using a low-pass filter
1377	* with an exponential weighted (w_q) moving average:
1378	* avg <- (1-w_q) * avg + w_q * q_size
1379	* where q_size is the queue length (measured in bytes or * packets).
1380	*
1381	* If q_size == 0, we compute the idle time for the link, and set
1382	* avg = (1 - w_q)^(idle/s)
1383	* where s is the time needed for transmitting a medium-sized packet.
1384	*
1385	* Now, if avg < min_th the packet is enqueued.
1386	* If avg > max_th the packet is dropped. Otherwise, the packet is
1387	* dropped with probability P function of avg.
1388	*
1389	*/
1390
1391	int64_t p_b = `0`;
1392	/ queue in bytes or packets ? /
1393	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
1394
1395	DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
1396
1397	/ average queue size estimation /
1398	if (q_size != `0`) {
1399	/*
1400	* queue is not empty, avg <- avg + (q_size - avg) * w_q
1401	*/
1402	int diff = SCALE(q_size) - q->avg;
1403	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
1404
1405	q->avg += (int) v;
1406	} else {
1407	/*
1408	* queue is empty, find for how long the queue has been
1409	* empty and use a lookup table for computing
1410	* (1 - * w_q)^(idle_time/s) where s is the time to send a
1411	* (small) packet.
1412	* XXX check wraps...
1413	*/
1414	if (q->avg) {
1415	u_int64_t t = (curr_time - q->q_time) / fs->lookup_step;
1416
1417	q->avg = (t < fs->lookup_depth) ?
1418	SCALE_MUL(q->avg, fs->w_q_lookup[t]) : `0`;
1419	}
1420	}
1421	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1422
1423	/ should i drop ? /
1424
1425	if (q->avg < fs->min_th) {
1426	q->count = -`1`;
1427	return `0`; / accept packet ; /
1428	}
1429	if (q->avg >= fs->max_th) { / average queue >= max threshold /
1430	if (fs->flags_fs & DN_IS_GENTLE_RED) {
1431	/*
1432	* According to Gentle-RED, if avg is greater than max_th the
1433	* packet is dropped with a probability
1434	* p_b = c_3 * avg - c_4
1435	* where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1436	*/
1437	p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1438	} else {
1439	q->count = -`1`;
1440	DPRINTF(("dummynet: - drop"));
1441	return `1`;
1442	}
1443	} else if (q->avg > fs->min_th) {
1444	/*
1445	* we compute p_b using the linear dropping function p_b = c_1 *
1446	* avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1447	* max_p * min_th / (max_th - min_th)
1448	*/
1449	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1450	}
1451	if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
1452	p_b = (p_b * len) / fs->max_pkt_size;
1453	}
1454	if (++q->count == `0`) {
1455	q->random = (my_random() & `0xffff`);
1456	} else {
1457	/*
1458	* q->count counts packets arrived since last drop, so a greater
1459	* value of q->count means a greater packet drop probability.
1460	*/
1461	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1462	q->count = `0`;
1463	DPRINTF(("dummynet: - red drop"));
1464	/ after a drop we calculate a new random value /
1465	q->random = (my_random() & `0xffff`);
1466	return `1`; / drop /
1467	}
1468	}
1469	/ end of RED algorithm /
1470	return `0`; / accept /
1471	}
1472
1473	static __inline
1474	struct dn_flow_set *
1475	locate_flowset(int fs_nr)
1476	{
1477	struct dn_flow_set *fs;
1478	SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next) {
1479	if (fs->fs_nr == fs_nr) {
1480	return fs;
1481	}
1482	}
1483
1484	return NULL;
1485	}
1486
1487	static __inline struct dn_pipe *
1488	locate_pipe(int pipe_nr)
1489	{
1490	struct dn_pipe *pipe;
1491
1492	SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next) {
1493	if (pipe->pipe_nr == pipe_nr) {
1494	return pipe;
1495	}
1496	}
1497
1498	return NULL;
1499	}
1500
1501
1502
1503	/*
1504	* dummynet hook for packets. Below 'pipe' is a pipe or a queue
1505	* depending on whether WF2Q or fixed bw is used.
1506	*
1507	* pipe_nr pipe or queue the packet is destined for.
1508	* dir where shall we send the packet after dummynet.
1509	* m the mbuf with the packet
1510	* ifp the 'ifp' parameter from the caller.
1511	* NULL in ip_input, destination interface in ip_output,
1512	* real_dst in bdg_forward
1513	* ro route parameter (only used in ip_output, NULL otherwise)
1514	* dst destination address, only used by ip_output
1515	* rule matching rule, in case of multiple passes
1516	* flags flags from the caller, only used in ip_output
1517	*
1518	*/
1519	static int
1520	dummynet_io(struct mbuf m, int* pipe_nr, int dir, struct ip_fw_args *fwa)
1521	{
1522	struct mbuf head = NULL, tail = NULL;
1523	struct dn_pkt_tag *pkt;
1524	struct m_tag *mtag;
1525	struct dn_flow_set *fs = NULL;
1526	struct dn_pipe *pipe;
1527	u_int32_t len = m->m_pkthdr.len;
1528	struct dn_flow_queue *q = NULL;
1529	int is_pipe = `0`;
1530	struct timespec ts;
1531	struct timeval tv;
1532
1533	DPRINTF(("dummynet_io m: 0x%llx pipe: %d dir: %d\n",
1534	(uint64_t)VM_KERNEL_ADDRPERM(m), pipe_nr, dir));
1535
1536
1537	#if DUMMYNET
1538	is_pipe = fwa->fwa_flags == DN_IS_PIPE ? `1` : `0`;
1539	#endif /* DUMMYNET */
1540
1541	pipe_nr &= `0xffff`;
1542
1543	lck_mtx_lock(lck: &dn_mutex);
1544
1545	/ make all time measurements in milliseconds (ms) -*
1546	* here we convert secs and usecs to msecs (just divide the
1547	* usecs and take the closest whole number).
1548	*/
1549	microuptime(tv: &tv);
1550	curr_time = (tv.tv_sec * `1000`) + (tv.tv_usec / `1000`);
1551
1552	/*
1553	* This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1554	*/
1555	if (is_pipe) {
1556	pipe = locate_pipe(pipe_nr);
1557	if (pipe != NULL) {
1558	fs = &(pipe->fs);
1559	}
1560	} else {
1561	fs = locate_flowset(fs_nr: pipe_nr);
1562	}
1563
1564
1565	if (fs == NULL) {
1566	goto dropit; / this queue/pipe does not exist! /
1567	}
1568	pipe = fs->pipe;
1569	if (pipe == NULL) { / must be a queue, try find a matching pipe /
1570	pipe = locate_pipe(pipe_nr: fs->parent_nr);
1571
1572	if (pipe != NULL) {
1573	fs->pipe = pipe;
1574	} else {
1575	printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1576	fs->parent_nr, fs->fs_nr);
1577	goto dropit;
1578	}
1579	}
1580	q = find_queue(fs, id: &(fwa->fwa_id));
1581	if (q == NULL) {
1582	goto dropit; / cannot allocate queue /
1583	}
1584	/*
1585	* update statistics, then check reasons to drop pkt
1586	*/
1587	q->tot_bytes += len;
1588	q->tot_pkts++;
1589	if (fs->plr && (my_random() < fs->plr)) {
1590	goto dropit; / random pkt drop /
1591	}
1592	if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
1593	if (q->len_bytes > fs->qsize) {
1594	goto dropit; / queue size overflow /
1595	}
1596	} else {
1597	if (q->len >= fs->qsize) {
1598	goto dropit; / queue count overflow /
1599	}
1600	}
1601	if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len)) {
1602	goto dropit;
1603	}
1604
1605	/ XXX expensive to zero, see if we can remove it/
1606	mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1607	sizeof(struct dn_pkt_tag), M_NOWAIT, m);
1608	if (mtag == NULL) {
1609	goto dropit; / cannot allocate packet header /
1610	}
1611	m_tag_prepend(m, mtag); / attach to mbuf chain /
1612
1613	pkt = (struct dn_pkt_tag *)(mtag->m_tag_data);
1614	bzero(s: pkt, n: sizeof(struct dn_pkt_tag));
1615	/ ok, i can handle the pkt now... /
1616	/ build and enqueue packet + parameters /
1617	pkt->dn_pf_rule = fwa->fwa_pf_rule;
1618	pkt->dn_dir = dir;
1619
1620	pkt->dn_ifp = fwa->fwa_oif;
1621	if (dir == DN_TO_IP_OUT) {
1622	/*
1623	* We need to copy *ro because for ICMP pkts (and maybe others)
1624	* the caller passed a pointer into the stack; dst might also be
1625	* a pointer into *ro so it needs to be updated.
1626	*/
1627	if (fwa->fwa_ro) {
1628	route_copyout(&pkt->dn_ro, fwa->fwa_ro, sizeof(pkt->dn_ro));
1629	}
1630	if (fwa->fwa_dst) {
1631	if (fwa->fwa_dst == SIN(&fwa->fwa_ro->ro_dst)) { / dst points into ro /
1632	fwa->fwa_dst = SIN(&(pkt->dn_ro.ro_dst));
1633	}
1634
1635	SOCKADDR_COPY(fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst));
1636	}
1637	} else if (dir == DN_TO_IP6_OUT) {
1638	if (fwa->fwa_ro6) {
1639	route_copyout((struct route *)&pkt->dn_ro6,
1640	(struct route )fwa->fwa_ro6, sizeof*(pkt->dn_ro6));
1641	}
1642	if (fwa->fwa_ro6_pmtu) {
1643	route_copyout((struct route *)&pkt->dn_ro6_pmtu,
1644	(struct route )fwa->fwa_ro6_pmtu, sizeof*(pkt->dn_ro6_pmtu));
1645	}
1646	if (fwa->fwa_dst6) {
1647	if (fwa->fwa_dst6 == SIN6(&fwa->fwa_ro6->ro_dst)) { / dst points into ro /
1648	fwa->fwa_dst6 = SIN6(&(pkt->dn_ro6.ro_dst));
1649	}
1650
1651	SOCKADDR_COPY(fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6));
1652	}
1653	pkt->dn_origifp = fwa->fwa_origifp;
1654	pkt->dn_mtu = fwa->fwa_mtu;
1655	pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen;
1656	if (fwa->fwa_exthdrs) {
1657	bcopy(src: fwa->fwa_exthdrs, dst: &pkt->dn_exthdrs, n: sizeof(pkt->dn_exthdrs));
1658	/*
1659	* Need to zero out the source structure so the mbufs
1660	* won't be freed by ip6_output()
1661	*/
1662	bzero(s: fwa->fwa_exthdrs, n: sizeof(struct ip6_exthdrs));
1663	}
1664	}
1665	if (dir == DN_TO_IP_OUT \|\| dir == DN_TO_IP6_OUT) {
1666	pkt->dn_flags = fwa->fwa_oflags;
1667	if (fwa->fwa_ipoa != NULL) {
1668	pkt->dn_ipoa = *(fwa->fwa_ipoa);
1669	}
1670	}
1671	if (q->head == NULL) {
1672	q->head = m;
1673	} else {
1674	q->tail->m_nextpkt = m;
1675	}
1676	q->tail = m;
1677	q->len++;
1678	q->len_bytes += len;
1679
1680	if (q->head != m) { / flow was not idle, we are done /
1681	goto done;
1682	}
1683	/*
1684	* If we reach this point the flow was previously idle, so we need
1685	* to schedule it. This involves different actions for fixed-rate or
1686	* WF2Q queues.
1687	*/
1688	if (is_pipe) {
1689	/*
1690	* Fixed-rate queue: just insert into the ready_heap.
1691	*/
1692	dn_key t = `0`;
1693	if (pipe->bandwidth) {
1694	t = SET_TICKS(m, q, pipe);
1695	}
1696	q->sched_time = curr_time;
1697	if (t == `0`) { / must process it now /
1698	ready_event( q, head: &head, tail: &tail );
1699	} else {
1700	heap_insert(h: &ready_heap, key1: curr_time + t, p: q );
1701	}
1702	} else {
1703	/*
1704	* WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1705	* set S to the virtual time V for the controlling pipe, and update
1706	* the sum of weights for the pipe; otherwise, remove flow from
1707	* idle_heap and set S to max(F,V).
1708	* Second, compute finish time F = S + len/weight.
1709	* Third, if pipe was idle, update V=max(S, V).
1710	* Fourth, count one more backlogged flow.
1711	*/
1712	if (DN_KEY_GT(q->S, q->F)) { / means timestamps are invalid /
1713	q->S = pipe->V;
1714	pipe->sum += fs->weight; / add weight of new queue /
1715	} else {
1716	heap_extract(h: &(pipe->idle_heap), obj: q);
1717	q->S = MAX64(q->F, pipe->V );
1718	}
1719	q->F = q->S + (len << MY_M) / (u_int64_t) fs->weight;
1720
1721	if (pipe->not_eligible_heap.elements == `0` &&
1722	pipe->scheduler_heap.elements == `0`) {
1723	pipe->V = MAX64( q->S, pipe->V );
1724	}
1725	fs->backlogged++;
1726	/*
1727	* Look at eligibility. A flow is not eligibile if S>V (when
1728	* this happens, it means that there is some other flow already
1729	* scheduled for the same pipe, so the scheduler_heap cannot be
1730	* empty). If the flow is not eligible we just store it in the
1731	* not_eligible_heap. Otherwise, we store in the scheduler_heap
1732	* and possibly invoke ready_event_wfq() right now if there is
1733	* leftover credit.
1734	* Note that for all flows in scheduler_heap (SCH), S_i <= V,
1735	* and for all flows in not_eligible_heap (NEH), S_i > V .
1736	* So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH,
1737	* we only need to look into NEH.
1738	*/
1739	if (DN_KEY_GT(q->S, pipe->V)) { / not eligible /
1740	if (pipe->scheduler_heap.elements == `0`) {
1741	printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1742	}
1743	heap_insert(h: &(pipe->not_eligible_heap), key1: q->S, p: q);
1744	} else {
1745	heap_insert(h: &(pipe->scheduler_heap), key1: q->F, p: q);
1746	if (pipe->numbytes >= `0`) { / pipe is idle /
1747	if (pipe->scheduler_heap.elements != `1`) {
1748	printf("dummynet: OUCH! pipe should have been idle!\n");
1749	}
1750	DPRINTF(("dummynet: waking up pipe %d at %d\n",
1751	pipe->pipe_nr, (int)(q->F >> MY_M)));
1752	pipe->sched_time = curr_time;
1753	ready_event_wfq(p: pipe, head: &head, tail: &tail);
1754	}
1755	}
1756	}
1757	done:
1758	/ start the timer and set global if not already set /
1759	if (!timer_enabled) {
1760	ts.tv_sec = `0`;
1761	ts.tv_nsec = `1` * `1000000`; // 1ms
1762	timer_enabled = `1`;
1763	bsd_timeout(dummynet, NULL, ts: &ts);
1764	}
1765
1766	lck_mtx_unlock(lck: &dn_mutex);
1767
1768	if (head != NULL) {
1769	dummynet_send(m: head);
1770	}
1771
1772	return `0`;
1773
1774	dropit:
1775	if (q) {
1776	q->drops++;
1777	}
1778	lck_mtx_unlock(lck: &dn_mutex);
1779	m_freem(m);
1780	return (fs && (fs->flags_fs & DN_NOERROR)) ? `0` : ENOBUFS;
1781	}
1782
/*
 * Free a packet that may carry a dummynet tag: release the route held in
 * the tag, delete the tag, then free the mbuf.
 *
 * The argument is evaluated exactly once and is referenced only through
 * the macro parameter, so the macro no longer depends on the caller's
 * variable being named 'm'.  The tag is deleted only when one was found.
 *
 * Below, the ROUTE_RELEASE is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
 * Doing this would probably save us the initial bzero of dn_pkt
 */
#define DN_FREE_PKT(_m) do {                                            \
	struct mbuf *_dn_m = (_m);                                      \
	struct m_tag *_dn_tag = m_tag_locate(_dn_m,                     \
	    KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET);            \
	if (_dn_tag != NULL) {                                          \
	        struct dn_pkt_tag *_dn_pkt =                            \
	            (struct dn_pkt_tag *)(_dn_tag->m_tag_data);         \
	        ROUTE_RELEASE(&_dn_pkt->dn_ro);                         \
	        m_tag_delete(_dn_m, _dn_tag);                           \
	}                                                               \
	m_freem(_dn_m);                                                 \
} while (0)
1796
1797	/*
1798	* Dispose all packets and flow_queues on a flow_set.
1799	* If all=1, also remove red lookup table and other storage,
1800	* including the descriptor itself.
1801	* For the one in dn_pipe MUST also cleanup ready_heap...
1802	*/
1803	static void
1804	purge_flow_set(struct dn_flow_set fs, int* all)
1805	{
1806	struct dn_flow_queue q, qn;
1807	int i;
1808
1809	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
1810
1811	for (i = `0`; i <= fs->rq_size; i++) {
1812	for (q = fs->rq[i]; q; q = qn) {
1813	struct mbuf m, mnext;
1814
1815	mnext = q->head;
1816	while ((m = mnext) != NULL) {
1817	mnext = m->m_nextpkt;
1818	DN_FREE_PKT(m);
1819	}
1820	qn = q->next;
1821	kfree_type(struct dn_flow_queue, q);
1822	}
1823	fs->rq[i] = NULL;
1824	}
1825	fs->rq_elements = `0`;
1826	if (all) {
1827	/ RED - free lookup table /
1828	if (fs->w_q_lookup) {
1829	kfree_data(fs->w_q_lookup, fs->lookup_depth * sizeof(int));
1830	}
1831	kfree_type(struct dn_flow_queue *, fs->rq_size + `1`, fs->rq);
1832	/ if this fs is not part of a pipe, free it /
1833	if (fs->pipe && fs != &(fs->pipe->fs)) {
1834	kfree_type(struct dn_flow_set, fs);
1835	}
1836	}
1837	}
1838
1839	/*
1840	* Dispose all packets queued on a pipe (not a flow_set).
1841	* Also free all resources associated to a pipe, which is about
1842	* to be deleted.
1843	*/
1844	static void
1845	purge_pipe(struct dn_pipe *pipe)
1846	{
1847	struct mbuf m, mnext;
1848
1849	purge_flow_set( fs: &(pipe->fs), all: `1` );
1850
1851	mnext = pipe->head;
1852	while ((m = mnext) != NULL) {
1853	mnext = m->m_nextpkt;
1854	DN_FREE_PKT(m);
1855	}
1856
1857	heap_free( h: &(pipe->scheduler_heap));
1858	heap_free( h: &(pipe->not_eligible_heap));
1859	heap_free( h: &(pipe->idle_heap));
1860	}
1861
1862	/*
1863	* Delete all pipes and heaps returning memory.
1864	*/
1865	static void
1866	dummynet_flush(void)
1867	{
1868	struct dn_pipe pipe, pipe1;
1869	struct dn_flow_set fs, fs1;
1870	int i;
1871
1872	lck_mtx_lock(lck: &dn_mutex);
1873
1874
1875	/ Free heaps so we don't have unwanted events. /
1876	heap_free(h: &ready_heap);
1877	heap_free(h: &wfq_ready_heap);
1878	heap_free(h: &extract_heap);
1879
1880	/*
1881	* Now purge all queued pkts and delete all pipes.
1882	*
1883	* XXXGL: can we merge the for(;;) cycles into one or not?
1884	*/
1885	for (i = `0`; i < HASHSIZE; i++) {
1886	SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
1887	SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
1888	purge_flow_set(fs, all: `1`);
1889	}
1890	}
1891	for (i = `0`; i < HASHSIZE; i++) {
1892	SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
1893	SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
1894	purge_pipe(pipe);
1895	kfree_type(struct dn_pipe, pipe);
1896	}
1897	}
1898	lck_mtx_unlock(lck: &dn_mutex);
1899	}
1900
1901	/*
1902	* setup RED parameters
1903	*/
1904	static int
1905	config_red(struct dn_flow_set p, struct* dn_flow_set * x)
1906	{
1907	int i;
1908
1909	x->w_q = p->w_q;
1910	x->min_th = SCALE(p->min_th);
1911	x->max_th = SCALE(p->max_th);
1912	x->max_p = p->max_p;
1913
1914	x->c_1 = p->max_p / (p->max_th - p->min_th);
1915	x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1916	if (x->flags_fs & DN_IS_GENTLE_RED) {
1917	x->c_3 = (SCALE(`1`) - p->max_p) / p->max_th;
1918	x->c_4 = (SCALE(`1`) - `2` * p->max_p);
1919	}
1920
1921	/ if the lookup table already exist, free and create it again /
1922	if (x->w_q_lookup) {
1923	kfree_data(x->w_q_lookup, x->lookup_depth * sizeof(int));
1924	x->w_q_lookup = NULL;
1925	}
1926	if (red_lookup_depth == `0`) {
1927	printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1928	return EINVAL;
1929	}
1930	x->lookup_depth = red_lookup_depth;
1931	x->w_q_lookup = (u_int ) kalloc_data(x->lookup_depth sizeof(int),
1932	Z_NOWAIT);
1933	if (x->w_q_lookup == NULL) {
1934	printf("dummynet: sorry, cannot allocate red lookup table\n");
1935	return ENOSPC;
1936	}
1937
1938	/ fill the lookup table with (1 - w_q)^x /
1939	x->lookup_step = p->lookup_step;
1940	x->lookup_weight = p->lookup_weight;
1941	x->w_q_lookup[`0`] = SCALE(`1`) - x->w_q;
1942	for (i = `1`; i < x->lookup_depth; i++) {
1943	x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - `1`], x->lookup_weight);
1944	}
1945	if (red_avg_pkt_size < `1`) {
1946	red_avg_pkt_size = `512`;
1947	}
1948	x->avg_pkt_size = red_avg_pkt_size;
1949	if (red_max_pkt_size < `1`) {
1950	red_max_pkt_size = `1500`;
1951	}
1952	x->max_pkt_size = red_max_pkt_size;
1953	return `0`;
1954	}
1955
1956	static int
1957	alloc_hash(struct dn_flow_set x, struct* dn_flow_set *pfs)
1958	{
1959	if (x->flags_fs & DN_HAVE_FLOW_MASK) { / allocate some slots /
1960	int l = pfs->rq_size;
1961
1962	if (l == `0`) {
1963	l = dn_hash_size;
1964	}
1965	if (l < `4`) {
1966	l = `4`;
1967	} else if (l > DN_MAX_HASH_SIZE) {
1968	l = DN_MAX_HASH_SIZE;
1969	}
1970	x->rq_size = l;
1971	} else { / one is enough for null mask /
1972	x->rq_size = `1`;
1973	}
1974	x->rq = kalloc_type(struct dn_flow_queue *, x->rq_size + `1`,
1975	Z_NOWAIT \| Z_ZERO);
1976	if (x->rq == NULL) {
1977	printf("dummynet: sorry, cannot allocate queue\n");
1978	return ENOSPC;
1979	}
1980	x->rq_elements = `0`;
1981	return `0`;
1982	}
1983
1984	static int
1985	set_fs_parms(struct dn_flow_set x, struct* dn_flow_set *src)
1986	{
1987	x->flags_fs = src->flags_fs;
1988	x->qsize = src->qsize;
1989	x->plr = src->plr;
1990	x->flow_mask = src->flow_mask;
1991	if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1992	if (x->qsize > `1024` * `1024`) {
1993	x->qsize = `1024` * `1024`;
1994	}
1995	} else {
1996	if (x->qsize == `0`) {
1997	x->qsize = `50`;
1998	}
1999	if (x->qsize > `100`) {
2000	x->qsize = `50`;
2001	}
2002	}
2003	/ configuring RED /
2004	if (x->flags_fs & DN_IS_RED) {
2005	return config_red(p: src, x); / XXX should check errors /
2006	}
2007	return `0`;
2008	}
2009
2010	/*
2011	* setup pipe or queue parameters.
2012	*/
2013	static int
2014	config_pipe(struct dn_pipe *p)
2015	{
2016	int i, r;
2017	struct dn_flow_set *pfs = &(p->fs);
2018	struct dn_flow_queue *q;
2019	bool is_new = false;
2020
2021	/*
2022	* The config program passes parameters as follows:
2023	* bw = bits/second (0 means no limits),
2024	* delay = ms, must be translated into ticks.
2025	* qsize = slots/bytes
2026	*/
2027	p->delay = (p->delay * (hz * `10`)) / `1000`;
2028	/ We need either a pipe number or a flow_set number /
2029	if (p->pipe_nr == `0` && pfs->fs_nr == `0`) {
2030	return EINVAL;
2031	}
2032	if (p->pipe_nr != `0` && pfs->fs_nr != `0`) {
2033	return EINVAL;
2034	}
2035	if (p->pipe_nr != `0`) { / this is a pipe /
2036	struct dn_pipe x, b;
2037	struct dummynet_event dn_event;
2038	lck_mtx_lock(lck: &dn_mutex);
2039
2040	/ locate pipe /
2041	b = locate_pipe(pipe_nr: p->pipe_nr);
2042
2043	if (b == NULL \|\| b->pipe_nr != p->pipe_nr) { / new pipe /
2044	is_new = true;
2045	x = kalloc_type(struct dn_pipe, Z_NOWAIT \| Z_ZERO);
2046	if (x == NULL) {
2047	lck_mtx_unlock(lck: &dn_mutex);
2048	printf("dummynet: no memory for new pipe\n");
2049	return ENOSPC;
2050	}
2051	x->pipe_nr = p->pipe_nr;
2052	x->fs.pipe = x;
2053	/ idle_heap is the only one from which we extract from the middle.*
2054	*/
2055	x->idle_heap.size = x->idle_heap.elements = `0`;
2056	x->idle_heap.offset = offsetof(struct dn_flow_queue, heap_pos);
2057	} else {
2058	x = b;
2059	/ Flush accumulated credit for all queues /
2060	for (i = `0`; i <= x->fs.rq_size; i++) {
2061	for (q = x->fs.rq[i]; q; q = q->next) {
2062	q->numbytes = `0`;
2063	}
2064	}
2065	}
2066
2067	x->bandwidth = p->bandwidth;
2068	x->numbytes = `0`; / just in case... /
2069	bcopy(src: p->if_name, dst: x->if_name, n: sizeof(p->if_name));
2070	x->ifp = NULL; / reset interface ptr /
2071	x->delay = p->delay;
2072	r = set_fs_parms(x: &(x->fs), src: pfs);
2073	if (r != `0`) {
2074	lck_mtx_unlock(lck: &dn_mutex);
2075	if (is_new) { / a new pipe /
2076	kfree_type(struct dn_pipe, x);
2077	}
2078	return r;
2079	}
2080
2081	if (x->fs.rq == NULL) { / a new pipe /
2082	r = alloc_hash(x: &(x->fs), pfs);
2083	if (r) {
2084	lck_mtx_unlock(lck: &dn_mutex);
2085	if (is_new) {
2086	kfree_type(struct dn_pipe, x);
2087	}
2088	return r;
2089	}
2090	SLIST_INSERT_HEAD(&pipehash[HASH(x->pipe_nr)],
2091	x, next);
2092	}
2093	lck_mtx_unlock(lck: &dn_mutex);
2094
2095	bzero(s: &dn_event, n: sizeof(dn_event));
2096	dn_event.dn_event_code = DUMMYNET_PIPE_CONFIG;
2097	dn_event.dn_event_pipe_config.bandwidth = p->bandwidth;
2098	dn_event.dn_event_pipe_config.delay = p->delay;
2099	dn_event.dn_event_pipe_config.plr = pfs->plr;
2100
2101	dummynet_event_enqueue_nwk_wq_entry(&dn_event);
2102	} else { / config queue /
2103	struct dn_flow_set x, b;
2104
2105	lck_mtx_lock(lck: &dn_mutex);
2106	/ locate flow_set /
2107	b = locate_flowset(fs_nr: pfs->fs_nr);
2108
2109	if (b == NULL \|\| b->fs_nr != pfs->fs_nr) { / new /
2110	is_new = true;
2111	if (pfs->parent_nr == `0`) { / need link to a pipe /
2112	lck_mtx_unlock(lck: &dn_mutex);
2113	return EINVAL;
2114	}
2115	x = kalloc_type(struct dn_flow_set, Z_NOWAIT \| Z_ZERO);
2116	if (x == NULL) {
2117	lck_mtx_unlock(lck: &dn_mutex);
2118	printf("dummynet: no memory for new flow_set\n");
2119	return ENOSPC;
2120	}
2121	x->fs_nr = pfs->fs_nr;
2122	x->parent_nr = pfs->parent_nr;
2123	x->weight = pfs->weight;
2124	if (x->weight == `0`) {
2125	x->weight = `1`;
2126	} else if (x->weight > `100`) {
2127	x->weight = `100`;
2128	}
2129	} else {
2130	/ Change parent pipe not allowed; must delete and recreate /
2131	if (pfs->parent_nr != `0` && b->parent_nr != pfs->parent_nr) {
2132	lck_mtx_unlock(lck: &dn_mutex);
2133	return EINVAL;
2134	}
2135	x = b;
2136	}
2137	r = set_fs_parms(x, src: pfs);
2138	if (r != `0`) {
2139	lck_mtx_unlock(lck: &dn_mutex);
2140	printf("dummynet: no memory for new flow_set\n");
2141	if (is_new) {
2142	kfree_type(struct dn_flow_set, x);
2143	}
2144	return r;
2145	}
2146
2147	if (x->rq == NULL) { / a new flow_set /
2148	r = alloc_hash(x, pfs);
2149	if (r) {
2150	lck_mtx_unlock(lck: &dn_mutex);
2151	kfree_type(struct dn_flow_set, x);
2152	return r;
2153	}
2154	SLIST_INSERT_HEAD(&flowsethash[HASH(x->fs_nr)],
2155	x, next);
2156	}
2157	lck_mtx_unlock(lck: &dn_mutex);
2158	}
2159	return `0`;
2160	}
2161
2162	/*
2163	* Helper function to remove from a heap queues which are linked to
2164	* a flow_set about to be deleted.
2165	*/
2166	static void
2167	fs_remove_from_heap(struct dn_heap h, struct* dn_flow_set *fs)
2168	{
2169	int i = `0`, found = `0`;
2170	for (; i < h->elements;) {
2171	if (((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
2172	h->elements--;
2173	h->p[i] = h->p[h->elements];
2174	found++;
2175	} else {
2176	i++;
2177	}
2178	}
2179	if (found) {
2180	heapify(h);
2181	}
2182	}
2183
2184	/*
2185	* helper function to remove a pipe from a heap (can be there at most once)
2186	*/
2187	static void
2188	pipe_remove_from_heap(struct dn_heap h, struct* dn_pipe *p)
2189	{
2190	if (h->elements > `0`) {
2191	int i = `0`;
2192	for (i = `0`; i < h->elements; i++) {
2193	if (h->p[i].object == p) { / found it /
2194	h->elements--;
2195	h->p[i] = h->p[h->elements];
2196	heapify(h);
2197	break;
2198	}
2199	}
2200	}
2201	}
2202
2203	/*
2204	* drain all queues. Called in case of severe mbuf shortage.
2205	*/
2206	void
2207	dummynet_drain(void)
2208	{
2209	struct dn_flow_set *fs;
2210	struct dn_pipe *p;
2211	struct mbuf m, mnext;
2212	int i;
2213
2214	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
2215
2216	heap_free(h: &ready_heap);
2217	heap_free(h: &wfq_ready_heap);
2218	heap_free(h: &extract_heap);
2219	/ remove all references to this pipe from flow_sets /
2220	for (i = `0`; i < HASHSIZE; i++) {
2221	SLIST_FOREACH(fs, &flowsethash[i], next) {
2222	purge_flow_set(fs, all: `0`);
2223	}
2224	}
2225
2226	for (i = `0`; i < HASHSIZE; i++) {
2227	SLIST_FOREACH(p, &pipehash[i], next) {
2228	purge_flow_set(fs: &(p->fs), all: `0`);
2229
2230	mnext = p->head;
2231	while ((m = mnext) != NULL) {
2232	mnext = m->m_nextpkt;
2233	DN_FREE_PKT(m);
2234	}
2235	p->head = p->tail = NULL;
2236	}
2237	}
2238	}
2239
2240	/*
2241	* Fully delete a pipe or a queue, cleaning up associated info.
2242	*/
2243	static int
2244	delete_pipe(struct dn_pipe *p)
2245	{
2246	if (p->pipe_nr == `0` && p->fs.fs_nr == `0`) {
2247	return EINVAL;
2248	}
2249	if (p->pipe_nr != `0` && p->fs.fs_nr != `0`) {
2250	return EINVAL;
2251	}
2252	if (p->pipe_nr != `0`) { / this is an old-style pipe /
2253	struct dn_pipe *b;
2254	struct dn_flow_set *fs;
2255	int i;
2256
2257	lck_mtx_lock(lck: &dn_mutex);
2258	/ locate pipe /
2259	b = locate_pipe(pipe_nr: p->pipe_nr);
2260	if (b == NULL) {
2261	lck_mtx_unlock(lck: &dn_mutex);
2262	return EINVAL; / not found /
2263	}
2264
2265	/ Unlink from list of pipes. /
2266	SLIST_REMOVE(&pipehash[HASH(b->pipe_nr)], b, dn_pipe, next);
2267
2268
2269	/ Remove all references to this pipe from flow_sets. /
2270	for (i = `0`; i < HASHSIZE; i++) {
2271	SLIST_FOREACH(fs, &flowsethash[i], next) {
2272	if (fs->pipe == b) {
2273	printf("dummynet: ++ ref to pipe %d from fs %d\n",
2274	p->pipe_nr, fs->fs_nr);
2275	fs->pipe = NULL;
2276	purge_flow_set(fs, all: `0`);
2277	}
2278	}
2279	}
2280	fs_remove_from_heap(h: &ready_heap, fs: &(b->fs));
2281
2282	purge_pipe(pipe: b); / remove all data associated to this pipe /
2283	/ remove reference to here from extract_heap and wfq_ready_heap /
2284	pipe_remove_from_heap(h: &extract_heap, p: b);
2285	pipe_remove_from_heap(h: &wfq_ready_heap, p: b);
2286	lck_mtx_unlock(lck: &dn_mutex);
2287
2288	kfree_type(struct dn_pipe, b);
2289	} else { / this is a WF2Q queue (dn_flow_set) /
2290	struct dn_flow_set *b;
2291
2292	lck_mtx_lock(lck: &dn_mutex);
2293	/ locate set /
2294	b = locate_flowset(fs_nr: p->fs.fs_nr);
2295	if (b == NULL) {
2296	lck_mtx_unlock(lck: &dn_mutex);
2297	return EINVAL; / not found /
2298	}
2299
2300
2301	/ Unlink from list of flowsets. /
2302	SLIST_REMOVE( &flowsethash[HASH(b->fs_nr)], b, dn_flow_set, next);
2303
2304	if (b->pipe != NULL) {
2305	/ Update total weight on parent pipe and cleanup parent heaps /
2306	b->pipe->sum -= b->weight * b->backlogged;
2307	fs_remove_from_heap(h: &(b->pipe->not_eligible_heap), fs: b);
2308	fs_remove_from_heap(h: &(b->pipe->scheduler_heap), fs: b);
2309	#if 1 /* XXX should i remove from idle_heap as well ? */
2310	fs_remove_from_heap(h: &(b->pipe->idle_heap), fs: b);
2311	#endif
2312	}
2313	purge_flow_set(fs: b, all: `1`);
2314	lck_mtx_unlock(lck: &dn_mutex);
2315	}
2316	return `0`;
2317	}
2318
2319	/*
2320	* helper function used to copy data from kernel in DUMMYNET_GET
2321	*/
2322	static
2323	char*
2324	dn_copy_set_32(struct dn_flow_set set, char* *bp)
2325	{
2326	int i, copied = `0`;
2327	struct dn_flow_queue *q;
2328	struct dn_flow_queue_32 qp = (struct* dn_flow_queue_32 )(void* *)bp;
2329
2330	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
2331
2332	for (i = `0`; i <= set->rq_size; i++) {
2333	for (q = set->rq[i]; q; q = q->next, qp++) {
2334	if (q->hash_slot != i) {
2335	printf("dummynet: ++ at %d: wrong slot (have %d, "
2336	"should be %d)\n", copied, q->hash_slot, i);
2337	}
2338	if (q->fs != set) {
2339	printf("dummynet: ++ at %d: wrong fs ptr "
2340	"(have 0x%llx, should be 0x%llx)\n", i,
2341	(uint64_t)VM_KERNEL_ADDRPERM(q->fs),
2342	(uint64_t)VM_KERNEL_ADDRPERM(set));
2343	}
2344	copied++;
2345	cp_queue_to_32_user( q, qp );
2346	/ cleanup pointers /
2347	qp->next = (user32_addr_t)`0`;
2348	qp->head = qp->tail = (user32_addr_t)`0`;
2349	qp->fs = (user32_addr_t)`0`;
2350	}
2351	}
2352	if (copied != set->rq_elements) {
2353	printf("dummynet: ++ wrong count, have %d should be %d\n",
2354	copied, set->rq_elements);
2355	}
2356	return (char *)qp;
2357	}
2358
2359	static
2360	char*
2361	dn_copy_set_64(struct dn_flow_set set, char* *bp)
2362	{
2363	int i, copied = `0`;
2364	struct dn_flow_queue *q;
2365	struct dn_flow_queue_64 qp = (struct* dn_flow_queue_64 )(void* *)bp;
2366
2367	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
2368
2369	for (i = `0`; i <= set->rq_size; i++) {
2370	for (q = set->rq[i]; q; q = q->next, qp++) {
2371	if (q->hash_slot != i) {
2372	printf("dummynet: ++ at %d: wrong slot (have %d, "
2373	"should be %d)\n", copied, q->hash_slot, i);
2374	}
2375	if (q->fs != set) {
2376	printf("dummynet: ++ at %d: wrong fs ptr "
2377	"(have 0x%llx, should be 0x%llx)\n", i,
2378	(uint64_t)VM_KERNEL_ADDRPERM(q->fs),
2379	(uint64_t)VM_KERNEL_ADDRPERM(set));
2380	}
2381	copied++;
2382	//bcopy(q, qp, sizeof(q));*
2383	cp_queue_to_64_user( q, qp );
2384	/ cleanup pointers /
2385	qp->next = USER_ADDR_NULL;
2386	qp->head = qp->tail = USER_ADDR_NULL;
2387	qp->fs = USER_ADDR_NULL;
2388	}
2389	}
2390	if (copied != set->rq_elements) {
2391	printf("dummynet: ++ wrong count, have %d should be %d\n",
2392	copied, set->rq_elements);
2393	}
2394	return (char *)qp;
2395	}
2396
2397	static size_t
2398	dn_calc_size(int is64user)
2399	{
2400	struct dn_flow_set *set;
2401	struct dn_pipe *p;
2402	size_t size = `0`;
2403	size_t pipesize;
2404	size_t queuesize;
2405	size_t setsize;
2406	int i;
2407
2408	LCK_MTX_ASSERT(&dn_mutex, LCK_MTX_ASSERT_OWNED);
2409	if (is64user) {
2410	pipesize = sizeof(struct dn_pipe_64);
2411	queuesize = sizeof(struct dn_flow_queue_64);
2412	setsize = sizeof(struct dn_flow_set_64);
2413	} else {
2414	pipesize = sizeof(struct dn_pipe_32);
2415	queuesize = sizeof(struct dn_flow_queue_32);
2416	setsize = sizeof(struct dn_flow_set_32);
2417	}
2418	/*
2419	* compute size of data structures: list of pipes and flow_sets.
2420	*/
2421	for (i = `0`; i < HASHSIZE; i++) {
2422	SLIST_FOREACH(p, &pipehash[i], next) {
2423	size += sizeof(*p) +
2424	p->fs.rq_elements * sizeof(struct dn_flow_queue);
2425	}
2426	SLIST_FOREACH(set, &flowsethash[i], next) {
2427	size += sizeof(*set) +
2428	set->rq_elements * sizeof(struct dn_flow_queue);
2429	}
2430	}
2431	return size;
2432	}
2433
2434	static int
2435	dummynet_get(struct sockopt *sopt)
2436	{
2437	char buf = NULL, bp = NULL; / bp is the "copy-pointer" /
2438	size_t size = `0`;
2439	struct dn_flow_set *set;
2440	struct dn_pipe *p;
2441	int error = `0`, i;
2442	int is64user = `0`;
2443
2444	/ XXX lock held too long /
2445	lck_mtx_lock(lck: &dn_mutex);
2446	/*
2447	* XXX: Ugly, but we need to allocate memory with M_WAITOK flag
2448	* and we cannot use this flag while holding a mutex.
2449	*/
2450	if (proc_is64bit(sopt->sopt_p)) {
2451	is64user = `1`;
2452	}
2453	for (i = `0`; i < `10`; i++) {
2454	size = dn_calc_size(is64user);
2455	lck_mtx_unlock(lck: &dn_mutex);
2456	buf = kalloc_data(size, Z_WAITOK \| Z_ZERO);
2457	if (buf == NULL) {
2458	return ENOBUFS;
2459	}
2460	lck_mtx_lock(lck: &dn_mutex);
2461	if (size == dn_calc_size(is64user)) {
2462	break;
2463	}
2464	kfree_data(buf, size);
2465	buf = NULL;
2466	}
2467	if (buf == NULL) {
2468	lck_mtx_unlock(lck: &dn_mutex);
2469	return ENOBUFS;
2470	}
2471
2472	bp = buf;
2473	for (i = `0`; i < HASHSIZE; i++) {
2474	SLIST_FOREACH(p, &pipehash[i], next) {
2475	/*
2476	* copy pipe descriptor into *bp, convert delay
2477	* back to ms, then copy the flow_set descriptor(s)
2478	* one at a time. After each flow_set, copy the
2479	* queue descriptor it owns.
2480	*/
2481	if (is64user) {
2482	bp = cp_pipe_to_64_user(p,
2483	pipe_bp: (struct dn_pipe_64 )(void* *)bp);
2484	} else {
2485	bp = cp_pipe_to_32_user(p,
2486	pipe_bp: (struct dn_pipe_32 )(void* *)bp);
2487	}
2488	}
2489	}
2490	for (i = `0`; i < HASHSIZE; i++) {
2491	SLIST_FOREACH(set, &flowsethash[i], next) {
2492	struct dn_flow_set_64 *fs_bp =
2493	(struct dn_flow_set_64 )(void* *)bp;
2494	cp_flow_set_to_64_user(set, fs_bp);
2495	/ XXX same hack as above /
2496	fs_bp->next = CAST_DOWN(user64_addr_t,
2497	DN_IS_QUEUE);
2498	fs_bp->pipe = USER_ADDR_NULL;
2499	fs_bp->rq = USER_ADDR_NULL;
2500	bp += sizeof(struct dn_flow_set_64);
2501	bp = dn_copy_set_64( set, bp );
2502	}
2503	}
2504	lck_mtx_unlock(lck: &dn_mutex);
2505	error = sooptcopyout(sopt, data: buf, len: size);
2506	kfree_data(buf, size);
2507	return error;
2508	}
2509
2510	/*
2511	* Handler for the various dummynet socket options (get, flush, config, del)
2512	*/
2513	static int
2514	ip_dn_ctl(struct sockopt *sopt)
2515	{
2516	int error = `0`;
2517	struct dn_pipe *p, tmp_pipe;
2518
2519	/ Disallow sets in really-really secure mode. /
2520	if (sopt->sopt_dir == SOPT_SET && securelevel >= `3`) {
2521	return EPERM;
2522	}
2523
2524	switch (sopt->sopt_name) {
2525	default:
2526	printf("dummynet: -- unknown option %d", sopt->sopt_name);
2527	return EINVAL;
2528
2529	case IP_DUMMYNET_GET:
2530	error = dummynet_get(sopt);
2531	break;
2532
2533	case IP_DUMMYNET_FLUSH:
2534	dummynet_flush();
2535	break;
2536
2537	case IP_DUMMYNET_CONFIGURE:
2538	p = &tmp_pipe;
2539	if (proc_is64bit(sopt->sopt_p)) {
2540	error = cp_pipe_from_user_64( sopt, p );
2541	} else {
2542	error = cp_pipe_from_user_32( sopt, p );
2543	}
2544
2545	if (error) {
2546	break;
2547	}
2548	error = config_pipe(p);
2549	break;
2550
2551	case IP_DUMMYNET_DEL: / remove a pipe or queue /
2552	p = &tmp_pipe;
2553	if (proc_is64bit(sopt->sopt_p)) {
2554	error = cp_pipe_from_user_64( sopt, p );
2555	} else {
2556	error = cp_pipe_from_user_32( sopt, p );
2557	}
2558	if (error) {
2559	break;
2560	}
2561
2562	error = delete_pipe(p);
2563	break;
2564	}
2565	return error;
2566	}
2567
2568	void
2569	dummynet_init(void)
2570	{
2571	eventhandler_lists_ctxt_init(evthdlr_lists_ctxt: &dummynet_evhdlr_ctxt);
2572	}
2573
2574	void
2575	ip_dn_init(void)
2576	{
2577	/ setup locks /
2578	ready_heap.size = ready_heap.elements = `0`;
2579	ready_heap.offset = `0`;
2580
2581	wfq_ready_heap.size = wfq_ready_heap.elements = `0`;
2582	wfq_ready_heap.offset = `0`;
2583
2584	extract_heap.size = extract_heap.elements = `0`;
2585	extract_heap.offset = `0`;
2586	ip_dn_ctl_ptr = ip_dn_ctl;
2587	ip_dn_io_ptr = dummynet_io;
2588	}
2589
2590	struct dn_event_nwk_wq_entry {
2591	struct nwk_wq_entry nwk_wqe;
2592	struct dummynet_event dn_ev_arg;
2593	};
2594
2595	static void
2596	dummynet_event_callback(struct nwk_wq_entry *nwk_item)
2597	{
2598	struct dn_event_nwk_wq_entry *p_ev;
2599
2600	p_ev = __container_of(nwk_item, struct dn_event_nwk_wq_entry, nwk_wqe);
2601
2602	EVENTHANDLER_INVOKE(&dummynet_evhdlr_ctxt, dummynet_event, &p_ev->dn_ev_arg);
2603
2604	kfree_type(struct dn_event_nwk_wq_entry, p_ev);
2605	}
2606
2607	void
2608	dummynet_event_enqueue_nwk_wq_entry(struct dummynet_event *p_dn_event)
2609	{
2610	struct dn_event_nwk_wq_entry *p_ev = NULL;
2611
2612	p_ev = kalloc_type(struct dn_event_nwk_wq_entry,
2613	Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
2614	p_ev->nwk_wqe.func = dummynet_event_callback;
2615	p_ev->dn_ev_arg = *p_dn_event;
2616	nwk_wq_enqueue(nwk_item: &p_ev->nwk_wqe);
2617	}
2618
2619	struct dummynet_tag_container {
2620	struct m_tag dtc_m_tag;
2621	struct dn_pkt_tag dtc_dn_pkt_tag;
2622	};
2623
2624	struct m_tag *
2625	m_tag_kalloc_dummynet(u_int32_t id, u_int16_t type, uint16_t len, int wait)
2626	{
2627	struct dummynet_tag_container *tag_container;
2628	struct m_tag *tag = NULL;
2629
2630	assert3u(id, ==, KERNEL_MODULE_TAG_ID);
2631	assert3u(type, ==, KERNEL_TAG_TYPE_DUMMYNET);
2632	assert3u(len, ==, sizeof(struct dn_pkt_tag));
2633
2634	if (len != sizeof(struct dn_pkt_tag)) {
2635	return NULL;
2636	}
2637
2638	tag_container = kalloc_type(struct dummynet_tag_container, wait \| M_ZERO);
2639	if (tag_container != NULL) {
2640	tag = &tag_container->dtc_m_tag;
2641
2642	assert3p(tag, ==, tag_container);
2643
2644	M_TAG_INIT(tag, id, type, len, &tag_container->dtc_dn_pkt_tag, NULL);
2645	}
2646
2647	return tag;
2648	}
2649
2650	void
2651	m_tag_kfree_dummynet(struct m_tag *tag)
2652	{
2653	struct dummynet_tag_container tag_container = (struct* dummynet_tag_container *)tag;
2654
2655	assert3u(tag->m_tag_len, ==, sizeof(struct dn_pkt_tag));
2656
2657	kfree_type(struct dummynet_tag_container, tag_container);
2658	}
2659
2660	void
2661	dummynet_register_m_tag(void)
2662	{
2663	int error;
2664
2665	error = m_register_internal_tag_type(type: KERNEL_TAG_TYPE_DUMMYNET, len: sizeof(struct dn_pkt_tag),
2666	alloc_func: m_tag_kalloc_dummynet, free_func: m_tag_kfree_dummynet);
2667
2668	assert3u(error, ==, `0`);
2669	}
2670

Browse the source code of xnu/bsd/netinet/ip_dummynet.c