1/*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <ptrauth.h>
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/kernel.h>
77#include <sys/sysctl.h>
78#include <sys/syslog.h>
79#include <sys/protosw.h>
80#include <sys/domain.h>
81#include <sys/queue.h>
82#include <sys/proc.h>
83#include <sys/filedesc.h>
84#include <sys/file_internal.h>
85
86#include <dev/random/randomdev.h>
87
88#include <kern/kern_types.h>
89#include <kern/simple_lock.h>
90#include <kern/queue.h>
91#include <kern/sched_prim.h>
92#include <kern/backtrace.h>
93#include <kern/percpu.h>
94#include <kern/zalloc.h>
95
96#include <libkern/OSDebug.h>
97#include <libkern/libkern.h>
98
99#include <os/log.h>
100#include <os/ptrtools.h>
101
102#include <IOKit/IOMapper.h>
103
104#include <machine/limits.h>
105#include <machine/machine_routines.h>
106
107#if CONFIG_MBUF_MCACHE
108#include <sys/mcache.h>
109#endif /* CONFIG_MBUF_MCACHE */
110#include <net/ntstat.h>
111
112#if INET
113extern int dump_tcp_reass_qlen(char *, int);
114extern int tcp_reass_qlen_space(struct socket *);
115#endif /* INET */
116
117#if MPTCP
118extern int dump_mptcp_reass_qlen(char *, int);
119#endif /* MPTCP */
120
121
122#if NETWORKING
123extern int dlil_dump_top_if_qlen(char *, int);
124#endif /* NETWORKING */
125
126#if CONFIG_MBUF_MCACHE
127/*
128 * MBUF IMPLEMENTATION NOTES.
129 *
 * There are a total of 5 per-CPU caches:
131 *
132 * MC_MBUF:
133 * This is a cache of rudimentary objects of _MSIZE in size; each
134 * object represents an mbuf structure. This cache preserves only
135 * the m_type field of the mbuf during its transactions.
136 *
137 * MC_CL:
138 * This is a cache of rudimentary objects of MCLBYTES in size; each
139 * object represents a mcluster structure. This cache does not
140 * preserve the contents of the objects during its transactions.
141 *
142 * MC_BIGCL:
143 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
144 * object represents a mbigcluster structure. This cache does not
145 * preserve the contents of the objects during its transaction.
146 *
147 * MC_MBUF_CL:
148 * This is a cache of mbufs each having a cluster attached to it.
149 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
150 * fields of the mbuf related to the external cluster are preserved
151 * during transactions.
152 *
153 * MC_MBUF_BIGCL:
154 * This is a cache of mbufs each having a big cluster attached to it.
155 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
156 * fields of the mbuf related to the external cluster are preserved
157 * during transactions.
158 *
159 * OBJECT ALLOCATION:
160 *
161 * Allocation requests are handled first at the per-CPU (mcache) layer
162 * before falling back to the slab layer. Performance is optimal when
163 * the request is satisfied at the CPU layer because global data/lock
164 * never gets accessed. When the slab layer is entered for allocation,
165 * the slab freelist will be checked first for available objects before
166 * the VM backing store is invoked. Slab layer operations are serialized
167 * for all of the caches as the mbuf global lock is held most of the time.
168 * Allocation paths are different depending on the class of objects:
169 *
170 * a. Rudimentary object:
171 *
172 * { m_get_common(), m_clattach(), m_mclget(),
173 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
174 * composite object allocation }
175 * | ^
176 * | |
177 * | +-----------------------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_slab_alloc() |
186 * | |
187 * v |
188 * +---------> [freelist] -------> (found?) -------+
189 * | |
190 * | v
191 * | m_clalloc()
192 * | |
193 * | v
194 * +---<<---- kmem_mb_alloc()
195 *
196 * b. Composite object:
197 *
198 * { m_getpackets_internal(), m_allocpacket_internal() }
199 * | ^
200 * | |
201 * | +------ (done) ---------+
202 * v |
203 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
204 * | ^
205 * v |
206 * [CPU cache] -------> (found?) -------+
207 * | |
208 * v |
209 * mbuf_cslab_alloc() |
210 * | |
211 * v |
212 * [freelist] -------> (found?) -------+
213 * | |
214 * v |
215 * (rudimentary object) |
216 * mcache_alloc/mcache_alloc_ext() ------>>-----+
217 *
218 * Auditing notes: If auditing is enabled, buffers will be subjected to
219 * integrity checks by the audit routine. This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller. As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern. It
 * will also restore any constructed data structure fields if necessary.
224 *
225 * OBJECT DEALLOCATION:
226 *
227 * Freeing an object simply involves placing it into the CPU cache; this
228 * pollutes the cache to benefit subsequent allocations. The slab layer
229 * will only be entered if the object is to be purged out of the cache.
230 * During normal operations, this happens only when the CPU layer resizes
231 * its bucket while it's adjusting to the allocation load. Deallocation
232 * paths are different depending on the class of objects:
233 *
234 * a. Rudimentary object:
235 *
236 * { m_free(), m_freem_list(), composite object deallocation }
237 * | ^
238 * | |
239 * | +------ (done) ---------+
240 * v |
241 * mcache_free/mcache_free_ext() |
242 * | |
243 * v |
244 * mbuf_slab_audit() |
245 * | |
246 * v |
247 * [CPU cache] ---> (not purging?) -----+
248 * | |
249 * v |
250 * mbuf_slab_free() |
251 * | |
252 * v |
253 * [freelist] ----------->>------------+
254 * (objects get purged to VM only on demand)
255 *
256 * b. Composite object:
257 *
258 * { m_free(), m_freem_list() }
259 * | ^
260 * | |
261 * | +------ (done) ---------+
262 * v |
263 * mcache_free/mcache_free_ext() |
264 * | |
265 * v |
266 * mbuf_cslab_audit() |
267 * | |
268 * v |
269 * [CPU cache] ---> (not purging?) -----+
270 * | |
271 * v |
272 * mbuf_cslab_free() |
273 * | |
274 * v |
275 * [freelist] ---> (not purging?) -----+
276 * | |
277 * v |
278 * (rudimentary object) |
279 * mcache_free/mcache_free_ext() ------->>------+
280 *
281 * Auditing notes: If auditing is enabled, the audit routine will save
282 * any constructed data structure fields (if necessary) before filling the
283 * contents of the buffers with DEADBEEF (free) pattern and recording the
284 * transaction. Buffers that are freed (whether at CPU or slab layer) are
285 * expected to contain the free pattern.
286 *
287 * DEBUGGING:
288 *
289 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
290 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
291 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
292 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
293 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
294 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
295 *
296 * Each object is associated with exactly one mcache_audit_t structure that
297 * contains the information related to its last buffer transaction. Given
 * the address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
300 *
301 * +------------+ +=============+
302 * | mbuf addr | | mclaudit[i] |
303 * +------------+ +=============+
304 * | | cl_audit[0] |
305 * i = MTOBG(addr) +-------------+
306 * | +-----> | cl_audit[1] | -----> mcache_audit_t
307 * b = BGTOM(i) | +-------------+
308 * | | | ... |
309 * x = MCLIDX(b, addr) | +-------------+
310 * | | | cl_audit[7] |
311 * +-----------------+ +-------------+
312 * (e.g. x == 1)
313 *
314 * The mclaudit[] array is allocated at initialization time, but its contents
315 * get populated when the corresponding cluster is created. Because a page
 * can be turned into NMBPG mbufs, we preserve enough space for the mbufs
 * so that there is a 1-to-1 mapping between them. A page that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0], with
 * the remaining entries unused. For a 16KB cluster, only one entry from
 * the first page is allocated and used for the entire object.
321 */
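/*
 * Illustrative sketch of the lookup described above, kept under "#if 0" so
 * it is never compiled.  The helper name is hypothetical; the real per-class
 * lookup is done by mcl_audit_buf2mca() further below, using the MTOPG()/
 * PGTOM()/MBPAGEIDX() macros and the mclaudit[] array declared later in this
 * file.  Only the plain mbuf case is shown.
 */
#if 0
static mcache_audit_t *
example_mbuf_to_audit(struct mbuf *m)
{
	/* index of the page (cluster) that contains this mbuf */
	unsigned int pg = MTOPG(m);
	/* base address of that page */
	unsigned char *base = PGTOM(pg);
	/* which of the NMBPG mbuf slots within the page this mbuf occupies */
	unsigned int idx = MBPAGEIDX(base, m);

	return mclaudit[pg].cl_audit[idx];
}
#endif /* 0 */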
322#else
323/*
324 * MBUF IMPLEMENTATION NOTES (using zalloc).
325 *
326 * There are a total of 4 zones and 3 zcaches.
327 *
328 * MC_MBUF:
329 * This is a zone of rudimentary objects of _MSIZE in size; each
330 * object represents an mbuf structure. This cache preserves only
331 * the m_type field of the mbuf during its transactions.
332 *
333 * MC_CL:
334 * This is a zone of rudimentary objects of MCLBYTES in size; each
335 * object represents a mcluster structure. This cache does not
336 * preserve the contents of the objects during its transactions.
337 *
338 * MC_BIGCL:
339 * This is a zone of rudimentary objects of MBIGCLBYTES in size; each
340 * object represents a mbigcluster structure. This cache does not
341 * preserve the contents of the objects during its transaction.
342 *
343 * MC_16KCL:
344 * This is a zone of rudimentary objects of M16KCLBYTES in size; each
345 * object represents a m16kcluster structure. This cache does not
346 * preserve the contents of the objects during its transaction.
347 *
348 * MC_MBUF_CL:
349 * This is a cache of mbufs each having a cluster attached to it.
350 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
351 * fields of the mbuf related to the external cluster are preserved
352 * during transactions.
353 *
354 * MC_MBUF_BIGCL:
355 * This is a cache of mbufs each having a big cluster attached to it.
356 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
357 * fields of the mbuf related to the external cluster are preserved
358 * during transactions.
359 *
360 * MC_MBUF_16KCL:
 * This is a cache of mbufs each having a 16KB cluster attached to it.
362 * It is backed by MC_MBUF and MC_16KCL rudimentary caches. Several
363 * fields of the mbuf related to the external cluster are preserved
364 * during transactions.
365 *
366 * OBJECT ALLOCATION:
367 *
368 * Allocation requests are handled first at the zalloc per-CPU layer
369 * before falling back to the zalloc depot. Performance is optimal when
370 * the request is satisfied at the CPU layer. zalloc has an additional
371 * overflow layer called the depot, not pictured in the diagram below.
372 *
373 * Allocation paths are different depending on the class of objects:
374 *
375 * a. Rudimentary object:
376 *
377 * { m_get_common(), m_clattach(), m_mclget(),
378 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
379 * composite object allocation }
380 * | ^
381 * | |
382 * | +------- (done) --------+
383 * v |
384 * zalloc_flags/zalloc_n() KASAN
385 * | ^
386 * v |
387 * +----> [zalloc per-CPU cache] -----> (found?) --+
388 * | | |
389 * | v |
390 * | [zalloc recirculation layer] --> (found?) ---+
391 * | |
392 * | v
393 * +--<<-- [zone backing store]
394 *
395 * b. Composite object:
396 *
397 * { m_getpackets_internal(), m_allocpacket_internal() }
398 * | ^
399 * | |
400 * | +------ (done) ---------+
401 * v |
402 * mz_composite_alloc() KASAN
403 * | ^
404 * v |
405 * zcache_alloc_n() |
406 * | |
407 * v |
408 * [zalloc per-CPU cache] --> mark_valid() ---+
409 * | |
410 * v |
411 * [zalloc recirculation layer] -> mark_valid() -+
412 * | |
413 * v |
414 * mz_composite_build() |
415 * | |
416 * v |
417 * (rudimentary objects) |
418 * zalloc_id() ---------------->>-----+
419 *
 * Auditing notes: If KASAN is enabled, buffers will be subjected to
421 * integrity checks by the AddressSanitizer.
422 *
423 * OBJECT DEALLOCATION:
424 *
425 * Freeing an object simply involves placing it into the CPU cache; this
426 * pollutes the cache to benefit subsequent allocations. The depot
427 * will only be entered if the object is to be purged out of the cache.
428 * Objects may be purged based on the overall memory pressure or
429 * during zone garbage collection.
 * To improve performance, objects are not zero-filled when freed,
 * unlike what is customary for other zalloc zones.
432 *
433 * Deallocation paths are different depending on the class of objects:
434 *
435 * a. Rudimentary object:
436 *
437 * { m_free(), m_freem_list(), composite object deallocation }
438 * | ^
439 * | |
440 * | +------ (done) ---------+
441 * v |
442 * zfree_nozero() |
443 * | |
444 * v |
445 * KASAN |
446 * | |
447 * v |
448 * [zalloc per-CPU cache] -> (not purging?) --+
449 * | |
450 * v |
451 * [zalloc recirculation layer] --->>----------+
452 *
453 *
454 * b. Composite object:
455 *
456 * { m_free(), m_freem_list() }
457 * | ^
458 * | |
459 * | +------ (done) ---------+
460 * v |
461 * mz_composite_free() |
462 * | |
463 * v |
464 * zcache_free_n() |
465 * | |
466 * v |
467 * KASAN |
468 * | |
469 * v |
470 * [zalloc per-CPU cache] -> mark_invalid() --+
471 * | |
472 * v |
473 * mz_composite_destroy() |
474 * | |
475 * v |
476 * (rudimentary object) |
477 * zfree_nozero() -------------->>------+
478 *
 * Auditing notes: If KASAN is enabled, buffers will be subjected to
480 * integrity checks by the AddressSanitizer.
481 *
482 * DEBUGGING:
483 *
484 * Debugging mbufs can be done by booting a KASAN enabled kernel.
485 */
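/*
 * Minimal usage sketch of the composite path described above, kept under
 * "#if 0" so it is never compiled.  mz_composite_alloc()/mz_composite_free()
 * are the wrappers declared further below; the surrounding function is
 * hypothetical and exists only to show the call flow.
 */
#if 0
static void
example_composite_roundtrip(void)
{
	/* mbuf + 2KB cluster; a blocking allocation goes through zcache_alloc_n() */
	struct mbuf *m = mz_composite_alloc(MC_MBUF_CL, Z_WAITOK);

	if (m != NULL) {
		/* ... use the mbuf ... */
		mz_composite_free(MC_MBUF_CL, m);
	}
}
#endif /* 0 */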
486
487#endif /* CONFIG_MBUF_MCACHE */
488
489/* TODO: should be in header file */
/* kernel translator */
491extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
492extern vm_map_t mb_map; /* special map */
493
494#if CONFIG_MBUF_MCACHE
495static uint32_t mb_kmem_contig_failed;
496static uint32_t mb_kmem_failed;
497static uint32_t mb_kmem_one_failed;
498/* Timestamp of allocation failures. */
499static uint64_t mb_kmem_contig_failed_ts;
500static uint64_t mb_kmem_failed_ts;
501static uint64_t mb_kmem_one_failed_ts;
502static uint64_t mb_kmem_contig_failed_size;
503static uint64_t mb_kmem_failed_size;
504static uint32_t mb_kmem_stats[6];
505#endif /* CONFIG_MBUF_MCACHE */
506
507/* Global lock */
508static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
509static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
510static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
511
512#if CONFIG_MBUF_MCACHE
513/* Back-end (common) layer */
514static uint64_t mb_expand_cnt;
515static uint64_t mb_expand_cl_cnt;
516static uint64_t mb_expand_cl_total;
517static uint64_t mb_expand_bigcl_cnt;
518static uint64_t mb_expand_bigcl_total;
519static uint64_t mb_expand_16kcl_cnt;
520static uint64_t mb_expand_16kcl_total;
521static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
522static uint32_t mbuf_worker_run_cnt;
523static uint64_t mbuf_worker_last_runtime;
524static uint64_t mbuf_drain_last_runtime;
525static int mbuf_worker_ready; /* worker thread is runnable */
526static unsigned int ncpu; /* number of CPUs */
527static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
528static ppnum_t mcl_pages; /* Size of array (# physical pages) */
529static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
530static mcache_t *ref_cache; /* Cache of cluster reference & flags */
531static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
532unsigned int mbuf_debug; /* patchable mbuf mcache flags */
#endif /* CONFIG_MBUF_MCACHE */
534static unsigned int mb_normalized; /* number of packets "normalized" */
535
536extern unsigned int mb_tag_mbuf;
537
538#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
539#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
540
541typedef enum {
542 MC_MBUF = 0, /* Regular mbuf */
543 MC_CL, /* Cluster */
544 MC_BIGCL, /* Large (4KB) cluster */
545 MC_16KCL, /* Jumbo (16KB) cluster */
546 MC_MBUF_CL, /* mbuf + cluster */
547 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
548 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
549} mbuf_class_t;
550
551#define MBUF_CLASS_MIN MC_MBUF
552#define MBUF_CLASS_MAX MC_MBUF_16KCL
553#define MBUF_CLASS_LAST MC_16KCL
554#define MBUF_CLASS_VALID(c) \
555 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
556#define MBUF_CLASS_COMPOSITE(c) \
557 ((int)(c) > MBUF_CLASS_LAST)
558
559
560/*
561 * mbuf specific mcache allocation request flags.
562 */
563#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
564
565/*
566 * Per-cluster slab structure.
567 *
568 * A slab is a cluster control structure that contains one or more object
569 * chunks; the available chunks are chained in the slab's freelist (sl_head).
570 * Each time a chunk is taken out of the slab, the slab's reference count
571 * gets incremented. When all chunks have been taken out, the empty slab
572 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
573 * returned to a slab causes the slab's reference count to be decremented;
574 * it also causes the slab to be reinserted back to class's slab list, if
575 * it's not already done.
576 *
577 * Compartmentalizing of the object chunks into slabs allows us to easily
578 * merge one or more slabs together when the adjacent slabs are idle, as
579 * well as to convert or move a slab from one class to another; e.g. the
580 * mbuf cluster slab can be converted to a regular cluster slab when all
581 * mbufs in the slab have been freed.
582 *
583 * A slab may also span across multiple clusters for chunks larger than
584 * a cluster's size. In this case, only the slab of the first cluster is
585 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
586 * that they are part of the larger slab.
587 *
588 * Each slab controls a page of memory.
589 */
590typedef struct mcl_slab {
591 struct mcl_slab *sl_next; /* neighboring slab */
592 u_int8_t sl_class; /* controlling mbuf class */
593 int8_t sl_refcnt; /* outstanding allocations */
594 int8_t sl_chunks; /* chunks (bufs) in this slab */
595 u_int16_t sl_flags; /* slab flags (see below) */
596 u_int16_t sl_len; /* slab length */
597 void *sl_base; /* base of allocated memory */
598 void *sl_head; /* first free buffer */
599 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
600} mcl_slab_t;
601
602#define SLF_MAPPED 0x0001 /* backed by a mapped page */
603#define SLF_PARTIAL 0x0002 /* part of another slab */
604#define SLF_DETACHED 0x0004 /* not in slab freelist */
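/*
 * Simplified sketch (kept under "#if 0", not compiled) of how a chunk leaves
 * a slab, roughly mirroring the CONFIG_MBUF_MCACHE slab_alloc() path: pop the
 * chunk off sl_head, bump the reference count, and detach the slab from its
 * class list once it has no free chunks left.  The helper name is
 * hypothetical; mcache_obj_t and slab_remove() are declared further below
 * under CONFIG_MBUF_MCACHE, and the real code also validates pointers,
 * handles multi-page slabs and updates the per-class statistics.
 */
#if 0
static void *
example_slab_pop(mcl_slab_t *sp, mbuf_class_t class)
{
	mcache_obj_t *buf = sp->sl_head;

	if (buf == NULL) {
		return NULL;            /* slab has no free chunks */
	}
	sp->sl_head = buf->obj_next;    /* advance the slab freelist */
	buf->obj_next = NULL;
	sp->sl_refcnt++;                /* one more outstanding chunk */
	if (sp->sl_head == NULL) {
		slab_remove(sp, class); /* empty: take it off the class list */
	}
	return buf;
}
#endif /* 0 */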
605
606/*
 * The array of slabs is broken into groups of arrays, one per 1MB of kernel
608 * memory to reduce the footprint. Each group is allocated on demand
609 * whenever a new piece of memory mapped in from the VM crosses the 1MB
610 * boundary.
611 */
612#define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
613
614typedef struct mcl_slabg {
615 mcl_slab_t *slg_slab; /* group of slabs */
616} mcl_slabg_t;
617
618/*
619 * Number of slabs needed to control a 16KB cluster object.
620 */
621#define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
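/*
 * Illustrative sketch (kept under "#if 0", not compiled) of how the grouping
 * above is used: the page index of an address selects a 1MB slab group, and
 * the remainder selects the slab within that group, which is roughly what
 * slab_get() does.  The helper name is hypothetical; MTOPG() and slabstbl
 * are defined/declared further below under CONFIG_MBUF_MCACHE.
 */
#if 0
static mcl_slab_t *
example_addr_to_slab(void *addr)
{
	unsigned int pg  = MTOPG(addr);         /* page index from mbutl */
	unsigned int grp = pg / NSLABSPMB;      /* which 1MB slab group  */
	unsigned int off = pg % NSLABSPMB;      /* slab within the group */

	return &slabstbl[grp]->slg_slab[off];
}
#endif /* 0 */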
622
623#if CONFIG_MBUF_MCACHE
624/*
625 * Per-cluster audit structure.
626 */
627typedef struct {
628 mcache_audit_t **cl_audit; /* array of audits */
629} mcl_audit_t;
630
631typedef struct {
632 struct thread *msa_thread; /* thread doing transaction */
633 struct thread *msa_pthread; /* previous transaction thread */
634 uint32_t msa_tstamp; /* transaction timestamp (ms) */
635 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
636 uint16_t msa_depth; /* pc stack depth */
637 uint16_t msa_pdepth; /* previous transaction pc stack */
638 void *msa_stack[MCACHE_STACK_DEPTH];
639 void *msa_pstack[MCACHE_STACK_DEPTH];
640} mcl_scratch_audit_t;
641
642typedef struct {
643 /*
644 * Size of data from the beginning of an mbuf that covers m_hdr,
645 * pkthdr and m_ext structures. If auditing is enabled, we allocate
646 * a shadow mbuf structure of this size inside each audit structure,
647 * and the contents of the real mbuf gets copied into it when the mbuf
648 * is freed. This allows us to pattern-fill the mbuf for integrity
649 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
650 * cluster cache case). Note that we don't save the contents of
651 * clusters when they are freed; we simply pattern-fill them.
652 */
653 u_int8_t sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)];
654 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
655} mcl_saved_contents_t;
656
657#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
658
659#define MCA_SAVED_MBUF_PTR(_mca) \
660 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
661 (_mca)->mca_contents)->sc_mbuf)
662#define MCA_SAVED_MBUF_SIZE \
663 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
664#define MCA_SAVED_SCRATCH_PTR(_mca) \
665 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
666
667/*
668 * mbuf specific mcache audit flags
669 */
670#define MB_INUSE 0x01 /* object has not been returned to slab */
671#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
672#define MB_SCVALID 0x04 /* object has valid saved contents */
673
674/*
 * Each of the following two arrays holds up to nmbclusters elements.
676 */
677static mcl_audit_t *mclaudit; /* array of cluster audit information */
678static unsigned int maxclaudit; /* max # of entries in audit table */
679static mcl_slabg_t **slabstbl; /* cluster slabs table */
680static unsigned int maxslabgrp; /* max # of entries in slabs table */
681static unsigned int slabgrp; /* # of entries in slabs table */
682#endif /* CONFIG_MBUF_MCACHE */
683
684/* Globals */
685int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
686int njcl; /* # of clusters for jumbo sizes */
687int njclbytes; /* size of a jumbo cluster */
688unsigned char *mbutl; /* first mapped cluster address */
689unsigned char *embutl; /* ending virtual address of mclusters */
690int max_linkhdr; /* largest link-level header */
691int max_protohdr; /* largest protocol header */
692int max_hdr; /* largest link+protocol header */
693int max_datalen; /* MHLEN - max_hdr */
694
695#if CONFIG_MBUF_MCACHE
696static boolean_t mclverify; /* debug: pattern-checking */
697static boolean_t mcltrace; /* debug: stack tracing */
698static boolean_t mclfindleak; /* debug: leak detection */
699static boolean_t mclexpleak; /* debug: expose leak info to user space */
700
701static struct timeval mb_start; /* beginning of time */
702
703/* mbuf leak detection variables */
704static struct mleak_table mleak_table;
705static mleak_stat_t *mleak_stat;
706
707#define MLEAK_STAT_SIZE(n) \
708 __builtin_offsetof(mleak_stat_t, ml_trace[n])
709
710struct mallocation {
711 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
712 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
713 u_int32_t count; /* How many objects were requested */
714 u_int64_t hitcount; /* for determining hash effectiveness */
715};
716
717struct mtrace {
718 u_int64_t collisions;
719 u_int64_t hitcount;
720 u_int64_t allocs;
721 u_int64_t depth;
722 uintptr_t addr[MLEAK_STACK_DEPTH];
723};
724
725/* Size must be a power of two for the zhash to be able to just mask off bits */
726#define MLEAK_ALLOCATION_MAP_NUM 512
727#define MLEAK_TRACE_MAP_NUM 256
728
729/*
 * Sample factor for how often to record a trace. This can be overridden
 * via the boot-arg mleak_sample_factor.
732 */
733#define MLEAK_SAMPLE_FACTOR 500
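/*
 * Illustrative sketch (kept under "#if 0", not compiled): because the bucket
 * counts above are powers of two, a bucket index can be derived from a hash
 * by masking instead of a modulo, and the sample factor simply thins out how
 * often an allocation gets recorded.  The helper names are hypothetical, and
 * the real code uses the (boot-arg adjustable) mleak_alloc_buckets /
 * mleak_table values rather than the compile-time defaults shown here.
 */
#if 0
static uint32_t
example_mleak_bucket(uint64_t hash)
{
	return (uint32_t)(hash & (MLEAK_ALLOCATION_MAP_NUM - 1));
}

static boolean_t
example_mleak_should_sample(uint64_t nth_allocation)
{
	return (nth_allocation % MLEAK_SAMPLE_FACTOR) == 0;
}
#endif /* 0 */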
734
735/*
736 * Number of top leakers recorded.
737 */
738#define MLEAK_NUM_TRACES 5
739
740#define MB_LEAK_SPACING_64 " "
741#define MB_LEAK_SPACING_32 " "
742
743
744#define MB_LEAK_HDR_32 "\n\
745 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
746 ---------- ---------- ---------- ---------- ---------- \n\
747"
748
749#define MB_LEAK_HDR_64 "\n\
750 trace [1] trace [2] trace [3] \
751 trace [4] trace [5] \n\
752 ------------------ ------------------ ------------------ \
753 ------------------ ------------------ \n\
754"
755
756static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
757static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
758
759/* Hashmaps of allocations and their corresponding traces */
760static struct mallocation *mleak_allocations;
761static struct mtrace *mleak_traces;
762static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
763
764/* Lock to protect mleak tables from concurrent modification */
765static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
766static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
767static lck_mtx_t *const mleak_lock = &mleak_lock_data;
768
769/* *Failed* large allocations. */
770struct mtracelarge {
771 uint64_t size;
772 uint64_t depth;
773 uintptr_t addr[MLEAK_STACK_DEPTH];
774};
775
776#define MTRACELARGE_NUM_TRACES 5
777static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
778
779static void mtracelarge_register(size_t size);
780#endif /* CONFIG_MBUF_MCACHE */
781
782/* Lock to protect the completion callback table */
783static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
784LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
785
786extern u_int32_t high_sb_max;
787
788/* The minimum number of objects that are allocated, to start. */
789#define MINCL 32
790#define MINBIGCL (MINCL >> 1)
791#define MIN16KCL (MINCL >> 2)
792
793/* Low watermarks (only map in pages once free counts go below) */
794#define MBIGCL_LOWAT MINBIGCL
795#define M16KCL_LOWAT MIN16KCL
796
797typedef struct {
798 mbuf_class_t mtbl_class; /* class type */
799#if CONFIG_MBUF_MCACHE
800 mcache_t *mtbl_cache; /* mcache for this buffer class */
801 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
802 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
803#endif /* CONFIG_MBUF_MCACHE */
804 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
805 u_int32_t mtbl_maxsize; /* maximum buffer size */
806 int mtbl_minlimit; /* minimum allowed */
807 int mtbl_maxlimit; /* maximum allowed */
808 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
809 uint32_t mtbl_avgtotal; /* average total on iOS */
810 u_int32_t mtbl_expand; /* worker should expand the class */
811} mbuf_table_t;
812
813#define m_class(c) mbuf_table[c].mtbl_class
814#if CONFIG_MBUF_MCACHE
815#define m_cache(c) mbuf_table[c].mtbl_cache
816#define m_slablist(c) mbuf_table[c].mtbl_slablist
817#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
818#else
819#define m_stats(c) mbuf_table[c].mtbl_stats
820#endif /* CONFIG_MBUF_MCACHE */
821#define m_maxsize(c) mbuf_table[c].mtbl_maxsize
822#define m_minlimit(c) mbuf_table[c].mtbl_minlimit
823#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
824#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
825#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
826#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
827#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
828#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
829#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
830#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
831#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
832#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
833#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
834#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
835#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
836#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
837#define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
838#define m_region_expand(c) mbuf_table[c].mtbl_expand
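/*
 * Illustrative sketch (kept under "#if 0", not compiled): the accessor macros
 * above read and update the per-class statistics stored in mbuf_table.  For
 * example, a rough estimate of the memory attributable to one rudimentary
 * class is its element size times its total element count.  The helper name
 * is hypothetical; callers would hold mbuf_mlock, since the statistics are
 * protected by the global mbuf lock, and composite classes share memory with
 * their backing classes, so such figures must not simply be summed.
 */
#if 0
static uint64_t
example_class_footprint(mbuf_class_t class)
{
	return (uint64_t)m_size(class) * (uint64_t)m_total(class);
}
#endif /* 0 */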
839
840static mbuf_table_t mbuf_table[] = {
841#if CONFIG_MBUF_MCACHE
842 /*
843 * The caches for mbufs, regular clusters and big clusters.
844 * The average total values were based on data gathered by actual
845 * usage patterns on iOS.
846 */
847 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
848 NULL, NULL, 0, 0, 0, 0, 3000, 0 },
849 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
850 NULL, NULL, 0, 0, 0, 0, 2000, 0 },
851 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
852 NULL, NULL, 0, 0, 0, 0, 1000, 0 },
853 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
854 NULL, NULL, 0, 0, 0, 0, 200, 0 },
855 /*
856 * The following are special caches; they serve as intermediate
857 * caches backed by the above rudimentary caches. Each object
858 * in the cache is an mbuf with a cluster attached to it. Unlike
859 * the above caches, these intermediate caches do not directly
860 * deal with the slab structures; instead, the constructed
861 * cached elements are simply stored in the freelists.
862 */
863 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
864 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
865 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
866#else
867 { .mtbl_class = MC_MBUF },
868 { .mtbl_class = MC_CL },
869 { .mtbl_class = MC_BIGCL },
870 { .mtbl_class = MC_16KCL },
871 { .mtbl_class = MC_MBUF_CL },
872 { .mtbl_class = MC_MBUF_BIGCL },
873 { .mtbl_class = MC_MBUF_16KCL },
874#endif /* CONFIG_MBUF_MCACHE */
875};
876
877#define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
878
879#if SKYWALK && CONFIG_MBUF_MCACHE
880#define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
881static unsigned int mc_threshold_scale_down_factor =
882 MC_THRESHOLD_SCALE_DOWN_FACTOR;
#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
884
885#if CONFIG_MBUF_MCACHE
886static uint32_t
887m_avgtotal(mbuf_class_t c)
888{
889#if SKYWALK
890 return if_is_fsw_transport_netagent_enabled() ?
891 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
892 mbuf_table[c].mtbl_avgtotal;
893#else /* !SKYWALK */
894 return mbuf_table[c].mtbl_avgtotal;
895#endif /* SKYWALK */
896}
897#endif /* CONFIG_MBUF_MCACHE */
898
899#if CONFIG_MBUF_MCACHE
900static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
901static int mb_waiters; /* number of waiters */
902#endif /* CONFIG_MBUF_MCACHE */
903
904#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
905#if CONFIG_MBUF_MCACHE
906static struct timeval mb_wdtstart; /* watchdog start timestamp */
907static char *mbuf_dump_buf;
908
909#define MBUF_DUMP_BUF_SIZE 4096
910
911/*
 * The mbuf watchdog is enabled by default. It can also be toggled via the
913 * kern.ipc.mb_watchdog sysctl.
914 * Garbage collection is enabled by default on embedded platforms.
915 * mb_drain_maxint controls the amount of time to wait (in seconds) before
916 * consecutive calls to mbuf_drain().
917 */
918static unsigned int mb_watchdog = 1;
919#if !XNU_TARGET_OS_OSX
920static unsigned int mb_drain_maxint = 60;
921#else /* XNU_TARGET_OS_OSX */
922static unsigned int mb_drain_maxint = 0;
923#endif /* XNU_TARGET_OS_OSX */
924#endif /* CONFIG_MBUF_MCACHE */
925static unsigned int mb_memory_pressure_percentage = 80;
926
927uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
928uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
929
930/* Red zone */
931static u_int32_t mb_redzone_cookie;
932static void m_redzone_init(struct mbuf *);
933static void m_redzone_verify(struct mbuf *m);
934
935static void m_set_rfa(struct mbuf *, struct ext_ref *);
936
937#if CONFIG_MBUF_MCACHE
938/* The following are used to serialize m_clalloc() */
939static boolean_t mb_clalloc_busy;
940static void *mb_clalloc_waitchan = &mb_clalloc_busy;
941static int mb_clalloc_waiters;
942#endif /* CONFIG_MBUF_MCACHE */
943
944static void mbuf_mtypes_sync(boolean_t);
945static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
946static void mbuf_stat_sync(void);
947static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
948#if CONFIG_MBUF_MCACHE
949static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
950static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
951static char *mbuf_dump(void);
952#endif /* CONFIG_MBUF_MCACHE */
953static void mbuf_table_init(void);
954static inline void m_incref(struct mbuf *);
955static inline u_int16_t m_decref(struct mbuf *);
956static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t);
957#if CONFIG_MBUF_MCACHE
958static int m_clalloc(const u_int32_t, const int, const u_int32_t);
959static void mbuf_worker_thread_init(void);
960static mcache_obj_t *slab_alloc(mbuf_class_t, int);
961static void slab_free(mbuf_class_t, mcache_obj_t *);
962static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
963 unsigned int, int);
964static void mbuf_slab_free(void *, mcache_obj_t *, int);
965static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
966static void mbuf_slab_notify(void *, u_int32_t);
967static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
968 unsigned int);
969static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
970static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
971 unsigned int, int);
972static void mbuf_cslab_free(void *, mcache_obj_t *, int);
973static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
974static int freelist_populate(mbuf_class_t, unsigned int, int);
975static void freelist_init(mbuf_class_t);
976static boolean_t mbuf_cached_above(mbuf_class_t, int);
977static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
978static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
979static int m_howmany(int, size_t);
980static void mbuf_worker_thread(void);
981static void mbuf_watchdog(void);
982static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
983
984static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
985 size_t, unsigned int);
986static void mcl_audit_free(void *, unsigned int);
987static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
988static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
989static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
990 boolean_t);
991static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
992static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
993static void mcl_audit_scratch(mcache_audit_t *);
994static void mcl_audit_mcheck_panic(struct mbuf *);
995static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
996
997static void mleak_activate(void);
998static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
999static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
1000static void mleak_free(mcache_obj_t *);
1001static void mleak_sort_traces(void);
1002static void mleak_update_stats(void);
1003
1004static mcl_slab_t *slab_get(void *);
1005static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
1006 void *, void *, unsigned int, int, int);
1007static void slab_insert(mcl_slab_t *, mbuf_class_t);
1008static void slab_remove(mcl_slab_t *, mbuf_class_t);
1009static boolean_t slab_inrange(mcl_slab_t *, void *);
1010static void slab_nextptr_panic(mcl_slab_t *, void *);
1011static void slab_detach(mcl_slab_t *);
1012static boolean_t slab_is_detached(mcl_slab_t *);
1013#else /* !CONFIG_MBUF_MCACHE */
1014static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t);
1015static struct mbuf *mz_alloc(zalloc_flags_t);
1016static void mz_free(struct mbuf *);
1017static struct ext_ref *mz_ref_alloc(zalloc_flags_t);
1018static void mz_ref_free(struct ext_ref *);
1019static void *mz_cl_alloc(zone_id_t, zalloc_flags_t);
1020static void mz_cl_free(zone_id_t, void *);
1021static struct mbuf *mz_composite_alloc(mbuf_class_t, zalloc_flags_t);
1022static zstack_t mz_composite_alloc_n(mbuf_class_t, unsigned int, zalloc_flags_t);
1023static void mz_composite_free(mbuf_class_t, struct mbuf *);
1024static void mz_composite_free_n(mbuf_class_t, zstack_t);
1025static void *mz_composite_build(zone_id_t, zalloc_flags_t);
1026static void *mz_composite_mark_valid(zone_id_t, void *);
1027static void *mz_composite_mark_invalid(zone_id_t, void *);
1028static void mz_composite_destroy(zone_id_t, void *);
1029
1030ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref,
1031 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1032ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf,
1033 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1034ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster,
1035 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1036ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster,
1037 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1038ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster,
1039 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1040static_assert(sizeof(union mcluster) == MCLBYTES);
1041static_assert(sizeof(union mbigcluster) == MBIGCLBYTES);
1042static_assert(sizeof(union m16kcluster) == M16KCLBYTES);
1043
1044static const struct zone_cache_ops mz_composite_ops = {
1045 .zc_op_alloc = mz_composite_build,
1046 .zc_op_mark_valid = mz_composite_mark_valid,
1047 .zc_op_mark_invalid = mz_composite_mark_invalid,
1048 .zc_op_free = mz_composite_destroy,
1049};
1050ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_2K, "mbuf.composite.2k", struct mbuf,
1051 sizeof(struct mbuf) + sizeof(struct ext_ref) + MCLBYTES,
1052 &mz_composite_ops);
1053ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_4K, "mbuf.composite.4k", struct mbuf,
1054 sizeof(struct mbuf) + sizeof(struct ext_ref) + MBIGCLBYTES,
1055 &mz_composite_ops);
1056ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_16K, "mbuf.composite.16k", struct mbuf,
1057 sizeof(struct mbuf) + sizeof(struct ext_ref) + M16KCLBYTES,
1058 &mz_composite_ops);
1059static_assert(ZONE_ID_MBUF + MC_MBUF == ZONE_ID_MBUF);
1060static_assert(ZONE_ID_MBUF + MC_CL == ZONE_ID_CLUSTER_2K);
1061static_assert(ZONE_ID_MBUF + MC_BIGCL == ZONE_ID_CLUSTER_4K);
1062static_assert(ZONE_ID_MBUF + MC_16KCL == ZONE_ID_CLUSTER_16K);
1063static_assert(ZONE_ID_MBUF + MC_MBUF_CL == ZONE_ID_MBUF_CLUSTER_2K);
1064static_assert(ZONE_ID_MBUF + MC_MBUF_BIGCL == ZONE_ID_MBUF_CLUSTER_4K);
1065static_assert(ZONE_ID_MBUF + MC_MBUF_16KCL == ZONE_ID_MBUF_CLUSTER_16K);
1066
/* Converts an mbuf class to a zalloc zone ID. */
1068__attribute__((always_inline))
1069static inline zone_id_t
1070m_class_to_zid(mbuf_class_t class)
1071{
1072 return ZONE_ID_MBUF + class - MC_MBUF;
1073}
1074
1075__attribute__((always_inline))
1076static inline mbuf_class_t
1077m_class_from_zid(zone_id_t zid)
1078{
1079 return MC_MBUF + zid - ZONE_ID_MBUF;
1080}
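
/*
 * Illustrative sketch (kept under "#if 0", not compiled): the class <-> zone
 * ID mapping is a plain offset, so it round-trips, and the static_asserts
 * above pin down each pairing, e.g. MC_BIGCL <-> ZONE_ID_CLUSTER_4K.  The
 * function name is hypothetical.
 */
#if 0
static void
example_class_zid_roundtrip(void)
{
	zone_id_t zid = m_class_to_zid(MC_BIGCL);

	assert(zid == ZONE_ID_CLUSTER_4K);
	assert(m_class_from_zid(zid) == MC_BIGCL);
}
#endif /* 0 */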
1081
1082static thread_call_t mbuf_defunct_tcall;
1083static thread_call_t mbuf_drain_tcall;
1084#endif /* CONFIG_MBUF_MCACHE */
1085
1086static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
1087static struct mbuf *m_split0(struct mbuf *, int, int, int);
1088#if CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT)
1089#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
1090static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
1091static char *mbwdog_logging;
1092const unsigned mbwdog_logging_size = 4096;
1093static size_t mbwdog_logging_used;
1094#else
1095#define mbwdog_logger(fmt, ...) do { } while (0)
#endif /* CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT) */
1097#if CONFIG_MBUF_MCACHE
1098static void mbuf_drain_locked(boolean_t);
1099#endif /* CONFIG_MBUF_MCACHE */
1100
1101/* flags for m_copyback0 */
1102#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
1103#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
1104#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
1105#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
1106
1107/*
1108 * This flag is set for all mbufs that come out of and into the composite
1109 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
1110 * are marked with such a flag have clusters attached to them, and will be
1111 * treated differently when they are freed; instead of being placed back
1112 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
1113 * are placed back into the appropriate composite cache's freelist, and the
1114 * actual freeing is deferred until the composite objects are purged. At
1115 * such a time, this flag will be cleared from the mbufs and the objects
1116 * will be freed into their own separate freelists.
1117 */
1118#define EXTF_COMPOSITE 0x1
1119
1120/*
1121 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf. Once set, this flag is never
1123 * cleared.
1124 */
1125#define EXTF_READONLY 0x2
1126/*
1127 * This flag indicates that the external cluster is paired with the mbuf.
 * Pairing implies an external free routine is defined, which will be invoked
1129 * when the reference count drops to the minimum at m_free time. This
1130 * flag is never cleared.
1131 */
1132#define EXTF_PAIRED 0x4
1133
1134#define EXTF_MASK \
1135 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
1136
1137#define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
1138#define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
1139#define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
1140#define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
1141#define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
1142#define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
1143#define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
1144#define MBUF_IS_COMPOSITE(m) \
1145 (MEXT_REF(m) == MEXT_MINREF(m) && \
1146 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
1147/*
1148 * This macro can be used to test if the mbuf is paired to an external
1149 * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
1150 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
1151 * and thus survives calls to m_free_paired.
1152 */
1153#define MBUF_IS_PAIRED(m) \
1154 (((m)->m_flags & M_EXT) && \
1155 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
1156 MEXT_PMBUF(m) == (m))
1157
1158/*
1159 * Macros used to verify the integrity of the mbuf.
1160 */
1161#if CONFIG_MBUF_MCACHE
1162#define _MCHECK(m) { \
1163 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
1164 if (mclaudit == NULL) \
1165 panic("MCHECK: m_type=%d m=%p", \
1166 (u_int16_t)(m)->m_type, m); \
1167 else \
1168 mcl_audit_mcheck_panic(m); \
1169 } \
1170}
1171#else
1172#define _MCHECK(m) \
1173 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
1174 panic("MCHECK: m_type=%d m=%p", \
1175 (u_int16_t)(m)->m_type, m); \
1176 }
1177#endif /* CONFIG_MBUF_MCACHE */
1178
1179/*
1180 * Macro version of mtod.
1181 */
1182#define MTOD(m, t) ((t)((m)->m_data))
1183
1184#if CONFIG_MBUF_MCACHE
1185#define MBUF_IN_MAP(addr) \
1186 ((unsigned char *)(addr) >= mbutl && \
1187 (unsigned char *)(addr) < embutl)
1188
1189#define MRANGE(addr) { \
1190 if (!MBUF_IN_MAP(addr)) \
1191 panic("MRANGE: address out of range 0x%p", addr); \
1192}
1193
1194/*
1195 * Macros to obtain page index given a base cluster address
1196 */
1197#define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
1198#define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
1199
1200/*
1201 * Macro to find the mbuf index relative to a base.
1202 */
1203#define MBPAGEIDX(c, m) \
1204 (((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT)
1205
1206/*
1207 * Same thing for 2KB cluster index.
1208 */
1209#define CLPAGEIDX(c, m) \
1210 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
1211
1212/*
1213 * Macro to find 4KB cluster index relative to a base
1214 */
1215#define BCLPAGEIDX(c, m) \
1216 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
1217#endif /* CONFIG_MBUF_MCACHE */
1218
1219/*
1220 * Macros used during mbuf and cluster initialization.
1221 */
1222#define MBUF_INIT_PKTHDR(m) { \
1223 (m)->m_pkthdr.rcvif = NULL; \
1224 (m)->m_pkthdr.pkt_hdr = NULL; \
1225 (m)->m_pkthdr.len = 0; \
1226 (m)->m_pkthdr.csum_flags = 0; \
1227 (m)->m_pkthdr.csum_data = 0; \
1228 (m)->m_pkthdr.vlan_tag = 0; \
1229 (m)->m_pkthdr.comp_gencnt = 0; \
1230 (m)->m_pkthdr.pkt_crumbs = 0; \
1231 m_classifier_init(m, 0); \
1232 m_tag_init(m, 1); \
1233 m_scratch_init(m); \
1234 m_redzone_init(m); \
1235}
1236
1237#define MBUF_INIT(m, pkthdr, type) { \
1238 _MCHECK(m); \
1239 (m)->m_next = (m)->m_nextpkt = NULL; \
1240 (m)->m_len = 0; \
1241 (m)->m_type = type; \
1242 if ((pkthdr) == 0) { \
1243 (m)->m_data = (uintptr_t)(m)->m_dat; \
1244 (m)->m_flags = 0; \
1245 } else { \
1246 (m)->m_data = (uintptr_t)(m)->m_pktdat; \
1247 (m)->m_flags = M_PKTHDR; \
1248 MBUF_INIT_PKTHDR(m); \
1249 } \
1250}
1251
1252#define MEXT_INIT mext_init
1253
1254#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
1255 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
1256 ref, 0, flag, 0, NULL)
1257
1258#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
1259 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
1260 ref, 0, flag, 0, NULL)
1261
1262#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
1263 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
1264 ref, 0, flag, 0, NULL)
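
/*
 * Hypothetical usage sketch (kept under "#if 0", not compiled) showing how
 * the initialization macros above fit together: a raw mbuf, whose m_type is
 * expected to be MT_FREE (which _MCHECK() in MBUF_INIT() checks), is set
 * up as a packet-header mbuf and then has a 2KB cluster attached through its
 * ext_ref.  The function and parameter names are illustrative only.
 */
#if 0
static void
example_init_mbuf_with_cluster(struct mbuf *m, void *cl, struct ext_ref *rfa)
{
	/* lay out the mbuf as a packet-header mbuf of type MT_DATA */
	MBUF_INIT(m, 1, MT_DATA);
	/* attach the 2KB cluster with an initial reference count of 1 */
	MBUF_CL_INIT(m, cl, rfa, 1, 0);
}
#endif /* 0 */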
1265
1266/*
1267 * Macro to convert BSD malloc sleep flag to mcache's
1268 */
1269#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
1270
1271/*
1272 * The structure that holds all mbuf class statistics exportable via sysctl.
1273 * Similar to mbstat structure, the mb_stat structure is protected by the
1274 * global mbuf lock. It contains additional information about the classes
1275 * that allows for a more accurate view of the state of the allocator.
1276 */
1277struct mb_stat *mb_stat;
1278struct omb_stat *omb_stat; /* For backwards compatibility */
1279
1280#define MB_STAT_SIZE(n) \
1281 __builtin_offsetof(mb_stat_t, mbs_class[n])
1282#define OMB_STAT_SIZE(n) \
1283 __builtin_offsetof(struct omb_stat, mbs_class[n])
1284
1285/*
1286 * The legacy structure holding all of the mbuf allocation statistics.
1287 * The actual statistics used by the kernel are stored in the mbuf_table
1288 * instead, and are updated atomically while the global mbuf lock is held.
1289 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1290 * Unlike before, the kernel no longer relies on the contents of mbstat for
1291 * its operations (e.g. cluster expansion) because the structure is exposed
1292 * to outside and could possibly be modified, therefore making it unsafe.
1293 * With the exception of the mbstat.m_mtypes array (see below), all of the
1294 * statistics are updated as they change.
1295 */
1296struct mbstat mbstat;
1297
1298#define MBSTAT_MTYPES_MAX \
1299 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1300
1301/*
1302 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1303 * atomically and stored in a per-CPU structure which is lock-free; this is
1304 * done in order to avoid writing to the global mbstat data structure which
1305 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
1306 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1307 * array and returned to the application. Any updates for types greater or
1308 * equal than MT_MAX would be done atomically to the mbstat; this slows down
1309 * performance but is okay since the kernel uses only up to MT_MAX-1 while
1310 * anything beyond that (up to type 255) is considered a corner case.
1311 */
1312typedef struct {
1313 unsigned int cpu_mtypes[MT_MAX];
1314} mbuf_mtypes_t;
1315
1316static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
1317
1318#define mtype_stat_add(type, n) { \
1319 if ((unsigned)(type) < MT_MAX) { \
1320 mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
1321 os_atomic_add(&mbs->cpu_mtypes[type], n, relaxed); \
1322 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1323 os_atomic_add((int16_t *)&mbstat.m_mtypes[type], n, relaxed); \
1324 } \
1325}
1326
1327#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1328#define mtype_stat_inc(t) mtype_stat_add(t, 1)
1329#define mtype_stat_dec(t) mtype_stat_sub(t, 1)
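
/*
 * Illustrative sketch (kept under "#if 0", not compiled): a type change on an
 * in-use mbuf is accounted for by decrementing the old type's counter and
 * incrementing the new one's; the per-CPU counters are only folded back into
 * mbstat.m_mtypes when a sysctl reader asks for them (see mbuf_mtypes_sync()
 * below).  The helper name is hypothetical.
 */
#if 0
static void
example_account_type_change(struct mbuf *m, int new_type)
{
	mtype_stat_dec(m->m_type);
	mtype_stat_inc(new_type);
	m->m_type = new_type;
}
#endif /* 0 */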
1330
1331static inline void
1332mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size,
1333 m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa,
1334 u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag,
1335 u_int32_t priv, struct mbuf *pm)
1336{
1337 m->m_ext.ext_buf = buf;
1338 m->m_ext.ext_size = size;
1339 m->m_data = (uintptr_t)m->m_ext.ext_buf;
1340 m->m_len = 0;
1341 m->m_flags |= M_EXT;
1342 m_set_ext(m, rfa, free, free_arg);
1343 MEXT_MINREF(m) = min;
1344 MEXT_REF(m) = ref;
1345 MEXT_PREF(m) = pref;
1346 MEXT_FLAGS(m) = flag;
1347 MEXT_PRIV(m) = priv;
1348 MEXT_PMBUF(m) = pm;
1349}
1350
1351static void
1352mbuf_mtypes_sync(boolean_t locked)
1353{
1354 mbuf_mtypes_t mtc;
1355
1356 if (locked) {
1357 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1358 }
1359
1360 mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
1361 percpu_foreach_secondary(mtype, mbuf_mtypes) {
1362 for (int n = 0; n < MT_MAX; n++) {
1363 mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
1364 }
1365 }
1366
1367 if (!locked) {
		lck_mtx_lock(mbuf_mlock);
1369 }
1370 for (int n = 0; n < MT_MAX; n++) {
1371 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1372 }
1373 if (!locked) {
		lck_mtx_unlock(mbuf_mlock);
1375 }
1376}
1377
1378static int
1379mbstat_sysctl SYSCTL_HANDLER_ARGS
1380{
1381#pragma unused(oidp, arg1, arg2)
1382
1383#if CONFIG_MBUF_MCACHE
1384 mbuf_mtypes_sync(FALSE);
1385#else
	lck_mtx_lock(mbuf_mlock);
1387 mbuf_stat_sync();
1388 mbuf_mtypes_sync(TRUE);
	lck_mtx_unlock(mbuf_mlock);
1390#endif
1391
1392 return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
1393}
1394
1395static void
1396mbuf_stat_sync(void)
1397{
1398 mb_class_stat_t *sp;
1399#if CONFIG_MBUF_MCACHE
1400 mcache_cpu_t *ccp;
1401 mcache_t *cp;
1402 int k, m, bktsize;
1403#else
1404 int k;
1405 uint64_t drops = 0;
1406#endif /* CONFIG_MBUF_MCACHE */
1407
1408
1409 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1410
1411#if CONFIG_MBUF_MCACHE
1412 for (k = 0; k < NELEM(mbuf_table); k++) {
1413 cp = m_cache(k);
1414 ccp = &cp->mc_cpu[0];
1415 bktsize = ccp->cc_bktsize;
1416 sp = mbuf_table[k].mtbl_stats;
1417
1418 if (cp->mc_flags & MCF_NOCPUCACHE) {
1419 sp->mbcl_mc_state = MCS_DISABLED;
1420 } else if (cp->mc_purge_cnt > 0) {
1421 sp->mbcl_mc_state = MCS_PURGING;
1422 } else if (bktsize == 0) {
1423 sp->mbcl_mc_state = MCS_OFFLINE;
1424 } else {
1425 sp->mbcl_mc_state = MCS_ONLINE;
1426 }
1427
1428 sp->mbcl_mc_cached = 0;
1429 for (m = 0; m < ncpu; m++) {
1430 ccp = &cp->mc_cpu[m];
1431 if (ccp->cc_objs > 0) {
1432 sp->mbcl_mc_cached += ccp->cc_objs;
1433 }
1434 if (ccp->cc_pobjs > 0) {
1435 sp->mbcl_mc_cached += ccp->cc_pobjs;
1436 }
1437 }
1438 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1439 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1440 sp->mbcl_infree;
1441
1442 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1443 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1444 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1445
1446 /* Calculate total count specific to each class */
1447 sp->mbcl_ctotal = sp->mbcl_total;
1448 switch (m_class(k)) {
1449 case MC_MBUF:
1450 /* Deduct mbufs used in composite caches */
1451 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL) + m_total(MC_MBUF_16KCL));
1453 break;
1454
1455 case MC_CL:
1456 /* Deduct clusters used in composite cache */
1457 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1458 break;
1459
1460 case MC_BIGCL:
1461 /* Deduct clusters used in composite cache */
1462 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1463 break;
1464
1465 case MC_16KCL:
1466 /* Deduct clusters used in composite cache */
1467 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1468 break;
1469
1470 default:
1471 break;
1472 }
1473 }
1474#else
1475 for (k = 0; k < NELEM(mbuf_table); k++) {
1476 const zone_id_t zid = m_class_to_zid(m_class(k));
1477 const zone_t zone = zone_by_id(zid);
1478 struct zone_basic_stats stats = {};
1479
1480 sp = m_stats(k);
		zone_get_stats(zone, &stats);
1482 drops += stats.zbs_alloc_fail;
1483 sp->mbcl_total = stats.zbs_avail;
1484 sp->mbcl_active = stats.zbs_alloc;
1485 /*
1486 * infree is what mcache considers the freelist (uncached)
1487 * free_cnt contains all the cached/uncached elements
1488 * in a zone.
1489 */
1490 sp->mbcl_infree = stats.zbs_free - stats.zbs_cached;
1491 sp->mbcl_fail_cnt = stats.zbs_alloc_fail;
1492 sp->mbcl_ctotal = sp->mbcl_total;
1493
1494 /* These stats are not available in zalloc. */
1495 sp->mbcl_alloc_cnt = 0;
1496 sp->mbcl_free_cnt = 0;
1497 sp->mbcl_notified = 0;
1498 sp->mbcl_purge_cnt = 0;
1499 sp->mbcl_slab_cnt = 0;
1500 sp->mbcl_release_cnt = 0;
1501
1502 /* zalloc caches are always on. */
1503 sp->mbcl_mc_state = MCS_ONLINE;
1504 sp->mbcl_mc_cached = stats.zbs_cached;
1505 /* These stats are not collected by zalloc. */
1506 sp->mbcl_mc_waiter_cnt = 0;
1507 sp->mbcl_mc_wretry_cnt = 0;
1508 sp->mbcl_mc_nwretry_cnt = 0;
1509 }
1510 /* Deduct clusters used in composite cache */
1511 m_ctotal(MC_MBUF) -= (m_total(MC_MBUF_CL) +
	    m_total(MC_MBUF_BIGCL) +
1513 m_total(MC_MBUF_16KCL));
1514 m_ctotal(MC_CL) -= m_total(MC_MBUF_CL);
1515 m_ctotal(MC_BIGCL) -= m_total(MC_MBUF_BIGCL);
1516 m_ctotal(MC_16KCL) -= m_total(MC_MBUF_16KCL);
1517
1518 /* Update mbstat. */
1519 mbstat.m_mbufs = m_total(MC_MBUF);
1520 mbstat.m_clusters = m_total(MC_CL);
1521 mbstat.m_clfree = m_infree(MC_CL) + m_infree(MC_MBUF_CL);
1522 mbstat.m_drops = drops;
1523 mbstat.m_bigclusters = m_total(MC_BIGCL);
1524 mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL);
1525#endif /* CONFIG_MBUF_MCACHE */
1526}
1527
1528static int
1529mb_stat_sysctl SYSCTL_HANDLER_ARGS
1530{
1531#pragma unused(oidp, arg1, arg2)
1532 void *statp;
1533 int k, statsz, proc64 = proc_is64bit(req->p);
1534
1535 lck_mtx_lock(mbuf_mlock);
1536 mbuf_stat_sync();
1537
1538 if (!proc64) {
1539 struct omb_class_stat *oc;
1540 struct mb_class_stat *c;
1541
1542 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1543 oc = &omb_stat->mbs_class[0];
1544 c = &mb_stat->mbs_class[0];
1545 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1546 (void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
1547 "%s", c->mbcl_cname);
1548 oc->mbcl_size = c->mbcl_size;
1549 oc->mbcl_total = c->mbcl_total;
1550 oc->mbcl_active = c->mbcl_active;
1551 oc->mbcl_infree = c->mbcl_infree;
1552 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1553 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1554 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1555 oc->mbcl_notified = c->mbcl_notified;
1556 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1557 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1558 oc->mbcl_ctotal = c->mbcl_ctotal;
1559 oc->mbcl_release_cnt = c->mbcl_release_cnt;
1560 oc->mbcl_mc_state = c->mbcl_mc_state;
1561 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1562 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1563 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1564 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1565 }
1566 statp = omb_stat;
1567 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1568 } else {
1569 statp = mb_stat;
1570 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1571 }
1572
1573 lck_mtx_unlock(mbuf_mlock);
1574
1575 return SYSCTL_OUT(req, statp, statsz);
1576}
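/*
 * For illustration only: a 64-bit userland consumer of this handler would
 * look roughly like the sketch below.  The "kern.ipc.mb_stat" OID name is
 * an assumption of this example (the registration is not shown here); the
 * handler itself only chooses between the omb_stat and mb_stat layouts
 * based on the caller's wordsize.
 *
 *	size_t len = 0;
 *	if (sysctlbyname("kern.ipc.mb_stat", NULL, &len, NULL, 0) == 0) {
 *		mb_stat_t *stat = malloc(len);
 *		if (stat != NULL &&
 *		    sysctlbyname("kern.ipc.mb_stat", stat, &len, NULL, 0) == 0) {
 *			// stat->mbs_class[0..mbs_cnt-1] holds per-class stats
 *		}
 *		free(stat);
 *	}
 */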
1577
1578#if !CONFIG_MBUF_MCACHE
1579/*
1580 * The following functions are wrappers around mbuf
1581 * allocation for zalloc. They all have the prefix "mz"
1582 * which was chosen to avoid conflicts with the mbuf KPIs.
1583 *
1584 * Z_NOPAGEWAIT is used in place of Z_NOWAIT because
1585 * Z_NOPAGEWAIT maps closer to MCR_TRYHARD. Z_NOWAIT will
1586 * fail immediately if it has to take a mutex and that
1587 * may cause packets to be dropped more frequently.
1588 * In general, the mbuf subsystem can sustain grabbing a mutex
1589 * during "non-blocking" allocation and that's the reason
1590 * why Z_NOPAGEWAIT was chosen.
1591 *
1592 * mbufs are elided (all pointers removed) before they are
1593 * returned to the cache.  The exception is composite mbufs, which
1594 * are re-initialized on allocation.
1595 */
1596__attribute__((always_inline))
1597static inline void
1598m_elide(struct mbuf *m)
1599{
1600 m->m_next = m->m_nextpkt = NULL;
1601 m->m_data = 0;
1602 memset(&m->m_ext, 0, sizeof(m->m_ext));
1603 m->m_pkthdr.rcvif = NULL;
1604 m->m_pkthdr.pkt_hdr = NULL;
1605 m->m_flags |= M_PKTHDR;
1606 m_tag_init(m, 1);
1607 m->m_pkthdr.pkt_flags = 0;
1608 m_scratch_init(m);
1609 m->m_pkthdr.redzone = 0;
1610 m->m_flags &= ~M_PKTHDR;
1611}
1612
1613__attribute__((always_inline))
1614static inline struct mbuf *
1615mz_alloc(zalloc_flags_t flags)
1616{
1617 if (flags & Z_NOWAIT) {
1618 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1619 } else if (!(flags & Z_NOPAGEWAIT)) {
1620 flags |= Z_NOFAIL;
1621 }
1622 return zalloc_id(ZONE_ID_MBUF, flags | Z_NOZZC);
1623}
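/*
 * Minimal usage sketch for the wrapper above (illustrative only, not an
 * actual call site in this file): a non-blocking caller passes Z_NOWAIT,
 * which mz_alloc() translates into Z_NOPAGEWAIT, while a blocking caller
 * passes Z_WAITOK and implicitly gets Z_NOFAIL.
 *
 *	struct mbuf *m_noblock = mz_alloc(Z_NOWAIT);   // may return NULL
 *	struct mbuf *m_block   = mz_alloc(Z_WAITOK);   // does not fail
 *	if (m_noblock != NULL) {
 *		mz_free(m_noblock);
 *	}
 *	mz_free(m_block);
 */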
1624
1625__attribute__((always_inline))
1626static inline zstack_t
1627mz_alloc_n(uint32_t count, zalloc_flags_t flags)
1628{
1629 if (flags & Z_NOWAIT) {
1630 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1631 } else if (!(flags & Z_NOPAGEWAIT)) {
1632 flags |= Z_NOFAIL;
1633 }
1634 return zalloc_n(ZONE_ID_MBUF, count, flags | Z_NOZZC);
1635}
1636
1637__attribute__((always_inline))
1638static inline void
1639mz_free(struct mbuf *m)
1640{
1641#if KASAN
1642 zone_require(zone_by_id(ZONE_ID_MBUF), m);
1643#endif
1644 m_elide(m);
1645 zfree_nozero(ZONE_ID_MBUF, m);
1646}
1647
1648__attribute__((always_inline))
1649static inline void
1650mz_free_n(zstack_t list)
1651{
1652 /* Callers of this function have already elided the mbuf. */
1653 zfree_nozero_n(ZONE_ID_MBUF, list);
1654}
1655
1656__attribute__((always_inline))
1657static inline struct ext_ref *
1658mz_ref_alloc(zalloc_flags_t flags)
1659{
1660 if (flags & Z_NOWAIT) {
1661 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1662 }
1663 return zalloc_id(ZONE_ID_MBUF_REF, flags | Z_NOZZC);
1664}
1665
1666__attribute__((always_inline))
1667static inline void
1668mz_ref_free(struct ext_ref *rfa)
1669{
1670 VERIFY(rfa->minref == rfa->refcnt);
1671#if KASAN
1672 zone_require(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1673#endif
1674 zfree_nozero(ZONE_ID_MBUF_REF, rfa);
1675}
1676
1677__attribute__((always_inline))
1678static inline void *
1679mz_cl_alloc(zone_id_t zid, zalloc_flags_t flags)
1680{
1681 if (flags & Z_NOWAIT) {
1682 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1683 } else if (!(flags & Z_NOPAGEWAIT)) {
1684 flags |= Z_NOFAIL;
1685 }
1686 return (zalloc_id)(zid, flags | Z_NOZZC);
1687}
1688
1689__attribute__((always_inline))
1690static inline void
1691mz_cl_free(zone_id_t zid, void *cl)
1692{
1693#if KASAN
1694 zone_require(zone_by_id(zid), cl);
1695#endif
1696 zfree_nozero(zid, cl);
1697}
1698
1699__attribute__((always_inline))
1700static inline zstack_t
1701mz_composite_alloc_n(mbuf_class_t class, unsigned int n, zalloc_flags_t flags)
1702{
1703 if (flags & Z_NOWAIT) {
1704 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1705 }
1706 return (zcache_alloc_n)(m_class_to_zid(class), n, flags,
1707 &mz_composite_ops);
1708}
1709
1710__attribute__((always_inline))
1711static inline struct mbuf *
1712mz_composite_alloc(mbuf_class_t class, zalloc_flags_t flags)
1713{
1714 zstack_t list = {};
1715 list = mz_composite_alloc_n(class, 1, flags);
1716 if (!zstack_empty(list)) {
1717 return zstack_pop(&list);
1718 } else {
1719 return NULL;
1720 }
1721}
1722
1723__attribute__((always_inline))
1724static inline void
1725mz_composite_free_n(mbuf_class_t class, zstack_t list)
1726{
1727 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
1728}
1729
1730__attribute__((always_inline))
1731static inline void
1732mz_composite_free(mbuf_class_t class, struct mbuf *m)
1733{
1734 zstack_t list = {};
1735 zstack_push(&list, m);
1736 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
1737}
1738
1739/* Converts composite zone ID to the cluster zone ID. */
1740__attribute__((always_inline))
1741static inline zone_id_t
1742mz_cl_zid(zone_id_t zid)
1743{
1744 return ZONE_ID_CLUSTER_2K + zid - ZONE_ID_MBUF_CLUSTER_2K;
1745}
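/*
 * Worked example for mz_cl_zid(), assuming the composite and plain cluster
 * zone IDs are laid out contiguously and in the same order (which is what
 * the arithmetic above relies on); the 4K name below is inferred by analogy
 * and only meant as an illustration:
 *
 *	mz_cl_zid(ZONE_ID_MBUF_CLUSTER_2K) == ZONE_ID_CLUSTER_2K
 *	mz_cl_zid(ZONE_ID_MBUF_CLUSTER_4K) == ZONE_ID_CLUSTER_4K
 *
 * and likewise for the 16K variant.
 */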
1746
1747static void *
1748mz_composite_build(zone_id_t zid, zalloc_flags_t flags)
1749{
1750 const zone_id_t cl_zid = mz_cl_zid(zid);
1751 struct mbuf *m = NULL;
1752 struct ext_ref *rfa = NULL;
1753 void *cl = NULL;
1754
1755 cl = mz_cl_alloc(cl_zid, flags);
1756 if (__improbable(cl == NULL)) {
1757 goto out;
1758 }
1759 rfa = mz_ref_alloc(flags);
1760 if (__improbable(rfa == NULL)) {
1761 goto out_free_cl;
1762 }
1763 m = mz_alloc(flags);
1764 if (__improbable(m == NULL)) {
1765 goto out_free_rfa;
1766 }
1767 MBUF_INIT(m, 0, MT_FREE);
1768 if (zid == ZONE_ID_MBUF_CLUSTER_2K) {
1769 MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
1770 } else if (zid == ZONE_ID_MBUF_CLUSTER_4K) {
1771 MBUF_BIGCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
1772 } else {
1773 MBUF_16KCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
1774 }
1775 VERIFY(m->m_flags == M_EXT);
1776 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
1777
1778 return m;
1779out_free_rfa:
1780 mz_ref_free(rfa);
1781out_free_cl:
1782 mz_cl_free(cl_zid, cl);
1783out:
1784 return NULL;
1785}
1786
1787static void *
1788mz_composite_mark_valid(zone_id_t zid, void *p)
1789{
1790 struct mbuf *m = p;
1791
1792 m = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF), m);
1793#if KASAN
1794 struct ext_ref *rfa = m_get_rfa(m);
1795 const zone_id_t cl_zid = mz_cl_zid(zid);
1796 void *cl = m->m_ext.ext_buf;
1797
1798 cl = zcache_mark_valid(zone_by_id(cl_zid), cl);
1799 rfa = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1800 m->m_data = (uintptr_t)cl;
1801 m->m_ext.ext_buf = cl;
1802 m_set_rfa(m, rfa);
1803#else
1804#pragma unused(zid)
1805#endif
1806 VERIFY(MBUF_IS_COMPOSITE(m));
1807
1808 return m;
1809}
1810
1811static void *
1812mz_composite_mark_invalid(zone_id_t zid, void *p)
1813{
1814 struct mbuf *m = p;
1815
1816 VERIFY(MBUF_IS_COMPOSITE(m));
1817 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
1818#if KASAN
1819 struct ext_ref *rfa = m_get_rfa(m);
1820 const zone_id_t cl_zid = mz_cl_zid(zid);
1821 void *cl = m->m_ext.ext_buf;
1822
1823 cl = zcache_mark_invalid(zone_by_id(cl_zid), cl);
1824 rfa = zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1825 m->m_data = (uintptr_t)cl;
1826 m->m_ext.ext_buf = cl;
1827 m_set_rfa(m, rfa);
1828#else
1829#pragma unused(zid)
1830#endif
1831
1832 return zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF), m);
1833}
1834
1835static void
1836mz_composite_destroy(zone_id_t zid, void *p)
1837{
1838 const zone_id_t cl_zid = mz_cl_zid(zid);
1839 struct ext_ref *rfa = NULL;
1840 struct mbuf *m = p;
1841
1842 VERIFY(MBUF_IS_COMPOSITE(m));
1843
1844 MEXT_MINREF(m) = 0;
1845 MEXT_REF(m) = 0;
1846 MEXT_PREF(m) = 0;
1847 MEXT_FLAGS(m) = 0;
1848 MEXT_PRIV(m) = 0;
1849 MEXT_PMBUF(m) = NULL;
1850 MEXT_TOKEN(m) = 0;
1851
1852 rfa = m_get_rfa(m);
1853 m_set_ext(m, NULL, NULL, NULL);
1854
1855 m->m_type = MT_FREE;
1856 m->m_flags = m->m_len = 0;
1857 m->m_next = m->m_nextpkt = NULL;
1858
1859 mz_cl_free(cl_zid, m->m_ext.ext_buf);
1860 m->m_ext.ext_buf = NULL;
1861 mz_ref_free(rfa);
1862 mz_free(m);
1863}
1864#endif /* !CONFIG_MBUF_MCACHE */
1865
1866#if CONFIG_MBUF_MCACHE
1867static int
1868mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1869{
1870#pragma unused(oidp, arg1, arg2)
1871 int i;
1872
1873 /* Ensure leak tracing turned on */
1874 if (!mclfindleak || !mclexpleak) {
1875 return ENXIO;
1876 }
1877
1878 lck_mtx_lock(mleak_lock);
1879 mleak_update_stats();
1880 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1881 lck_mtx_unlock(mleak_lock);
1882
1883 return i;
1884}
1885
1886static int
1887mleak_table_sysctl SYSCTL_HANDLER_ARGS
1888{
1889#pragma unused(oidp, arg1, arg2)
1890 int i = 0;
1891
1892 /* Ensure leak tracing turned on */
1893 if (!mclfindleak || !mclexpleak) {
1894 return ENXIO;
1895 }
1896
1897 lck_mtx_lock(mleak_lock);
1898 i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
1899 lck_mtx_unlock(mleak_lock);
1900
1901 return i;
1902}
1903#endif /* CONFIG_MBUF_MCACHE */
1904
1905static inline void
1906m_incref(struct mbuf *m)
1907{
1908 uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed);
1909
1910 VERIFY(new != 0);
1911 /*
1912 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1913 * we don't clear the flag when the refcount goes back to the
1914 * minimum, to simplify code calling m_mclhasreference().
1915 */
1916 if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
1917 os_atomic_or(&MEXT_FLAGS(m), EXTF_READONLY, relaxed);
1918 }
1919}
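/*
 * Example of the sticky EXTF_READONLY behavior described above, for an
 * ordinary cluster whose minimum refcount is 0: the first m_incref() that
 * shares it raises MEXT_REF from 1 to 2, which exceeds MEXT_MINREF + 1, so
 * EXTF_READONLY is set.  A later m_decref() back to 1 leaves the flag set,
 * so code that consults m_mclhasreference() keeps treating the cluster as
 * potentially shared.
 */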
1920
1921static inline uint16_t
1922m_decref(struct mbuf *m)
1923{
1924 VERIFY(MEXT_REF(m) != 0);
1925
1926 return os_atomic_dec(&MEXT_REF(m), acq_rel);
1927}
1928
1929static void
1930mbuf_table_init(void)
1931{
1932 unsigned int b, c, s;
1933 int m, config_mbuf_jumbo = 0;
1934
1935 omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
1936 ZALIGN(struct omb_stat));
1937
1938 mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
1939 ZALIGN(mb_stat_t));
1940
1941 mb_stat->mbs_cnt = NELEM(mbuf_table);
1942 for (m = 0; m < NELEM(mbuf_table); m++) {
1943 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1944 }
1945
1946#if CONFIG_MBUF_JUMBO
1947 config_mbuf_jumbo = 1;
1948#endif /* CONFIG_MBUF_JUMBO */
1949
1950 if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1951 /*
1952 * Set aside 1/3 of the mbuf cluster map for jumbo
1953 * clusters; we do this only on platforms where jumbo
1954 * cluster pool is enabled.
1955 */
1956 njcl = nmbclusters / 3;
1957 njclbytes = M16KCLBYTES;
1958 }
1959
1960 /*
1961 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1962 * a multiple of 4KB clusters.
1963 */
1964 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1965 if (njcl > 0) {
1966 /*
1967 * Each jumbo cluster takes 8 2KB clusters, so make
1968 * sure that the pool size is evenly divisible by 8;
1969 * njcl is in 2KB unit, hence treated as such.
1970 */
1971 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1972
1973 /* Update nclusters with rounded down value of njcl */
1974 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1975 }
1976
1977 /*
1978 * njcl is valid only on platforms with 16KB jumbo clusters or
1979 * with 16KB pages, where it is configured to 1/3 of the pool
1980 * size. On these platforms, the remaining is used for 2KB
1981 * and 4KB clusters. On platforms without 16KB jumbo clusters,
1982 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
1983 * cluster can either be split into 16 mbufs, or into 2 2KB
1984 * clusters.
1985 *
1986 * +---+---+------------ ... -----------+------- ... -------+
1987 * | c | b | s | njcl |
1988 * +---+---+------------ ... -----------+------- ... -------+
1989 *
1990 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1991 * clusters (1/64th each.)
1992 */
1993 c = P2ROUNDDOWN((nclusters >> 6), NCLPG); /* in 2KB unit */
1994 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1995 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
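	/*
	 * Worked example of the split above (illustrative numbers only),
	 * assuming a 4KB page size (NCLPG == 2, NBCLPG == 1) with
	 * nmbclusters == 32768 (a 64MB pool) and no jumbo pool (njcl == 0):
	 *
	 *	nclusters = 32768
	 *	c = P2ROUNDDOWN(32768 >> 6, 2) = 512    (1/64th, 2KB units)
	 *	b = P2ROUNDDOWN(32768 >> 7, 1) = 256    (1/64th, 4KB units)
	 *	s = 32768 - (512 + (256 << 1)) = 31744  (31/32, 2KB units)
	 */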
1996
1997 /*
1998 * 1/64th (c) is reserved for 2KB clusters.
1999 */
2000 m_minlimit(MC_CL) = c;
2001 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
2002 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
2003 snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
2004
2005 /*
2006 * Another 1/64th (b) of the map is reserved for 4KB clusters.
2007 * It cannot be turned into 2KB clusters or mbufs.
2008 */
2009 m_minlimit(MC_BIGCL) = b;
2010 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
2011 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
2012 snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
2013
2014 /*
2015 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
2016 */
2017 m_minlimit(MC_MBUF) = 0;
2018 m_maxlimit(MC_MBUF) = s * NMBPCL; /* in mbuf unit */
2019 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE;
2020 snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
2021
2022 /*
2023 * Set limits for the composite classes.
2024 */
2025 m_minlimit(MC_MBUF_CL) = 0;
2026 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
2027 m_maxsize(MC_MBUF_CL) = MCLBYTES;
2028 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
2029 snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
2030
2031 m_minlimit(MC_MBUF_BIGCL) = 0;
2032 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
2033 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
2034 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
2035 snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
2036
2037 /*
2038 * And for jumbo classes.
2039 */
2040 m_minlimit(MC_16KCL) = 0;
2041 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
2042 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
2043 snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
2044
2045 m_minlimit(MC_MBUF_16KCL) = 0;
2046 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
2047 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
2048 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
2049 snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
2050
2051 /*
2052 * Initialize the legacy mbstat structure.
2053 */
2054 bzero(&mbstat, sizeof(mbstat));
2055 mbstat.m_msize = m_maxsize(MC_MBUF);
2056 mbstat.m_mclbytes = m_maxsize(MC_CL);
2057 mbstat.m_minclsize = MINCLSIZE;
2058 mbstat.m_mlen = MLEN;
2059 mbstat.m_mhlen = MHLEN;
2060 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
2061}
2062
2063static int
2064mbuf_get_class(struct mbuf *m)
2065{
2066 if (m->m_flags & M_EXT) {
2067 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
2068 m_ext_free_func_t m_free_func = m_get_ext_free(m);
2069
2070 if (m_free_func == NULL) {
2071 if (composite) {
2072 return MC_MBUF_CL;
2073 } else {
2074 return MC_CL;
2075 }
2076 } else if (m_free_func == m_bigfree) {
2077 if (composite) {
2078 return MC_MBUF_BIGCL;
2079 } else {
2080 return MC_BIGCL;
2081 }
2082 } else if (m_free_func == m_16kfree) {
2083 if (composite) {
2084 return MC_MBUF_16KCL;
2085 } else {
2086 return MC_16KCL;
2087 }
2088 }
2089 }
2090
2091 return MC_MBUF;
2092}
2093
2094bool
2095mbuf_class_under_pressure(struct mbuf *m)
2096{
2097 int mclass = mbuf_get_class(m);
2098
2099#if CONFIG_MBUF_MCACHE
2100 if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
2101 /*
2102 * The above computation does not include the per-CPU cached objects.
2103 * As a fast-path check this is good-enough. But now we do
2104 * the "slower" count of the cached objects to know exactly the
2105 * number of active mbufs in use.
2106 *
2107 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
2108 * might be slightly off but we don't try to be 100% accurate.
2109 * At worst, we drop a packet that we shouldn't have dropped or
2110 * we might go slightly above our memory-pressure threshold.
2111 */
2112 mcache_t *cp = m_cache(mclass);
2113 mcache_cpu_t *ccp = &cp->mc_cpu[0];
2114
2115 int bktsize = os_access_once(ccp->cc_bktsize);
2116 uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
2117 uint32_t cached = 0;
2118 int i;
2119
2120 for (i = 0; i < ncpu; i++) {
2121 ccp = &cp->mc_cpu[i];
2122
2123 int cc_objs = os_access_once(ccp->cc_objs);
2124 if (cc_objs > 0) {
2125 cached += cc_objs;
2126 }
2127
2128 int cc_pobjs = os_access_once(ccp->cc_pobjs);
2129 if (cc_pobjs > 0) {
2130 cached += cc_pobjs;
2131 }
2132 }
2133 cached += (bl_total * bktsize);
2134 if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
2135 os_log(OS_LOG_DEFAULT,
2136 "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
2137 __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
2138 return true;
2139 }
2140 }
2141#else
2142 /*
2143 * Grab the statistics from zalloc.
2144 * We can't call mbuf_stat_sync() since that requires a lock.
2145 */
2146 const zone_id_t zid = m_class_to_zid(m_class(mclass));
2147 const zone_t zone = zone_by_id(zid);
2148 struct zone_basic_stats stats = {};
2149
2150 zone_get_stats(zone, &stats);
2151 if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
2152 os_log(OS_LOG_DEFAULT,
2153 "%s memory-pressure on mbuf due to class %u, total %llu free %llu max %u",
2154 __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass));
2155 return true;
2156 }
2157#endif /* CONFIG_MBUF_MCACHE */
2158
2159 return false;
2160}
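/*
 * Illustration of the threshold check above, with made-up numbers: if
 * m_maxlimit(mclass) is 65536 and mb_memory_pressure_percentage is 80,
 * the class is reported as under pressure once roughly
 * 65536 * 80 / 100 = 52428 objects are in use (total minus free, and
 * minus the per-CPU cached objects in the mcache case).
 */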
2161
2162#if defined(__LP64__)
2163typedef struct ncl_tbl {
2164 uint64_t nt_maxmem; /* memory (sane) size */
2165 uint32_t nt_mbpool; /* mbuf pool size */
2166} ncl_tbl_t;
2167
2168static const ncl_tbl_t ncl_table[] = {
2169 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
2170 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (96 << MBSHIFT) /* 96 MB */ },
2171 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
2172 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
2173 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
2174 { 0, 0 }
2175};
2176#endif /* __LP64__ */
2177
2178__private_extern__ unsigned int
2179mbuf_default_ncl(uint64_t mem)
2180{
2181#if !defined(__LP64__)
2182 unsigned int n;
2183 /*
2184 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
2185 */
2186 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
2187 n = 32768;
2188 }
2189#else
2190 unsigned int n, i;
2191 /*
2192 * 64-bit kernel (mbuf pool size based on table).
2193 */
2194 n = ncl_table[0].nt_mbpool;
2195 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
2196 if (mem < ncl_table[i].nt_maxmem) {
2197 break;
2198 }
2199 n = ncl_table[i].nt_mbpool;
2200 }
2201 n >>= MCLSHIFT;
2202#endif /* !__LP64__ */
2203 return n;
2204}
2205
2206__private_extern__ void
2207mbinit(void)
2208{
2209 unsigned int m;
2210#if CONFIG_MBUF_MCACHE
2211 unsigned int initmcl = 0;
2212 thread_t thread = THREAD_NULL;
2213#endif /* CONFIG_MBUF_MCACHE */
2214
2215#if CONFIG_MBUF_MCACHE
2216 microuptime(&mb_start);
2217#endif /* CONFIG_MBUF_MCACHE */
2218
2219 /*
2220 * These MBUF_ values must be equal to their private counterparts.
2221 */
2222 _CASSERT(MBUF_EXT == M_EXT);
2223 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
2224 _CASSERT(MBUF_EOR == M_EOR);
2225 _CASSERT(MBUF_LOOP == M_LOOP);
2226 _CASSERT(MBUF_BCAST == M_BCAST);
2227 _CASSERT(MBUF_MCAST == M_MCAST);
2228 _CASSERT(MBUF_FRAG == M_FRAG);
2229 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
2230 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
2231 _CASSERT(MBUF_PROMISC == M_PROMISC);
2232 _CASSERT(MBUF_HASFCS == M_HASFCS);
2233
2234 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
2235 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
2236 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
2237 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
2238 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
2239 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
2240 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
2241 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
2242 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
2243 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
2244 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
2245 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
2246 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
2247 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
2248 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
2249
2250 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
2251 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
2252 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
2253 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
2254 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
2255 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
2256 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
2257 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
2258 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
2259 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
2260 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
2261 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
2262 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
2263 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
2264
2265 _CASSERT(MBUF_WAITOK == M_WAIT);
2266 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
2267 _CASSERT(MBUF_COPYALL == M_COPYALL);
2268
2269 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
2270 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
2271 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
2272 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
2273 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
2274 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
2275 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
2276 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
2277 _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
2278 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
2279 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
2280
2281 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
2282 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
2283 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
2284 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
2285
2286 /* Module specific scratch space (32-bit alignment requirement) */
2287 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
2288 sizeof(uint32_t)));
2289
2290 /* pktdata needs to start at 128-bit offset! */
2291 _CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);
2292
2293 /* Initialize random red zone cookie value */
2294 _CASSERT(sizeof(mb_redzone_cookie) ==
2295 sizeof(((struct pkthdr *)0)->redzone));
2296 read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
2297 read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
2298 read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
2299 mb_obscure_extref |= 0x3;
2300 mb_obscure_extref = 0;
2301 mb_obscure_extfree |= 0x3;
2302
2303#if CONFIG_MBUF_MCACHE
2304 /* Make sure we don't save more than we should */
2305 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
2306#endif /* CONFIG_MBUF_MCACHE */
2307
2308 if (nmbclusters == 0) {
2309 nmbclusters = NMBCLUSTERS;
2310 }
2311
2312 /* This should be a sane (at least even) value by now */
2313 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
2314
2315 /* Setup the mbuf table */
2316 mbuf_table_init();
2317
2318 _CASSERT(sizeof(struct mbuf) == _MSIZE);
2319
2320#if CONFIG_MBUF_MCACHE
2321 /*
2322 * Allocate cluster slabs table:
2323 *
2324 * maxslabgrp = (N * 2048) / (1024 * 1024)
2325 *
2326 * Where N is nmbclusters rounded up to the nearest 512. This yields
2327 * mcl_slab_g_t units, each one representing a MB of memory.
2328 */
2329 maxslabgrp =
2330 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
2331 slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
2332 ZALIGN(mcl_slabg_t));
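	/*
	 * Worked example of the formula above (illustrative only): with
	 * nmbclusters = 32768, N is already a multiple of 512, so
	 * maxslabgrp = (32768 * 2048) / (1024 * 1024) = 64, i.e. 64
	 * mcl_slabg_t units covering 64MB of cluster memory.  With 4KB
	 * pages, the audit case below would then size maxclaudit at
	 * (64 * 1024 * 1024) / 4096 = 16384 entries.
	 */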
2333
2334 /*
2335 * Allocate audit structures, if needed:
2336 *
2337 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
2338 *
2339 * This yields mcl_audit_t units, each one representing a page.
2340 */
2341 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
2342 mbuf_debug |= mcache_getflags();
2343 if (mbuf_debug & MCF_DEBUG) {
2344 int l;
2345 mcl_audit_t *mclad;
2346 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
2347 mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
2348 ZALIGN(mcl_audit_t));
2349 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
2350 mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
2351 ZALIGN_PTR);
2352 }
2353
2354 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
2355 AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
2356 VERIFY(mcl_audit_con_cache != NULL);
2357 }
2358 mclverify = (mbuf_debug & MCF_VERIFY);
2359 mcltrace = (mbuf_debug & MCF_TRACE);
2360 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
2361 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
2362
2363 /* Enable mbuf leak logging, with a lock to protect the tables */
2364
2365 mleak_activate();
2366
2367 /*
2368 * Allocate structure for per-CPU statistics that's aligned
2369 * on the CPU cache boundary; this code assumes that we never
2370 * uninitialize this framework, since the original address
2371 * before alignment is not saved.
2372 */
2373 ncpu = ml_wait_max_cpus();
2374
2375 /* Calculate the number of pages assigned to the cluster pool */
2376 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
2377 mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
2378 ZALIGN(ppnum_t));
2379
2380 /* Register with the I/O Bus mapper */
2381 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
2382
2383 embutl = (mbutl + (nmbclusters * MCLBYTES));
2384 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
2385
2386 /* Prime up the freelist */
2387 PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
2388 if (initmcl != 0) {
2389 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
2390 if (initmcl > m_maxlimit(MC_BIGCL)) {
2391 initmcl = m_maxlimit(MC_BIGCL);
2392 }
2393 }
2394 if (initmcl < m_minlimit(MC_BIGCL)) {
2395 initmcl = m_minlimit(MC_BIGCL);
2396 }
2397
2398 lck_mtx_lock(mbuf_mlock);
2399
2400 /*
2401 * For classes with non-zero minimum limits, populate their freelists
2402 * so that m_total(class) is at least m_minlimit(class).
2403 */
2404 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
2405 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
2406 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2407 freelist_init(m_class(MC_CL));
2408#else
2409 /*
2410 * We have yet to create the non-composite zones
2411 * and thus we haven't asked zalloc to allocate
2412 * anything yet, which means that at this point
2413 * m_total() is zero. Once we create the zones and
2414 * raise the reserve, m_total() will be calculated,
2415 * but until then just assume that we will have
2416 * at least the minimum limit allocated.
2417 */
2418 m_total(MC_BIGCL) = m_minlimit(MC_BIGCL);
2419 m_total(MC_CL) = m_minlimit(MC_CL);
2420#endif /* CONFIG_MBUF_MCACHE */
2421
2422 for (m = 0; m < NELEM(mbuf_table); m++) {
2423 /* Make sure we didn't miss any */
2424 VERIFY(m_minlimit(m_class(m)) == 0 ||
2425 m_total(m_class(m)) >= m_minlimit(m_class(m)));
2426 }
2427
2428#if CONFIG_MBUF_MCACHE
2429 lck_mtx_unlock(mbuf_mlock);
2430
2431 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
2432 NULL, &thread);
2433 thread_deallocate(thread);
2434
2435 ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
2436 0, 0, MCR_SLEEP);
2437#endif /* CONFIG_MBUF_MCACHE */
2438
2439 /* Create the cache for each class */
2440 for (m = 0; m < NELEM(mbuf_table); m++) {
2441#if CONFIG_MBUF_MCACHE
2442 void *allocfunc, *freefunc, *auditfunc, *logfunc;
2443 u_int32_t flags;
2444
2445 flags = mbuf_debug;
2446 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
2447 m_class(m) == MC_MBUF_16KCL) {
2448 allocfunc = mbuf_cslab_alloc;
2449 freefunc = mbuf_cslab_free;
2450 auditfunc = mbuf_cslab_audit;
2451 logfunc = mleak_logger;
2452 } else {
2453 allocfunc = mbuf_slab_alloc;
2454 freefunc = mbuf_slab_free;
2455 auditfunc = mbuf_slab_audit;
2456 logfunc = mleak_logger;
2457 }
2458
2459 /*
2460 * Disable per-CPU caches for jumbo classes if there
2461 * is no jumbo cluster pool available in the system.
2462 * The cache itself is still created (but will never
2463 * be populated) since it simplifies the code.
2464 */
2465 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
2466 njcl == 0) {
2467 flags |= MCF_NOCPUCACHE;
2468 }
2469
2470 if (!mclfindleak) {
2471 flags |= MCF_NOLEAKLOG;
2472 }
2473
2474 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
2475 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
2476 (void *)(uintptr_t)m, flags, MCR_SLEEP);
2477#else
2478 if (!MBUF_CLASS_COMPOSITE(m)) {
2479 zone_t zone = zone_by_id(m_class_to_zid(m));
2480
2481 zone_set_exhaustible(zone, m_maxlimit(m), false);
2482 zone_raise_reserve(zone, m_minlimit(m));
2483 /*
2484 * Pretend that we have allocated m_total() items
2485 * at this point. zalloc will eventually do that
2486 * but it's an async operation.
2487 */
2488 m_total(m) = m_minlimit(m);
2489 }
2490#endif /* CONFIG_MBUF_MCACHE */
2491 }
2492
2493 /*
2494 * Set the max limit on sb_max to be 1/16 th of the size of
2495 * memory allocated for mbuf clusters.
2496 */
2497 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
2498 if (high_sb_max < sb_max) {
2499 /* sb_max is too large for this configuration, scale it down */
2500 if (high_sb_max > (1 << MBSHIFT)) {
2501 /* We have at least 16 M of mbuf pool */
2502 sb_max = high_sb_max;
2503 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
2504 /*
2505 * If we have more than 1M of mbufpool, cap the size of
2506 * max sock buf at 1M
2507 */
2508 sb_max = high_sb_max = (1 << MBSHIFT);
2509 } else {
2510 sb_max = high_sb_max;
2511 }
2512 }
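	/*
	 * Worked example of the sb_max cap above (illustrative only): with
	 * nmbclusters = 32768 the cluster pool is 64MB, so
	 * high_sb_max = 32768 << (11 - 4) bytes = 4MB; since that is above
	 * 1MB, an sb_max larger than 4MB would be scaled down to 4MB here.
	 */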
2513
2514#if CONFIG_MBUF_MCACHE
2515 /* allocate space for mbuf_dump_buf */
2516 mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
2517
2518 if (mbuf_debug & MCF_DEBUG) {
2519 printf("%s: MLEN %d, MHLEN %d\n", __func__,
2520 (int)_MLEN, (int)_MHLEN);
2521 }
2522#else
2523 mbuf_defunct_tcall =
2524 thread_call_allocate_with_options(mbuf_watchdog_defunct,
2525 NULL,
2526 THREAD_CALL_PRIORITY_KERNEL,
2527 THREAD_CALL_OPTIONS_ONCE);
2528 mbuf_drain_tcall =
2529 thread_call_allocate_with_options(mbuf_watchdog_drain_composite,
2530 NULL,
2531 THREAD_CALL_PRIORITY_KERNEL,
2532 THREAD_CALL_OPTIONS_ONCE);
2533#endif /* CONFIG_MBUF_MCACHE */
2534 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
2535 (nmbclusters << MCLSHIFT) >> MBSHIFT,
2536 (nclusters << MCLSHIFT) >> MBSHIFT,
2537 (njcl << MCLSHIFT) >> MBSHIFT);
2538
2539 PE_parse_boot_argn("mb_tag_mbuf", &mb_tag_mbuf, sizeof(mb_tag_mbuf));
2540}
2541
2542#if CONFIG_MBUF_MCACHE
2543/*
2544 * Obtain a slab of object(s) from the class's freelist.
2545 */
2546static mcache_obj_t *
2547slab_alloc(mbuf_class_t class, int wait)
2548{
2549 mcl_slab_t *sp;
2550 mcache_obj_t *buf;
2551
2552 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2553
2554 /* This should always be NULL for us */
2555 VERIFY(m_cobjlist(class) == NULL);
2556
2557 /*
2558 * Treat composite objects as having a longer lifespan by using
2559 * a slab from the reverse direction, in the hope that this could
2560 * reduce the probability of fragmentation for slabs that hold
2561 * more than one buffer chunk (e.g. mbuf slabs).  For other
2562 * slabs, this probably doesn't make much of a difference.
2563 */
2564 if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
2565 && (wait & MCR_COMP)) {
2566 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
2567 } else {
2568 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
2569 }
2570
2571 if (sp == NULL) {
2572 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
2573 /* The slab list for this class is empty */
2574 return NULL;
2575 }
2576
2577 VERIFY(m_infree(class) > 0);
2578 VERIFY(!slab_is_detached(sp));
2579 VERIFY(sp->sl_class == class &&
2580 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2581 buf = sp->sl_head;
2582 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
2583 sp->sl_head = buf->obj_next;
2584 /* Increment slab reference */
2585 sp->sl_refcnt++;
2586
2587 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
2588
2589 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
2590 slab_nextptr_panic(sp, sp->sl_head);
2591 /* In case sl_head is in the map but not in the slab */
2592 VERIFY(slab_inrange(sp, sp->sl_head));
2593 /* NOTREACHED */
2594 }
2595
2596 if (mclaudit != NULL) {
2597 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2598 mca->mca_uflags = 0;
2599 /* Save contents on mbuf objects only */
2600 if (class == MC_MBUF) {
2601 mca->mca_uflags |= MB_SCVALID;
2602 }
2603 }
2604
2605 if (class == MC_CL) {
2606 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2607 /*
2608 * A 2K cluster slab can have at most NCLPG references.
2609 */
2610 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
2611 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2612 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
2613 } else if (class == MC_BIGCL) {
2614 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
2615 m_infree(MC_MBUF_BIGCL);
2616 /*
2617 * A 4K cluster slab can have NBCLPG references.
2618 */
2619 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
2620 sp->sl_len == PAGE_SIZE &&
2621 (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
2622 } else if (class == MC_16KCL) {
2623 mcl_slab_t *nsp;
2624 int k;
2625
2626 --m_infree(MC_16KCL);
2627 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
2628 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2629 /*
2630 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
2631 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
2632 * most 1 reference.
2633 */
2634 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2635 nsp = nsp->sl_next;
2636 /* Next slab must already be present */
2637 VERIFY(nsp != NULL);
2638 nsp->sl_refcnt++;
2639 VERIFY(!slab_is_detached(nsp));
2640 VERIFY(nsp->sl_class == MC_16KCL &&
2641 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
2642 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
2643 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2644 nsp->sl_head == NULL);
2645 }
2646 } else {
2647 VERIFY(class == MC_MBUF);
2648 --m_infree(MC_MBUF);
2649 /*
2650 * If auditing is turned on, this check is
2651 * deferred until later in mbuf_slab_audit().
2652 */
2653 if (mclaudit == NULL) {
2654 _MCHECK((struct mbuf *)buf);
2655 }
2656 /*
2657 * Since we have incremented the reference count above,
2658 * an mbuf slab (formerly a 4KB cluster slab that was cut
2659 * up into mbufs) must have a reference count between 1
2660 * and NMBPG at this point.
2661 */
2662 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
2663 sp->sl_chunks == NMBPG &&
2664 sp->sl_len == PAGE_SIZE);
2665 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
2666 }
2667
2668 /* If empty, remove this slab from the class's freelist */
2669 if (sp->sl_head == NULL) {
2670 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
2671 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
2672 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
2673 slab_remove(sp, class);
2674 }
2675
2676 return buf;
2677}
2678
2679/*
2680 * Place a slab of object(s) back into a class's slab list.
2681 */
2682static void
2683slab_free(mbuf_class_t class, mcache_obj_t *buf)
2684{
2685 mcl_slab_t *sp;
2686 boolean_t reinit_supercl = false;
2687 mbuf_class_t super_class;
2688
2689 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2690
2691 VERIFY(class != MC_16KCL || njcl > 0);
2692 VERIFY(buf->obj_next == NULL);
2693
2694 /*
2695 * Synchronizing with m_clalloc, as it reads m_total, while we here
2696 * are modifying m_total.
2697 */
2698 while (mb_clalloc_busy) {
2699 mb_clalloc_waiters++;
2700 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2701 (PZERO - 1), "m_clalloc", NULL);
2702 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2703 }
2704
2705 /* We are busy now; tell everyone else to go away */
2706 mb_clalloc_busy = TRUE;
2707
2708 sp = slab_get(buf);
2709 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
2710 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2711
2712 /* Decrement slab reference */
2713 sp->sl_refcnt--;
2714
2715 if (class == MC_CL) {
2716 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
2717 /*
2718 * A slab that has been split for 2KB clusters can have
2719 * at most 1 outstanding reference at this point.
2720 */
2721 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
2722 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2723 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
2724 (slab_is_detached(sp) && sp->sl_head == NULL));
2725 } else if (class == MC_BIGCL) {
2726 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
2727
2728 /* A 4KB cluster slab can have NBCLPG references at most */
2729 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
2730 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
2731 (slab_is_detached(sp) && sp->sl_head == NULL));
2732 } else if (class == MC_16KCL) {
2733 mcl_slab_t *nsp;
2734 int k;
2735 /*
2736 * A 16KB cluster takes NSLABSP16KB slabs, all of which
2737 * must now have a reference count of 0.
2738 */
2739 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
2740 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
2741 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2742 VERIFY(slab_is_detached(sp));
2743 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2744 nsp = nsp->sl_next;
2745 /* Next slab must already be present */
2746 VERIFY(nsp != NULL);
2747 nsp->sl_refcnt--;
2748 VERIFY(slab_is_detached(nsp));
2749 VERIFY(nsp->sl_class == MC_16KCL &&
2750 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
2751 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
2752 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2753 nsp->sl_head == NULL);
2754 }
2755 } else {
2756 /*
2757 * A slab that has been split for mbufs has at most
2758 * NMBPG reference counts. Since we have decremented
2759 * one reference above, it must now be between 0 and
2760 * NMBPG-1.
2761 */
2762 VERIFY(class == MC_MBUF);
2763 VERIFY(sp->sl_refcnt >= 0 &&
2764 sp->sl_refcnt <= (NMBPG - 1) &&
2765 sp->sl_chunks == NMBPG &&
2766 sp->sl_len == PAGE_SIZE);
2767 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2768 (slab_is_detached(sp) && sp->sl_head == NULL));
2769 }
2770
2771 /*
2772 * When auditing is enabled, ensure that the buffer still
2773 * contains the free pattern. Otherwise it got corrupted
2774 * while at the CPU cache layer.
2775 */
2776 if (mclaudit != NULL) {
2777 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2778 if (mclverify) {
2779 mcache_audit_free_verify(mca, buf, 0,
2780 m_maxsize(class));
2781 }
2782 mca->mca_uflags &= ~MB_SCVALID;
2783 }
2784
2785 if (class == MC_CL) {
2786 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2787 buf->obj_next = sp->sl_head;
2788 } else if (class == MC_BIGCL) {
2789 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2790 m_infree(MC_MBUF_BIGCL);
2791 buf->obj_next = sp->sl_head;
2792 } else if (class == MC_16KCL) {
2793 ++m_infree(MC_16KCL);
2794 } else {
2795 ++m_infree(MC_MBUF);
2796 buf->obj_next = sp->sl_head;
2797 }
2798 sp->sl_head = buf;
2799
2800 /*
2801 * If a slab has been split to either one which holds 2KB clusters,
2802 * or one which holds mbufs, turn it back to one which holds a
2803 * 4 or 16 KB cluster depending on the page size.
2804 */
2805 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2806 super_class = MC_BIGCL;
2807 } else {
2808 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2809 super_class = MC_16KCL;
2810 }
2811 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2812 m_total(class) >= (m_minlimit(class) + NMBPG) &&
2813 m_total(super_class) < m_maxlimit(super_class)) {
2814 int i = NMBPG;
2815
2816 m_total(MC_MBUF) -= NMBPG;
2817 mbstat.m_mbufs = m_total(MC_MBUF);
2818 m_infree(MC_MBUF) -= NMBPG;
2819 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2820
2821 while (i--) {
2822 struct mbuf *m = sp->sl_head;
2823 VERIFY(m != NULL);
2824 sp->sl_head = m->m_next;
2825 m->m_next = NULL;
2826 }
2827 reinit_supercl = true;
2828 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2829 m_total(class) >= (m_minlimit(class) + NCLPG) &&
2830 m_total(super_class) < m_maxlimit(super_class)) {
2831 int i = NCLPG;
2832
2833 m_total(MC_CL) -= NCLPG;
2834 mbstat.m_clusters = m_total(MC_CL);
2835 m_infree(MC_CL) -= NCLPG;
2836
2837 while (i--) {
2838 union mcluster *c = sp->sl_head;
2839 VERIFY(c != NULL);
2840 sp->sl_head = c->mcl_next;
2841 c->mcl_next = NULL;
2842 }
2843 reinit_supercl = true;
2844 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2845 sp->sl_refcnt == 0 &&
2846 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2847 m_total(super_class) < m_maxlimit(super_class)) {
2848 int i = NBCLPG;
2849
2850 VERIFY(super_class == MC_16KCL);
2851 m_total(MC_BIGCL) -= NBCLPG;
2852 mbstat.m_bigclusters = m_total(MC_BIGCL);
2853 m_infree(MC_BIGCL) -= NBCLPG;
2854
2855 while (i--) {
2856 union mbigcluster *bc = sp->sl_head;
2857 VERIFY(bc != NULL);
2858 sp->sl_head = bc->mbc_next;
2859 bc->mbc_next = NULL;
2860 }
2861 reinit_supercl = true;
2862 }
2863
2864 if (reinit_supercl) {
2865 VERIFY(sp->sl_head == NULL);
2866 VERIFY(m_total(class) >= m_minlimit(class));
2867 slab_remove(sp, class);
2868
2869 /* Reinitialize it as a cluster for the super class */
2870 m_total(super_class)++;
2871 m_infree(super_class)++;
2872 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2873 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2874
2875 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2876 sp->sl_base, PAGE_SIZE, 0, 1);
2877 if (mclverify) {
2878 mcache_set_pattern(MCACHE_FREE_PATTERN,
2879 (caddr_t)sp->sl_base, sp->sl_len);
2880 }
2881 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2882
2883 if (super_class == MC_BIGCL) {
2884 mbstat.m_bigclusters = m_total(MC_BIGCL);
2885 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2886 m_infree(MC_MBUF_BIGCL);
2887 }
2888
2889 VERIFY(slab_is_detached(sp));
2890 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2891
2892 /* And finally switch class */
2893 class = super_class;
2894 }
2895
2896 /* Reinsert the slab to the class's slab list */
2897 if (slab_is_detached(sp)) {
2898 slab_insert(sp, class);
2899 }
2900
2901 /* We're done; let others enter */
2902 mb_clalloc_busy = FALSE;
2903 if (mb_clalloc_waiters > 0) {
2904 mb_clalloc_waiters = 0;
2905 wakeup(mb_clalloc_waitchan);
2906 }
2907}
2908
2909/*
2910 * Common allocator for rudimentary objects called by the CPU cache layer
2911 * during an allocation request whenever there is no available element in the
2912 * bucket layer. It returns one or more elements from the appropriate global
2913 * freelist. If the freelist is empty, it will attempt to populate it and
2914 * retry the allocation.
2915 */
2916static unsigned int
2917mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2918{
2919 mbuf_class_t class = (mbuf_class_t)arg;
2920 unsigned int need = num;
2921 mcache_obj_t **list = *plist;
2922
2923 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2924 ASSERT(need > 0);
2925
2926 lck_mtx_lock(mbuf_mlock);
2927
2928 for (;;) {
2929 if ((*list = slab_alloc(class, wait)) != NULL) {
2930 (*list)->obj_next = NULL;
2931 list = *plist = &(*list)->obj_next;
2932
2933 if (--need == 0) {
2934 /*
2935 * If the number of elements in the freelist has
2936 * dropped below the low watermark, asynchronously
2937 * populate the freelist now rather than doing
2938 * it later when we run out of elements.
2939 */
2940 if (!mbuf_cached_above(class, wait) &&
2941 m_infree(class) < (m_total(class) >> 5)) {
2942 (void) freelist_populate(class, 1,
2943 M_DONTWAIT);
2944 }
2945 break;
2946 }
2947 } else {
2948 VERIFY(m_infree(class) == 0 || class == MC_CL);
2949
2950 (void) freelist_populate(class, 1,
2951 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2952
2953 if (m_infree(class) > 0) {
2954 continue;
2955 }
2956
2957 /* Check if there's anything at the cache layer */
2958 if (mbuf_cached_above(class, wait)) {
2959 break;
2960 }
2961
2962 /* watchdog checkpoint */
2963 mbuf_watchdog();
2964
2965 /* We have nothing and cannot block; give up */
2966 if (wait & MCR_NOSLEEP) {
2967 if (!(wait & MCR_TRYHARD)) {
2968 m_fail_cnt(class)++;
2969 mbstat.m_drops++;
2970 break;
2971 }
2972 }
2973
2974 /*
2975 * If the freelist is still empty and the caller is
2976 * willing to be blocked, sleep on the wait channel
2977 * until an element is available. Otherwise, if
2978 * MCR_TRYHARD is set, do our best to satisfy the
2979 * request without having to go to sleep.
2980 */
2981 if (mbuf_worker_ready &&
2982 mbuf_sleep(class, need, wait)) {
2983 break;
2984 }
2985
2986 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2987 }
2988 }
2989
2990 m_alloc_cnt(class) += num - need;
2991 lck_mtx_unlock(mbuf_mlock);
2992
2993 return num - need;
2994}
2995
2996/*
2997 * Common de-allocator for rudimentary objects called by the CPU cache
2998 * layer when one or more elements need to be returned to the appropriate
2999 * global freelist.
3000 */
3001static void
3002mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
3003{
3004 mbuf_class_t class = (mbuf_class_t)arg;
3005 mcache_obj_t *nlist;
3006 unsigned int num = 0;
3007 int w;
3008
3009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3010
3011 lck_mtx_lock(mbuf_mlock);
3012
3013 for (;;) {
3014 nlist = list->obj_next;
3015 list->obj_next = NULL;
3016 slab_free(class, list);
3017 ++num;
3018 if ((list = nlist) == NULL) {
3019 break;
3020 }
3021 }
3022 m_free_cnt(class) += num;
3023
3024 if ((w = mb_waiters) > 0) {
3025 mb_waiters = 0;
3026 }
3027 if (w) {
3028 mbwdog_logger("waking up all threads");
3029 }
3030 lck_mtx_unlock(mbuf_mlock);
3031
3032 if (w != 0) {
3033 wakeup(mb_waitchan);
3034 }
3035}
3036
3037/*
3038 * Common auditor for rudimentary objects called by the CPU cache layer
3039 * during an allocation or free request. For the former, this is called
3040 * after the objects are obtained from either the bucket or slab layer
3041 * and before they are returned to the caller. For the latter, this is
3042 * called immediately during free and before placing the objects into
3043 * the bucket or slab layer.
3044 */
3045static void
3046mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3047{
3048 mbuf_class_t class = (mbuf_class_t)arg;
3049 mcache_audit_t *mca;
3050
3051 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3052
3053 while (list != NULL) {
3054 lck_mtx_lock(mbuf_mlock);
3055 mca = mcl_audit_buf2mca(class, list);
3056
3057 /* Do the sanity checks */
3058 if (class == MC_MBUF) {
3059 mcl_audit_mbuf(mca, list, FALSE, alloc);
3060 ASSERT(mca->mca_uflags & MB_SCVALID);
3061 } else {
3062 mcl_audit_cluster(mca, list, m_maxsize(class),
3063 alloc, TRUE);
3064 ASSERT(!(mca->mca_uflags & MB_SCVALID));
3065 }
3066 /* Record this transaction */
3067 if (mcltrace) {
3068 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
3069 }
3070
3071 if (alloc) {
3072 mca->mca_uflags |= MB_INUSE;
3073 } else {
3074 mca->mca_uflags &= ~MB_INUSE;
3075 }
3076 /* Unpair the object (unconditionally) */
3077 mca->mca_uptr = NULL;
3078 lck_mtx_unlock(mbuf_mlock);
3079
3080 list = list->obj_next;
3081 }
3082}
3083
3084/*
3085 * Common notify routine for all caches. It is called by mcache when
3086 * one or more objects get freed. We use this indication to trigger
3087 * the wakeup of any sleeping threads so that they can retry their
3088 * allocation requests.
3089 */
3090static void
3091mbuf_slab_notify(void *arg, u_int32_t reason)
3092{
3093 mbuf_class_t class = (mbuf_class_t)arg;
3094 int w;
3095
3096 ASSERT(MBUF_CLASS_VALID(class));
3097
3098 if (reason != MCN_RETRYALLOC) {
3099 return;
3100 }
3101
3102 lck_mtx_lock(mbuf_mlock);
3103 if ((w = mb_waiters) > 0) {
3104 m_notified(class)++;
3105 mb_waiters = 0;
3106 }
3107 if (w) {
3108 mbwdog_logger("waking up all threads");
3109 }
3110 lck_mtx_unlock(mbuf_mlock);
3111
3112 if (w != 0) {
3113 wakeup(mb_waitchan);
3114 }
3115}
3116
3117/*
3118 * Obtain object(s) from the composite class's freelist.
3119 */
3120static unsigned int
3121cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
3122{
3123 unsigned int need = num;
3124 mcl_slab_t *sp, *clsp, *nsp;
3125 struct mbuf *m;
3126 mcache_obj_t **list = *plist;
3127 void *cl;
3128
3129 VERIFY(need > 0);
3130 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3131 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3132
3133 /* Get what we can from the freelist */
3134 while ((*list = m_cobjlist(class)) != NULL) {
3135 MRANGE(*list);
3136
3137 m = (struct mbuf *)*list;
3138 sp = slab_get(m);
3139 cl = m->m_ext.ext_buf;
3140 clsp = slab_get(cl);
3141 VERIFY(m->m_flags == M_EXT && cl != NULL);
3142 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
3143
3144 if (class == MC_MBUF_CL) {
3145 VERIFY(clsp->sl_refcnt >= 1 &&
3146 clsp->sl_refcnt <= NCLPG);
3147 } else {
3148 VERIFY(clsp->sl_refcnt >= 1 &&
3149 clsp->sl_refcnt <= NBCLPG);
3150 }
3151
3152 if (class == MC_MBUF_16KCL) {
3153 int k;
3154 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3155 nsp = nsp->sl_next;
3156 /* Next slab must already be present */
3157 VERIFY(nsp != NULL);
3158 VERIFY(nsp->sl_refcnt == 1);
3159 }
3160 }
3161
3162 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
3163 !MBUF_IN_MAP(m_cobjlist(class))) {
3164 slab_nextptr_panic(sp, m_cobjlist(class));
3165 /* NOTREACHED */
3166 }
3167 (*list)->obj_next = NULL;
3168 list = *plist = &(*list)->obj_next;
3169
3170 if (--need == 0) {
3171 break;
3172 }
3173 }
3174 m_infree(class) -= (num - need);
3175
3176 return num - need;
3177}
3178
3179/*
3180 * Place object(s) back into a composite class's freelist.
3181 */
3182static unsigned int
3183cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
3184{
3185 mcache_obj_t *o, *tail;
3186 unsigned int num = 0;
3187 struct mbuf *m, *ms;
3188 mcache_audit_t *mca = NULL;
3189 mcache_obj_t *ref_list = NULL;
3190 mcl_slab_t *clsp, *nsp;
3191 void *cl;
3192 mbuf_class_t cl_class;
3193
3194 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3195 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3196 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3197
3198 if (class == MC_MBUF_CL) {
3199 cl_class = MC_CL;
3200 } else if (class == MC_MBUF_BIGCL) {
3201 cl_class = MC_BIGCL;
3202 } else {
3203 VERIFY(class == MC_MBUF_16KCL);
3204 cl_class = MC_16KCL;
3205 }
3206
3207 o = tail = list;
3208
3209 while ((m = ms = (struct mbuf *)o) != NULL) {
3210 mcache_obj_t *rfa, *nexto = o->obj_next;
3211
3212 /* Do the mbuf sanity checks */
3213 if (mclaudit != NULL) {
3214 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3215 if (mclverify) {
3216 mcache_audit_free_verify(mca, m, 0,
3217 m_maxsize(MC_MBUF));
3218 }
3219 ms = MCA_SAVED_MBUF_PTR(mca);
3220 }
3221
3222 /* Do the cluster sanity checks */
3223 cl = ms->m_ext.ext_buf;
3224 clsp = slab_get(cl);
3225 if (mclverify) {
3226 size_t size = m_maxsize(cl_class);
3227 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
3228 (mcache_obj_t *)cl), cl, 0, size);
3229 }
3230 VERIFY(ms->m_type == MT_FREE);
3231 VERIFY(ms->m_flags == M_EXT);
3232 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3233 if (cl_class == MC_CL) {
3234 VERIFY(clsp->sl_refcnt >= 1 &&
3235 clsp->sl_refcnt <= NCLPG);
3236 } else {
3237 VERIFY(clsp->sl_refcnt >= 1 &&
3238 clsp->sl_refcnt <= NBCLPG);
3239 }
3240 if (cl_class == MC_16KCL) {
3241 int k;
3242 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3243 nsp = nsp->sl_next;
3244 /* Next slab must already be present */
3245 VERIFY(nsp != NULL);
3246 VERIFY(nsp->sl_refcnt == 1);
3247 }
3248 }
3249
3250 /*
3251 * If we're asked to purge, restore the actual mbuf using
3252 * contents of the shadow structure (if auditing is enabled)
3253 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
3254 * about to free it and the attached cluster into their caches.
3255 */
3256 if (purged) {
3257 /* Restore constructed mbuf fields */
3258 if (mclaudit != NULL) {
3259 mcl_audit_restore_mbuf(m, mca, TRUE);
3260 }
3261
3262 MEXT_MINREF(m) = 0;
3263 MEXT_REF(m) = 0;
3264 MEXT_PREF(m) = 0;
3265 MEXT_FLAGS(m) = 0;
3266 MEXT_PRIV(m) = 0;
3267 MEXT_PMBUF(m) = NULL;
3268 MEXT_TOKEN(m) = 0;
3269
3270 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
3271 m_set_ext(m, NULL, NULL, NULL);
3272 rfa->obj_next = ref_list;
3273 ref_list = rfa;
3274
3275 m->m_type = MT_FREE;
3276 m->m_flags = m->m_len = 0;
3277 m->m_next = m->m_nextpkt = NULL;
3278
3279 /* Save mbuf fields and make auditing happy */
3280 if (mclaudit != NULL) {
3281 mcl_audit_mbuf(mca, o, FALSE, FALSE);
3282 }
3283
3284 VERIFY(m_total(class) > 0);
3285 m_total(class)--;
3286
3287 /* Free the mbuf */
3288 o->obj_next = NULL;
3289 slab_free(MC_MBUF, o);
3290
3291 /* And free the cluster */
3292 ((mcache_obj_t *)cl)->obj_next = NULL;
3293 if (class == MC_MBUF_CL) {
3294 slab_free(MC_CL, cl);
3295 } else if (class == MC_MBUF_BIGCL) {
3296 slab_free(MC_BIGCL, cl);
3297 } else {
3298 slab_free(MC_16KCL, cl);
3299 }
3300 }
3301
3302 ++num;
3303 tail = o;
3304 o = nexto;
3305 }
3306
3307 if (!purged) {
3308 tail->obj_next = m_cobjlist(class);
3309 m_cobjlist(class) = list;
3310 m_infree(class) += num;
3311 } else if (ref_list != NULL) {
3312 mcache_free_ext(ref_cache, ref_list);
3313 }
3314
3315 return num;
3316}
3317
3318/*
3319 * Common allocator for composite objects called by the CPU cache layer
3320 * during an allocation request whenever there is no available element in
3321 * the bucket layer. It returns one or more composite elements from the
3322 * appropriate global freelist. If the freelist is empty, it will attempt
3323 * to obtain the rudimentary objects from their caches and construct them
3324 * into composite mbuf + cluster objects.
3325 */
3326static unsigned int
3327mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
3328 int wait)
3329{
3330 mbuf_class_t class = (mbuf_class_t)arg;
3331 mbuf_class_t cl_class = 0;
3332 unsigned int num = 0, cnum = 0, want = needed;
3333 mcache_obj_t *ref_list = NULL;
3334 mcache_obj_t *mp_list = NULL;
3335 mcache_obj_t *clp_list = NULL;
3336 mcache_obj_t **list;
3337 struct ext_ref *rfa;
3338 struct mbuf *m;
3339 void *cl;
3340
3341 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3342 ASSERT(needed > 0);
3343
3344 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3345
3346 /* There should not be any slab for this class */
3347 VERIFY(m_slab_cnt(class) == 0 &&
3348 m_slablist(class).tqh_first == NULL &&
3349 m_slablist(class).tqh_last == NULL);
3350
3351 lck_mtx_lock(mbuf_mlock);
3352
3353 /* Try using the freelist first */
3354 num = cslab_alloc(class, plist, needed);
3355 list = *plist;
3356 if (num == needed) {
3357 m_alloc_cnt(class) += num;
3358 lck_mtx_unlock(mbuf_mlock);
3359 return needed;
3360 }
3361
3362 lck_mtx_unlock(mbuf_mlock);
3363
3364 /*
3365 * We could not satisfy the request using the freelist alone;
3366 * allocate from the appropriate rudimentary caches and use
3367 * whatever we can get to construct the composite objects.
3368 */
3369 needed -= num;
3370
3371 /*
3372 * Mark these allocation requests as coming from a composite cache.
3373 * Also, if the caller is willing to be blocked, mark the request
3374 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
3375 * slab layer waiting for the individual object when one or more
3376 * of the already-constructed composite objects are available.
3377 */
3378 wait |= MCR_COMP;
3379 if (!(wait & MCR_NOSLEEP)) {
3380 wait |= MCR_FAILOK;
3381 }
3382
3383 /* allocate mbufs */
3384 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
3385 if (needed == 0) {
3386 ASSERT(mp_list == NULL);
3387 goto fail;
3388 }
3389
3390 /* allocate clusters */
3391 if (class == MC_MBUF_CL) {
3392 cl_class = MC_CL;
3393 } else if (class == MC_MBUF_BIGCL) {
3394 cl_class = MC_BIGCL;
3395 } else {
3396 VERIFY(class == MC_MBUF_16KCL);
3397 cl_class = MC_16KCL;
3398 }
3399 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
3400 if (needed == 0) {
3401 ASSERT(clp_list == NULL);
3402 goto fail;
3403 }
3404
3405 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
3406 if (needed == 0) {
3407 ASSERT(ref_list == NULL);
3408 goto fail;
3409 }
3410
3411 /*
3412 * By this time "needed" is MIN(mbuf, cluster, ref); any
3413 * leftovers will be freed before we return to the caller.
3414 */
3415 for (cnum = 0; cnum < needed; cnum++) {
3416 struct mbuf *ms;
3417
3418 m = ms = (struct mbuf *)mp_list;
3419 mp_list = mp_list->obj_next;
3420
3421 cl = clp_list;
3422 clp_list = clp_list->obj_next;
3423 ((mcache_obj_t *)cl)->obj_next = NULL;
3424
3425 rfa = (struct ext_ref *)ref_list;
3426 ref_list = ref_list->obj_next;
3427 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
3428
3429 /*
3430 * If auditing is enabled, construct the shadow mbuf
3431 * in the audit structure instead of in the actual one.
3432 * mbuf_cslab_audit() will take care of restoring the
3433 * contents after the integrity check.
3434 */
3435 if (mclaudit != NULL) {
3436 mcache_audit_t *mca, *cl_mca;
3437
3438 lck_mtx_lock(mbuf_mlock);
3439 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3440 ms = MCA_SAVED_MBUF_PTR(mca);
3441 cl_mca = mcl_audit_buf2mca(cl_class,
3442 (mcache_obj_t *)cl);
3443
3444 /*
3445 * Pair them up. Note that this is done at the time
3446 * the mbuf+cluster objects are constructed. This
3447 * information should be treated as a "best effort"
3448 * debugging hint, since more than one mbuf can refer
3449 * to a cluster. In that case, the cluster might not
3450 * be freed along with the mbuf it was paired with.
3451 */
3452 mca->mca_uptr = cl_mca;
3453 cl_mca->mca_uptr = mca;
3454
3455 ASSERT(mca->mca_uflags & MB_SCVALID);
3456 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
3457 lck_mtx_unlock(mbuf_mlock);
3458
3459 /* Technically, they are in the freelist */
3460 if (mclverify) {
3461 size_t size;
3462
3463 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
3464 m_maxsize(MC_MBUF));
3465
3466 if (class == MC_MBUF_CL) {
3467 size = m_maxsize(MC_CL);
3468 } else if (class == MC_MBUF_BIGCL) {
3469 size = m_maxsize(MC_BIGCL);
3470 } else {
3471 size = m_maxsize(MC_16KCL);
3472 }
3473
3474 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
3475 size);
3476 }
3477 }
3478
3479 MBUF_INIT(ms, 0, MT_FREE);
3480 if (class == MC_MBUF_16KCL) {
3481 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3482 } else if (class == MC_MBUF_BIGCL) {
3483 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3484 } else {
3485 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3486 }
3487 VERIFY(ms->m_flags == M_EXT);
3488 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3489
3490 *list = (mcache_obj_t *)m;
3491 (*list)->obj_next = NULL;
3492 list = *plist = &(*list)->obj_next;
3493 }
3494
3495fail:
3496 /*
3497 * Free up what's left of the above.
3498 */
3499 if (mp_list != NULL) {
3500 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3501 }
3502 if (clp_list != NULL) {
3503 mcache_free_ext(m_cache(cl_class), clp_list);
3504 }
3505 if (ref_list != NULL) {
3506 mcache_free_ext(ref_cache, ref_list);
3507 }
3508
3509 lck_mtx_lock(mbuf_mlock);
3510 if (num > 0 || cnum > 0) {
3511 m_total(class) += cnum;
3512 VERIFY(m_total(class) <= m_maxlimit(class));
3513 m_alloc_cnt(class) += num + cnum;
3514 }
3515 if ((num + cnum) < want) {
3516 m_fail_cnt(class) += (want - (num + cnum));
3517 }
3518 lck_mtx_unlock(mbuf_mlock);
3519
3520 return num + cnum;
3521}
3522
3523/*
3524 * Common de-allocator for composite objects called by the CPU cache
3525 * layer when one or more elements need to be returned to the appropriate
3526 * global freelist.
3527 */
3528static void
3529mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
3530{
3531 mbuf_class_t class = (mbuf_class_t)arg;
3532 unsigned int num;
3533 int w;
3534
3535 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3536
3537 lck_mtx_lock(mbuf_mlock);
3538
3539 num = cslab_free(class, list, purged);
3540 m_free_cnt(class) += num;
3541
3542 if ((w = mb_waiters) > 0) {
3543 mb_waiters = 0;
3544 }
3545 if (w) {
3546 mbwdog_logger("waking up all threads");
3547 }
3548
3549 lck_mtx_unlock(mbuf_mlock);
3550
3551 if (w != 0) {
3552 wakeup(mb_waitchan);
3553 }
3554}
3555
3556/*
3557 * Common auditor for composite objects called by the CPU cache layer
3558 * during an allocation or free request. For the former, this is called
3559 * after the objects are obtained from either the bucket or slab layer
3560 * and before they are returned to the caller. For the latter, this is
3561 * called immediately during free and before placing the objects into
3562 * the bucket or slab layer.
3563 */
3564static void
3565mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3566{
3567 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
3568 mcache_audit_t *mca;
3569 struct mbuf *m, *ms;
3570 mcl_slab_t *clsp, *nsp;
3571 size_t cl_size;
3572 void *cl;
3573
3574 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3575 if (class == MC_MBUF_CL) {
3576 cl_class = MC_CL;
3577 } else if (class == MC_MBUF_BIGCL) {
3578 cl_class = MC_BIGCL;
3579 } else {
3580 cl_class = MC_16KCL;
3581 }
3582 cl_size = m_maxsize(cl_class);
3583
3584 while ((m = ms = (struct mbuf *)list) != NULL) {
3585 lck_mtx_lock(mbuf_mlock);
3586 /* Do the mbuf sanity checks and record its transaction */
3587 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3588 mcl_audit_mbuf(mca, m, TRUE, alloc);
3589 if (mcltrace) {
3590 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
3591 }
3592
3593 if (alloc) {
3594 mca->mca_uflags |= MB_COMP_INUSE;
3595 } else {
3596 mca->mca_uflags &= ~MB_COMP_INUSE;
3597 }
3598
3599 /*
3600 * Use the shadow mbuf in the audit structure if we are
3601 * freeing, since the contents of the actual mbuf have been
3602 * pattern-filled by the above call to mcl_audit_mbuf().
3603 */
3604 if (!alloc && mclverify) {
3605 ms = MCA_SAVED_MBUF_PTR(mca);
3606 }
3607
3608 /* Do the cluster sanity checks and record its transaction */
3609 cl = ms->m_ext.ext_buf;
3610 clsp = slab_get(cl);
3611 VERIFY(ms->m_flags == M_EXT && cl != NULL);
3612 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3613 if (class == MC_MBUF_CL) {
3614 VERIFY(clsp->sl_refcnt >= 1 &&
3615 clsp->sl_refcnt <= NCLPG);
3616 } else {
3617 VERIFY(clsp->sl_refcnt >= 1 &&
3618 clsp->sl_refcnt <= NBCLPG);
3619 }
3620
3621 if (class == MC_MBUF_16KCL) {
3622 int k;
3623 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3624 nsp = nsp->sl_next;
3625 /* Next slab must already be present */
3626 VERIFY(nsp != NULL);
3627 VERIFY(nsp->sl_refcnt == 1);
3628 }
3629 }
3630
3631
3632 mca = mcl_audit_buf2mca(cl_class, cl);
3633 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
3634 if (mcltrace) {
3635 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
3636 }
3637
3638 if (alloc) {
3639 mca->mca_uflags |= MB_COMP_INUSE;
3640 } else {
3641 mca->mca_uflags &= ~MB_COMP_INUSE;
3642 }
3643 lck_mtx_unlock(mbuf_mlock);
3644
3645 list = list->obj_next;
3646 }
3647}
3648
3649static void
3650m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
3651 uint64_t alloc_size, kern_return_t error)
3652{
3653 *cnt = *cnt + 1;
3654 *ts = net_uptime();
3655 if (size) {
3656 *size = alloc_size;
3657 }
3658 switch (error) {
3659 case KERN_SUCCESS:
3660 break;
3661 case KERN_INVALID_ARGUMENT:
3662 mb_kmem_stats[0]++;
3663 break;
3664 case KERN_INVALID_ADDRESS:
3665 mb_kmem_stats[1]++;
3666 break;
3667 case KERN_RESOURCE_SHORTAGE:
3668 mb_kmem_stats[2]++;
3669 break;
3670 case KERN_NO_SPACE:
3671 mb_kmem_stats[3]++;
3672 break;
3673 case KERN_FAILURE:
3674 mb_kmem_stats[4]++;
3675 break;
3676 default:
3677 mb_kmem_stats[5]++;
3678 break;
3679 }
3680}
3681
3682static vm_offset_t
3683kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
3684{
3685 vm_offset_t addr = 0;
3686 kern_return_t kr = KERN_SUCCESS;
3687
3688 if (!physContig) {
3689 kr = kmem_alloc(mbmap, &addr, size,
3690 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3691 } else {
3692 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
3693 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3694 }
3695
3696 if (kr != KERN_SUCCESS) {
3697 addr = 0;
3698 }
3699 if (err) {
3700 *err = kr;
3701 }
3702
3703 return addr;
3704}
3705
3706/*
3707 * Allocate some number of mbuf clusters and place on cluster freelist.
3708 */
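/*
 * For illustration only: a request for "num" clusters of "bufsize" bytes
 * is first converted by m_howmany() into a page count "i", and the
 * backing memory is then grabbed in one shot, roughly:
 *
 *     size = round_page(i * bufsize);
 *     page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
 *
 * The real flow below additionally handles the physically contiguous
 * 16KB case and the single-page retry on failure.
 */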
3709static int
3710m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
3711{
3712 int i, count = 0;
3713 vm_size_t size = 0;
3714 int numpages = 0, large_buffer;
3715 vm_offset_t page = 0;
3716 mcache_audit_t *mca_list = NULL;
3717 mcache_obj_t *con_list = NULL;
3718 mcl_slab_t *sp;
3719 mbuf_class_t class;
3720 kern_return_t error;
3721
3722 /* Set if a buffer allocation requires multiple pages */
3723 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
3724 PAGE_SIZE < M16KCLBYTES);
3725 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
3726 bufsize == m_maxsize(MC_16KCL));
3727
3728 VERIFY((bufsize == PAGE_SIZE) ||
3729 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
3730
3731 if (bufsize == m_size(MC_BIGCL)) {
3732 class = MC_BIGCL;
3733 } else {
3734 class = MC_16KCL;
3735 }
3736
3737 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3738
3739 /*
3740 * Multiple threads may attempt to populate the cluster map one
3741 * after another. Since we drop the lock below prior to acquiring
3742 * the physical page(s), our view of the cluster map may no longer
3743 * be accurate, and we could end up over-committing the pages beyond
3744 * the maximum allowed for each class. To prevent it, this entire
3745 * operation (including the page mapping) is serialized.
3746 */
3747 while (mb_clalloc_busy) {
3748 mb_clalloc_waiters++;
3749 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
3750 (PZERO - 1), "m_clalloc", NULL);
3751 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3752 }
3753
3754 /* We are busy now; tell everyone else to go away */
3755 mb_clalloc_busy = TRUE;
3756
3757 /*
3758 * Honor the caller's wish to block or not block. We have a way
3759 * to grow the pool asynchronously using the mbuf worker thread.
3760 */
3761 i = m_howmany(num, bufsize);
3762 if (i <= 0 || (wait & M_DONTWAIT)) {
3763 goto out;
3764 }
3765
3766 lck_mtx_unlock(mbuf_mlock);
3767
3768 size = round_page(i * bufsize);
3769 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
3770
3771 /*
3772 * If we asked for "n" 16KB physically contiguous chunks
3773 * and didn't get them, retry without the contiguity
3774 * restriction.
3775 */
3776 net_update_uptime();
3777 if (large_buffer && page == 0) {
3778 m_vm_error_stats(&mb_kmem_contig_failed,
3779 &mb_kmem_contig_failed_ts,
3780 &mb_kmem_contig_failed_size,
3781 size, error);
3782 page = kmem_mb_alloc(mb_map, size, 0, &error);
3783 }
3784
3785 if (page == 0) {
3786 m_vm_error_stats(&mb_kmem_failed,
3787 &mb_kmem_failed_ts,
3788 &mb_kmem_failed_size,
3789 size, error);
3790#if PAGE_SIZE == 4096
3791 if (bufsize == m_maxsize(MC_BIGCL)) {
3792#else
3793 if (bufsize >= m_maxsize(MC_BIGCL)) {
3794#endif
3795 /* Try for 1 page if failed */
3796 size = PAGE_SIZE;
3797 page = kmem_mb_alloc(mb_map, size, 0, &error);
3798 if (page == 0) {
3799 m_vm_error_stats(&mb_kmem_one_failed,
3800 &mb_kmem_one_failed_ts,
3801 NULL, size, error);
3802 }
3803 }
3804
3805 if (page == 0) {
3806 lck_mtx_lock(mbuf_mlock);
3807 goto out;
3808 }
3809 }
3810
3811 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3812 numpages = size / PAGE_SIZE;
3813
3814 /* If auditing is enabled, allocate the audit structures now */
3815 if (mclaudit != NULL) {
3816 int needed;
3817
3818 /*
3819 * Yes, I realize this is a waste of memory for clusters
3820 * that never get transformed into mbufs, as we may end
3821 * up with NMBPG-1 unused audit structures per cluster.
3822 * But doing so tremendously simplifies the allocation
3823 * strategy, since at this point we are not holding the
3824 * mbuf lock and the caller is okay to be blocked.
3825 */
3826 if (bufsize == PAGE_SIZE) {
3827 needed = numpages * NMBPG;
3828
3829 i = mcache_alloc_ext(mcl_audit_con_cache,
3830 &con_list, needed, MCR_SLEEP);
3831
3832 VERIFY(con_list != NULL && i == needed);
3833 } else {
3834 /*
3835 * Multiple 4K pages are being used for each
3836 * 16K cluster in this configuration.
3837 */
3838 needed = numpages / NSLABSP16KB;
3839 }
3840
3841 i = mcache_alloc_ext(mcache_audit_cache,
3842 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3843
3844 VERIFY(mca_list != NULL && i == needed);
3845 }
3846
3847 lck_mtx_lock(mbuf_mlock);
3848
3849 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3850 ppnum_t offset =
3851 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3852 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3853
3854 /*
3855 * If there is a mapper, the appropriate I/O page is
3856 * returned; zero out the page to discard its past
3857 * contents to prevent exposing leftover kernel memory.
3858 */
3859 VERIFY(offset < mcl_pages);
3860 if (mcl_paddr_base != 0) {
3861 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3862 new_page = IOMapperInsertPage(mcl_paddr_base,
3863 offset, new_page);
3864 }
3865 mcl_paddr[offset] = new_page;
3866
3867 /* Pattern-fill this fresh page */
3868 if (mclverify) {
3869 mcache_set_pattern(MCACHE_FREE_PATTERN,
3870 (caddr_t)page, PAGE_SIZE);
3871 }
3872 if (bufsize == PAGE_SIZE) {
3873 mcache_obj_t *buf;
3874 /* One for the entire page */
3875 sp = slab_get((void *)page);
3876 if (mclaudit != NULL) {
3877 mcl_audit_init((void *)page,
3878 &mca_list, &con_list,
3879 AUDIT_CONTENTS_SIZE, NMBPG);
3880 }
3881 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3882 slab_init(sp, class, SLF_MAPPED, (void *)page,
3883 (void *)page, PAGE_SIZE, 0, 1);
3884 buf = (mcache_obj_t *)page;
3885 buf->obj_next = NULL;
3886
3887 /* Insert this slab */
3888 slab_insert(sp, class);
3889
3890 /* Update stats now since slab_get drops the lock */
3891 ++m_infree(class);
3892 ++m_total(class);
3893 VERIFY(m_total(class) <= m_maxlimit(class));
3894 if (class == MC_BIGCL) {
3895 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3896 m_infree(MC_MBUF_BIGCL);
3897 mbstat.m_bigclusters = m_total(MC_BIGCL);
3898 }
3899 ++count;
3900 } else if ((bufsize > PAGE_SIZE) &&
3901 (i % NSLABSP16KB) == 0) {
3902 union m16kcluster *m16kcl = (union m16kcluster *)page;
3903 mcl_slab_t *nsp;
3904 int k;
3905
3906 /* One for the entire 16KB */
3907 sp = slab_get(m16kcl);
3908 if (mclaudit != NULL) {
3909 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3910 }
3911
3912 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3913 slab_init(sp, MC_16KCL, SLF_MAPPED,
3914 m16kcl, m16kcl, bufsize, 0, 1);
3915 m16kcl->m16kcl_next = NULL;
3916
3917 /*
3918 * 2nd-Nth page's slab is part of the first one,
3919 * where N is NSLABSP16KB.
3920 */
3921 for (k = 1; k < NSLABSP16KB; k++) {
3922 nsp = slab_get(((union mbigcluster *)page) + k);
3923 VERIFY(nsp->sl_refcnt == 0 &&
3924 nsp->sl_flags == 0);
3925 slab_init(nsp, MC_16KCL,
3926 SLF_MAPPED | SLF_PARTIAL,
3927 m16kcl, NULL, 0, 0, 0);
3928 }
3929 /* Insert this slab */
3930 slab_insert(sp, MC_16KCL);
3931
3932 /* Update stats now since slab_get drops the lock */
3933 ++m_infree(MC_16KCL);
3934 ++m_total(MC_16KCL);
3935 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3936 ++count;
3937 }
3938 }
3939 VERIFY(mca_list == NULL && con_list == NULL);
3940
3941 /* We're done; let others enter */
3942 mb_clalloc_busy = FALSE;
3943 if (mb_clalloc_waiters > 0) {
3944 mb_clalloc_waiters = 0;
3945 wakeup(mb_clalloc_waitchan);
3946 }
3947
3948 return count;
3949out:
3950 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3951
3952 mtracelarge_register(size);
3953
3954 /* We're done; let others enter */
3955 mb_clalloc_busy = FALSE;
3956 if (mb_clalloc_waiters > 0) {
3957 mb_clalloc_waiters = 0;
3958 wakeup(mb_clalloc_waitchan);
3959 }
3960
3961 /*
3962 * When non-blocking, we kick the worker thread if we have to grow the
3963 * pool or if the number of free clusters is less than requested.
3964 */
3965 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3966 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3967 m_cname(class), i);
3968 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3969 mbuf_worker_needs_wakeup = FALSE;
3970 }
3971 if (class == MC_BIGCL) {
3972 if (i > 0) {
3973 /*
3974 * Remember total number of 4KB clusters needed
3975 * at this time.
3976 */
3977 i += m_total(MC_BIGCL);
3978 if (i > m_region_expand(MC_BIGCL)) {
3979 m_region_expand(MC_BIGCL) = i;
3980 }
3981 }
3982 if (m_infree(MC_BIGCL) >= num) {
3983 return 1;
3984 }
3985 } else {
3986 if (i > 0) {
3987 /*
3988 * Remember total number of 16KB clusters needed
3989 * at this time.
3990 */
3991 i += m_total(MC_16KCL);
3992 if (i > m_region_expand(MC_16KCL)) {
3993 m_region_expand(MC_16KCL) = i;
3994 }
3995 }
3996 if (m_infree(MC_16KCL) >= num) {
3997 return 1;
3998 }
3999 }
4000 return 0;
4001}
4002
4003/*
4004 * Populate the global freelist of the corresponding buffer class.
4005 */
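/*
 * Worked example (sketch only, assuming 4KB pages): populating MC_CL
 * carves each page into PAGE_SIZE / m_maxsize(MC_CL) == 2 clusters
 * (NCLPG), while populating MC_MBUF yields NMBPG mbufs per page. The
 * page is first accounted against the "super class" (MC_BIGCL in this
 * configuration) and then re-credited to the target class, as done in
 * the loop below.
 */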
4006static int
4007freelist_populate(mbuf_class_t class, unsigned int num, int wait)
4008{
4009 mcache_obj_t *o = NULL;
4010 int i, numpages = 0, count;
4011 mbuf_class_t super_class;
4012
4013 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
4014 class == MC_16KCL);
4015
4016 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4017
4018 VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
4019 PAGE_SIZE == m_maxsize(MC_16KCL));
4020
4021 if (m_maxsize(class) >= PAGE_SIZE) {
4022 return m_clalloc(num, wait, m_maxsize(class)) != 0;
4023 }
4024
4025 /*
4026 * The rest of this function allocates pages and slices them
4027 * up into objects of the requested size.
4028 */
4029
4030 numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
4031
4032 /* Currently assume that pages are 4K or 16K */
4033 if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
4034 super_class = MC_BIGCL;
4035 } else {
4036 super_class = MC_16KCL;
4037 }
4038
4039 i = m_clalloc(numpages, wait, m_maxsize(super_class));
4040
4041 /* how many objects will we cut the page into? */
4042 int numobj = PAGE_SIZE / m_maxsize(class);
4043
4044 for (count = 0; count < numpages; count++) {
4045 /* respect totals, minlimit, maxlimit */
4046 if (m_total(super_class) <= m_minlimit(super_class) ||
4047 m_total(class) >= m_maxlimit(class)) {
4048 break;
4049 }
4050
4051 if ((o = slab_alloc(super_class, wait)) == NULL) {
4052 break;
4053 }
4054
4055 struct mbuf *m = (struct mbuf *)o;
4056 union mcluster *c = (union mcluster *)o;
4057 union mbigcluster *mbc = (union mbigcluster *)o;
4058 mcl_slab_t *sp = slab_get(o);
4059 mcache_audit_t *mca = NULL;
4060
4061 /*
4062 * Since one full page will be converted to MC_MBUF or
4063 * MC_CL objects, verify that the reference count matches
4064 * that assumption.
4065 */
4066 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
4067 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
4068 /*
4069 * Make sure that the cluster is unmolested
4070 * while in the freelist.
4071 */
4072 if (mclverify) {
4073 mca = mcl_audit_buf2mca(super_class,
4074 (mcache_obj_t *)o);
4075 mcache_audit_free_verify(mca,
4076 (mcache_obj_t *)o, 0, m_maxsize(super_class));
4077 }
4078
4079 /* Reinitialize it as an mbuf or 2K or 4K slab */
4080 slab_init(sp, class, sp->sl_flags,
4081 sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
4082
4083 VERIFY(sp->sl_head == NULL);
4084
4085 VERIFY(m_total(super_class) >= 1);
4086 m_total(super_class)--;
4087
4088 if (super_class == MC_BIGCL) {
4089 mbstat.m_bigclusters = m_total(MC_BIGCL);
4090 }
4091
4092 m_total(class) += numobj;
4093 VERIFY(m_total(class) <= m_maxlimit(class));
4094 m_infree(class) += numobj;
4095
4096 i = numobj;
4097 if (class == MC_MBUF) {
4098 mbstat.m_mbufs = m_total(MC_MBUF);
4099 mtype_stat_add(MT_FREE, NMBPG);
4100 while (i--) {
4101 /*
4102 * If auditing is enabled, construct the
4103 * shadow mbuf in the audit structure
4104 * instead of the actual one.
4105 * mbuf_slab_audit() will take care of
4106 * restoring the contents after the
4107 * integrity check.
4108 */
4109 if (mclaudit != NULL) {
4110 struct mbuf *ms;
4111 mca = mcl_audit_buf2mca(MC_MBUF,
4112 (mcache_obj_t *)m);
4113 ms = MCA_SAVED_MBUF_PTR(mca);
4114 ms->m_type = MT_FREE;
4115 } else {
4116 m->m_type = MT_FREE;
4117 }
4118 m->m_next = sp->sl_head;
4119 sp->sl_head = (void *)m++;
4120 }
4121 } else if (class == MC_CL) { /* MC_CL */
4122 mbstat.m_clfree =
4123 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
4124 mbstat.m_clusters = m_total(MC_CL);
4125 while (i--) {
4126 c->mcl_next = sp->sl_head;
4127 sp->sl_head = (void *)c++;
4128 }
4129 } else {
4130 VERIFY(class == MC_BIGCL);
4131 mbstat.m_bigclusters = m_total(MC_BIGCL);
4132 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
4133 m_infree(MC_MBUF_BIGCL);
4134 while (i--) {
4135 mbc->mbc_next = sp->sl_head;
4136 sp->sl_head = (void *)mbc++;
4137 }
4138 }
4139
4140 /* Insert into the mbuf or 2k or 4k slab list */
4141 slab_insert(sp, class);
4142
4143 if ((i = mb_waiters) > 0) {
4144 mb_waiters = 0;
4145 }
4146 if (i != 0) {
4147 mbwdog_logger("waking up all threads");
4148 wakeup(mb_waitchan);
4149 }
4150 }
4151 return count != 0;
4152}
4153
4154/*
4155 * For each class, initialize the freelist to hold m_minlimit() objects.
4156 */
4157static void
4158freelist_init(mbuf_class_t class)
4159{
4160 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4161
4162 VERIFY(class == MC_CL || class == MC_BIGCL);
4163 VERIFY(m_total(class) == 0);
4164 VERIFY(m_minlimit(class) > 0);
4165
4166 while (m_total(class) < m_minlimit(class)) {
4167 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
4168 }
4169
4170 VERIFY(m_total(class) >= m_minlimit(class));
4171}
4172
4173/*
4174 * (Inaccurately) check if it might be worth a trip back to the
4175 * mcache layer due to the availability of objects there. We'll
4176 * end up back here if there's nothing up there.
4177 */
4178static boolean_t
4179mbuf_cached_above(mbuf_class_t class, int wait)
4180{
4181 switch (class) {
4182 case MC_MBUF:
4183 if (wait & MCR_COMP) {
4184 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
4185 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4186 }
4187 break;
4188
4189 case MC_CL:
4190 if (wait & MCR_COMP) {
4191 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
4192 }
4193 break;
4194
4195 case MC_BIGCL:
4196 if (wait & MCR_COMP) {
4197 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4198 }
4199 break;
4200
4201 case MC_16KCL:
4202 if (wait & MCR_COMP) {
4203 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
4204 }
4205 break;
4206
4207 case MC_MBUF_CL:
4208 case MC_MBUF_BIGCL:
4209 case MC_MBUF_16KCL:
4210 break;
4211
4212 default:
4213 VERIFY(0);
4214 /* NOTREACHED */
4215 }
4216
4217 return !mcache_bkt_isempty(m_cache(class));
4218}
4219
4220/*
4221 * If possible, convert constructed objects to raw ones.
4222 */
4223static boolean_t
4224mbuf_steal(mbuf_class_t class, unsigned int num)
4225{
4226 mcache_obj_t *top = NULL;
4227 mcache_obj_t **list = &top;
4228 unsigned int tot = 0;
4229
4230 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4231
4232 switch (class) {
4233 case MC_MBUF:
4234 case MC_CL:
4235 case MC_BIGCL:
4236 case MC_16KCL:
4237 return FALSE;
4238
4239 case MC_MBUF_CL:
4240 case MC_MBUF_BIGCL:
4241 case MC_MBUF_16KCL:
4242 /* Get the required number of constructed objects if possible */
4243 if (m_infree(class) > m_minlimit(class)) {
4244 tot = cslab_alloc(class, &list,
4245 MIN(num, m_infree(class)));
4246 }
4247
4248 /* And destroy them to get back the raw objects */
4249 if (top != NULL) {
4250 (void) cslab_free(class, top, 1);
4251 }
4252 break;
4253
4254 default:
4255 VERIFY(0);
4256 /* NOTREACHED */
4257 }
4258
4259 return tot == num;
4260}
4261
4262static void
4263m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
4264{
4265 int m, bmap = 0;
4266
4267 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4268
4269 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
4270 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
4271 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
4272
4273 /*
4274 * This logic can be made smarter; for now, simply mark
4275 * all other related classes as potential victims.
4276 */
4277 switch (class) {
4278 case MC_MBUF:
4279 m_wantpurge(MC_CL)++;
4280 m_wantpurge(MC_BIGCL)++;
4281 m_wantpurge(MC_MBUF_CL)++;
4282 m_wantpurge(MC_MBUF_BIGCL)++;
4283 break;
4284
4285 case MC_CL:
4286 m_wantpurge(MC_MBUF)++;
4287 m_wantpurge(MC_BIGCL)++;
4288 m_wantpurge(MC_MBUF_BIGCL)++;
4289 if (!comp) {
4290 m_wantpurge(MC_MBUF_CL)++;
4291 }
4292 break;
4293
4294 case MC_BIGCL:
4295 m_wantpurge(MC_MBUF)++;
4296 m_wantpurge(MC_CL)++;
4297 m_wantpurge(MC_MBUF_CL)++;
4298 if (!comp) {
4299 m_wantpurge(MC_MBUF_BIGCL)++;
4300 }
4301 break;
4302
4303 case MC_16KCL:
4304 if (!comp) {
4305 m_wantpurge(MC_MBUF_16KCL)++;
4306 }
4307 break;
4308
4309 default:
4310 VERIFY(0);
4311 /* NOTREACHED */
4312 }
4313
4314 /*
4315 * Run through each marked class and check if we really need to
4316 * purge (and therefore temporarily disable) the per-CPU caches
4317 * layer used by the class. If so, remember the classes since
4318 * we are going to drop the lock below prior to purging.
4319 */
4320 for (m = 0; m < NELEM(mbuf_table); m++) {
4321 if (m_wantpurge(m) > 0) {
4322 m_wantpurge(m) = 0;
4323 /*
4324 * Try hard to steal the required number of objects
4325 * from the freelist of other mbuf classes. Only
4326 * purge and disable the per-CPU caches layer when
4327 * we don't have enough; it's the last resort.
4328 */
4329 if (!mbuf_steal(m, num)) {
4330 bmap |= (1 << m);
4331 }
4332 }
4333 }
4334
4335 lck_mtx_unlock(mbuf_mlock);
4336
4337 if (bmap != 0) {
4338 /* signal the domains to drain */
4339 net_drain_domains();
4340
4341 /* Sigh; we have no other choices but to ask mcache to purge */
4342 for (m = 0; m < NELEM(mbuf_table); m++) {
4343 if ((bmap & (1 << m)) &&
4344 mcache_purge_cache(m_cache(m), TRUE)) {
4345 lck_mtx_lock(mbuf_mlock);
4346 m_purge_cnt(m)++;
4347 mbstat.m_drain++;
4348 lck_mtx_unlock(mbuf_mlock);
4349 }
4350 }
4351 } else {
4352 /*
4353 * Request mcache to reap extra elements from all of its caches;
4354 * note that all reaps are serialized and happen only at a fixed
4355 * interval.
4356 */
4357 mcache_reap();
4358 }
4359 lck_mtx_lock(mbuf_mlock);
4360}
4361#endif /* CONFIG_MBUF_MCACHE */
4362
4363static inline struct mbuf *
4364m_get_common(int wait, short type, int hdr)
4365{
4366 struct mbuf *m;
4367
4368#if CONFIG_MBUF_MCACHE
4369 int mcflags = MSLEEPF(wait);
4370
4371 /* Is this due to a non-blocking retry? If so, then try harder */
4372 if (mcflags & MCR_NOSLEEP) {
4373 mcflags |= MCR_TRYHARD;
4374 }
4375
4376 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
4377#else
4378 m = mz_alloc(wait);
4379#endif /* CONFIG_MBUF_MCACHE */
4380 if (m != NULL) {
4381 MBUF_INIT(m, hdr, type);
4382 mtype_stat_inc(type);
4383 mtype_stat_dec(MT_FREE);
4384 }
4385 return m;
4386}
4387
4388/*
4389 * Space allocation routines; these are also available as macros
4390 * for critical paths.
4391 */
4392#define _M_GET(wait, type) m_get_common(wait, type, 0)
4393#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
4394#define _M_RETRY(wait, type) _M_GET(wait, type)
4395#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
4396#define _MGET(m, how, type) ((m) = _M_GET(how, type))
4397#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
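/*
 * Typical usage of the allocators below (illustrative only):
 *
 *     struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *     if (m == NULL)
 *             return ENOBUFS;
 *
 * m_get() returns a plain mbuf and m_gethdr() one with a packet header;
 * both may return NULL when M_DONTWAIT is passed and memory is tight.
 */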
4398
4399struct mbuf *
4400m_get(int wait, int type)
4401{
4402 return _M_GET(wait, type);
4403}
4404
4405struct mbuf *
4406m_gethdr(int wait, int type)
4407{
4408 return _M_GETHDR(wait, type);
4409}
4410
4411struct mbuf *
4412m_retry(int wait, int type)
4413{
4414 return _M_RETRY(wait, type);
4415}
4416
4417struct mbuf *
4418m_retryhdr(int wait, int type)
4419{
4420 return _M_RETRYHDR(wait, type);
4421}
4422
4423struct mbuf *
4424m_getclr(int wait, int type)
4425{
4426 struct mbuf *m;
4427
4428 _MGET(m, wait, type);
4429 if (m != NULL) {
4430 bzero(MTOD(m, caddr_t), MLEN);
4431 }
4432 return m;
4433}
4434
4435static int
4436m_free_paired(struct mbuf *m)
4437{
4438 VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
4439
4440 os_atomic_thread_fence(seq_cst);
4441 if (MEXT_PMBUF(m) == m) {
4442 /*
4443 * The paired ref count might be negative if we lose a race
4444 * against another thread clearing MEXT_PMBUF after the
4445 * memory barrier sync above. In that case just ignore it,
4446 * as things have already been unpaired.
4447 */
4448 int16_t prefcnt = os_atomic_dec(&MEXT_PREF(m), acq_rel);
4449 if (prefcnt > 1) {
4450 return 1;
4451 } else if (prefcnt == 1) {
4452 m_ext_free_func_t m_free_func = m_get_ext_free(m);
4453 VERIFY(m_free_func != NULL);
4454 (*m_free_func)(m->m_ext.ext_buf,
4455 m->m_ext.ext_size, m_get_ext_arg(m));
4456 return 1;
4457 } else if (prefcnt == 0) {
4458 VERIFY(MBUF_IS_PAIRED(m));
4459
4460 /*
4461 * Restore minref to its natural value, so that
4462 * the caller will be able to free the cluster
4463 * as appropriate.
4464 */
4465 MEXT_MINREF(m) = 0;
4466
4467 /*
4468 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
4469 * as it is immutable. The release store below also
4470 * provides the memory barrier sync.
4471 */
4472 os_atomic_store(&MEXT_PMBUF(m), NULL, release);
4473
4474 switch (m->m_ext.ext_size) {
4475 case MCLBYTES:
4476 m_set_ext(m, m_get_rfa(m), NULL, NULL);
4477 break;
4478
4479 case MBIGCLBYTES:
4480 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
4481 break;
4482
4483 case M16KCLBYTES:
4484 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
4485 break;
4486
4487 default:
4488 VERIFY(0);
4489 /* NOTREACHED */
4490 }
4491 }
4492 }
4493
4494 /*
4495 * Tell caller the unpair has occurred, and that the reference
4496 * count on the external cluster held for the paired mbuf should
4497 * now be dropped.
4498 */
4499 return 0;
4500}
4501
4502struct mbuf *
4503m_free(struct mbuf *m)
4504{
4505 struct mbuf *n = m->m_next;
4506
4507 if (m->m_type == MT_FREE) {
4508 panic("m_free: freeing an already freed mbuf");
4509 }
4510
4511 if (m->m_flags & M_PKTHDR) {
4512 /* Check for scratch area overflow */
4513 m_redzone_verify(m);
4514 /* Free the aux data and tags if there is any */
4515 m_tag_delete_chain(m);
4516
4517 m_do_tx_compl_callback(m, NULL);
4518 }
4519
4520 if (m->m_flags & M_EXT) {
4521 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4522 return n;
4523 }
4524 /*
4525 * Make sure that we don't touch any ext_ref
4526 * member after we decrement the reference count
4527 * since that may lead to use-after-free
4528 * when we do not hold the last reference.
4529 */
4530 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
4531 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
4532 const uint16_t minref = MEXT_MINREF(m);
4533 const uint16_t refcnt = m_decref(m);
4534
4535 if (refcnt == minref && !composite) {
4536#if CONFIG_MBUF_MCACHE
4537 if (m_free_func == NULL) {
4538 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
4539 } else if (m_free_func == m_bigfree) {
4540 mcache_free(m_cache(MC_BIGCL),
4541 m->m_ext.ext_buf);
4542 } else if (m_free_func == m_16kfree) {
4543 mcache_free(m_cache(MC_16KCL),
4544 m->m_ext.ext_buf);
4545 } else {
4546 (*m_free_func)(m->m_ext.ext_buf,
4547 m->m_ext.ext_size, m_get_ext_arg(m));
4548 }
4549 mcache_free(ref_cache, m_get_rfa(m));
4550#else
4551 if (m_free_func == NULL) {
4552 mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
4553 } else if (m_free_func == m_bigfree) {
4554 mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
4555 } else if (m_free_func == m_16kfree) {
4556 mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
4557 } else {
4558 (*m_free_func)(m->m_ext.ext_buf,
4559 m->m_ext.ext_size, m_get_ext_arg(m));
4560 }
4561 mz_ref_free(m_get_rfa(m));
4562#endif /* CONFIG_MBUF_MCACHE */
4563 m_set_ext(m, NULL, NULL, NULL);
4564 } else if (refcnt == minref && composite) {
4565 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4566
4567 mtype_stat_dec(m->m_type);
4568 mtype_stat_inc(MT_FREE);
4569
4570 m->m_type = MT_FREE;
4571 m->m_flags = M_EXT;
4572 m->m_len = 0;
4573 m->m_next = m->m_nextpkt = NULL;
4574 /*
4575 * MEXT_FLAGS is safe to access here
4576 * since we are now sure that we held
4577 * the last reference to ext_ref.
4578 */
4579 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4580
4581#if CONFIG_MBUF_MCACHE
4582 /* "Free" into the intermediate cache */
4583 if (m_free_func == NULL) {
4584 mcache_free(m_cache(MC_MBUF_CL), m);
4585 } else if (m_free_func == m_bigfree) {
4586 mcache_free(m_cache(MC_MBUF_BIGCL), m);
4587 } else {
4588 VERIFY(m_free_func == m_16kfree);
4589 mcache_free(m_cache(MC_MBUF_16KCL), m);
4590 }
4591#else
4592 /* "Free" into the intermediate cache */
4593 if (m_free_func == NULL) {
4594 mz_composite_free(MC_MBUF_CL, m);
4595 } else if (m_free_func == m_bigfree) {
4596 mz_composite_free(MC_MBUF_BIGCL, m);
4597 } else {
4598 VERIFY(m_free_func == m_16kfree);
4599 mz_composite_free(MC_MBUF_16KCL, m);
4600 }
4601#endif /* CONFIG_MBUF_MCACHE */
4602 return n;
4603 }
4604 }
4605
4606 mtype_stat_dec(m->m_type);
4607 mtype_stat_inc(MT_FREE);
4608
4609 m->m_type = MT_FREE;
4610 m->m_flags = m->m_len = 0;
4611 m->m_next = m->m_nextpkt = NULL;
4612
4613#if CONFIG_MBUF_MCACHE
4614 mcache_free(m_cache(MC_MBUF), m);
4615#else
4616 mz_free(m);
4617#endif /* CONFIG_MBUF_MCACHE */
4618
4619 return n;
4620}
4621
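/*
 * m_clattach() attaches a caller-supplied external buffer to an mbuf
 * (allocating the mbuf and the ext_ref as needed) and registers a free
 * routine invoked when the last reference goes away. A minimal sketch,
 * with a hypothetical driver buffer and free routine:
 *
 *     static void my_extfree(caddr_t buf, u_int size, caddr_t arg);
 *
 *     m = m_clattach(NULL, MT_DATA, (caddr_t)my_buf, my_extfree,
 *         my_buf_size, NULL, M_DONTWAIT, 0);
 *
 * Passing pair != 0 instead creates a paired mbuf/cluster (EXTF_PAIRED);
 * see m_free_paired() above.
 */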
4622__private_extern__ struct mbuf *
4623m_clattach(struct mbuf *m, int type, caddr_t extbuf,
4624 void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
4625 int wait, int pair)
4626{
4627 struct ext_ref *rfa = NULL;
4628
4629 /*
4630 * If pairing is requested and an existing mbuf is provided, reject
4631 * it if it's already been paired to another cluster. Otherwise,
4632 * allocate a new one or free any existing below.
4633 */
4634 if ((m != NULL && MBUF_IS_PAIRED(m)) ||
4635 (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
4636 return NULL;
4637 }
4638
4639 if (m->m_flags & M_EXT) {
4640 /*
4641 * Make sure that we don't touch any ext_ref
4642 * member after we decrement the reference count
4643 * since that may lead to use-after-free
4644 * when we do not hold the last reference.
4645 */
4646 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
4647 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
4648 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
4649 const uint16_t minref = MEXT_MINREF(m);
4650 const uint16_t refcnt = m_decref(m);
4651
4652 if (refcnt == minref && !composite) {
4653#if CONFIG_MBUF_MCACHE
4654 if (m_free_func == NULL) {
4655 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
4656 } else if (m_free_func == m_bigfree) {
4657 mcache_free(m_cache(MC_BIGCL),
4658 m->m_ext.ext_buf);
4659 } else if (m_free_func == m_16kfree) {
4660 mcache_free(m_cache(MC_16KCL),
4661 m->m_ext.ext_buf);
4662 } else {
4663 (*m_free_func)(m->m_ext.ext_buf,
4664 m->m_ext.ext_size, m_get_ext_arg(m));
4665 }
4666#else
4667 if (m_free_func == NULL) {
4668 mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
4669 } else if (m_free_func == m_bigfree) {
4670 mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
4671 } else if (m_free_func == m_16kfree) {
4672 mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
4673 } else {
4674 (*m_free_func)(m->m_ext.ext_buf,
4675 m->m_ext.ext_size, m_get_ext_arg(m));
4676 }
4677#endif /* CONFIG_MBUF_MCACHE */
4678 /* Re-use the reference structure */
4679 rfa = m_get_rfa(m);
4680 } else if (refcnt == minref && composite) {
4681 VERIFY(m->m_type != MT_FREE);
4682
4683 mtype_stat_dec(m->m_type);
4684 mtype_stat_inc(MT_FREE);
4685
4686 m->m_type = MT_FREE;
4687 m->m_flags = M_EXT;
4688 m->m_len = 0;
4689 m->m_next = m->m_nextpkt = NULL;
4690
4691 /*
4692 * MEXT_FLAGS is safe to access here
4693 * since we are now sure that we held
4694 * the last reference to ext_ref.
4695 */
4696 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4697
4698 /* "Free" into the intermediate cache */
4699#if CONFIG_MBUF_MCACHE
4700 if (m_free_func == NULL) {
4701 mcache_free(m_cache(MC_MBUF_CL), m);
4702 } else if (m_free_func == m_bigfree) {
4703 mcache_free(m_cache(MC_MBUF_BIGCL), m);
4704 } else {
4705 VERIFY(m_free_func == m_16kfree);
4706 mcache_free(m_cache(MC_MBUF_16KCL), m);
4707 }
4708#else
4709 if (m_free_func == NULL) {
4710 mz_composite_free(MC_MBUF_CL, m);
4711 } else if (m_free_func == m_bigfree) {
4712 mz_composite_free(MC_MBUF_BIGCL, m);
4713 } else {
4714 VERIFY(m_free_func == m_16kfree);
4715 mz_composite_free(MC_MBUF_16KCL, m);
4716 }
4717#endif /* CONFIG_MBUF_MCACHE */
4718 /*
4719 * Allocate a new mbuf, since we didn't divorce
4720 * the composite mbuf + cluster pair above.
4721 */
4722 if ((m = _M_GETHDR(wait, type)) == NULL) {
4723 return NULL;
4724 }
4725 }
4726 }
4727
4728#if CONFIG_MBUF_MCACHE
4729 if (rfa == NULL &&
4730 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4731 m_free(m);
4732 return NULL;
4733 }
4734#else
4735 if (rfa == NULL &&
4736 (rfa = mz_ref_alloc(wait)) == NULL) {
4737 m_free(m);
4738 return NULL;
4739 }
4740#endif /* CONFIG_MBUF_MCACHE */
4741
4742 if (!pair) {
4743 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
4744 0, 1, 0, 0, 0, NULL);
4745 } else {
4746 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
4747 1, 1, 1, EXTF_PAIRED, 0, m);
4748 }
4749
4750 return m;
4751}
4752
4753/*
4754 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
4755 * clusters. (If the cache is empty, new clusters are allocated en masse.)
4756 */
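/*
 * For example (illustrative only), a caller wanting a packet-header
 * mbuf with a 2KB cluster already attached might do:
 *
 *     struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *
 * which draws a composite mbuf+cluster element in one shot rather than
 * calling m_gethdr() followed by m_mclget().
 */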
4757struct mbuf *
4758m_getcl(int wait, int type, int flags)
4759{
4760 struct mbuf *m = NULL;
4761 int hdr = (flags & M_PKTHDR);
4762
4763#if CONFIG_MBUF_MCACHE
4764 int mcflags = MSLEEPF(wait);
4765
4766 /* Is this due to a non-blocking retry? If so, then try harder */
4767 if (mcflags & MCR_NOSLEEP) {
4768 mcflags |= MCR_TRYHARD;
4769 }
4770
4771 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4772#else
4773 m = mz_composite_alloc(MC_MBUF_CL, wait);
4774#endif /* CONFIG_MBUF_MCACHE */
4775 if (m != NULL) {
4776 u_int16_t flag;
4777 struct ext_ref *rfa;
4778 void *cl;
4779
4780 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4781 cl = m->m_ext.ext_buf;
4782 rfa = m_get_rfa(m);
4783
4784 ASSERT(cl != NULL && rfa != NULL);
4785 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4786
4787 flag = MEXT_FLAGS(m);
4788
4789 MBUF_INIT(m, hdr, type);
4790 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4791
4792 mtype_stat_inc(type);
4793 mtype_stat_dec(MT_FREE);
4794 }
4795 return m;
4796}
4797
4798 /* m_mclget() adds an mbuf cluster to a normal mbuf */
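/*
 * Note that m_mclget() returns the original mbuf even when the cluster
 * cannot be allocated, so callers must test M_EXT afterwards; a sketch:
 *
 *     m = m_mclget(m, M_DONTWAIT);
 *     if (!(m->m_flags & M_EXT))
 *             m_free(m);              (or take another recovery path)
 */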
4799struct mbuf *
4800m_mclget(struct mbuf *m, int wait)
4801{
4802 struct ext_ref *rfa = NULL;
4803
4804#if CONFIG_MBUF_MCACHE
4805 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4806 return m;
4807 }
4808#else
4809 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4810 return m;
4811 }
4812#endif /* CONFIG_MBUF_MCACHE */
4813 m->m_ext.ext_buf = m_mclalloc(wait);
4814 if (m->m_ext.ext_buf != NULL) {
4815 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4816 } else {
4817#if CONFIG_MBUF_MCACHE
4818 mcache_free(ref_cache, rfa);
4819#else
4820 mz_ref_free(rfa);
4821#endif /* CONFIG_MBUF_MCACHE */
4822 }
4823
4824 return m;
4825}
4826
4827/* Allocate an mbuf cluster */
4828caddr_t
4829m_mclalloc(int wait)
4830{
4831#if CONFIG_MBUF_MCACHE
4832 int mcflags = MSLEEPF(wait);
4833
4834 /* Is this due to a non-blocking retry? If so, then try harder */
4835 if (mcflags & MCR_NOSLEEP) {
4836 mcflags |= MCR_TRYHARD;
4837 }
4838
4839 return mcache_alloc(m_cache(MC_CL), mcflags);
4840#else
4841 return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait);
4842#endif /* CONFIG_MBUF_MCACHE */
4843}
4844
4845/* Free an mbuf cluster */
4846void
4847m_mclfree(caddr_t p)
4848{
4849#if CONFIG_MBUF_MCACHE
4850 mcache_free(m_cache(MC_CL), p);
4851#else
4852 mz_cl_free(ZONE_ID_CLUSTER_2K, p);
4853#endif /* CONFIG_MBUF_MCACHE */
4854}
4855
4856/*
4857 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
4858 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4859 */
4860int
4861m_mclhasreference(struct mbuf *m)
4862{
4863 if (!(m->m_flags & M_EXT)) {
4864 return 0;
4865 }
4866
4867 ASSERT(m_get_rfa(m) != NULL);
4868
4869 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4870}
4871
4872__private_extern__ caddr_t
4873m_bigalloc(int wait)
4874{
4875#if CONFIG_MBUF_MCACHE
4876 int mcflags = MSLEEPF(wait);
4877
4878 /* Is this due to a non-blocking retry? If so, then try harder */
4879 if (mcflags & MCR_NOSLEEP) {
4880 mcflags |= MCR_TRYHARD;
4881 }
4882
4883 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
4884#else
4885 return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait);
4886#endif /* CONFIG_MBUF_MCACHE */
4887}
4888
4889__private_extern__ void
4890m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4891{
4892#if CONFIG_MBUF_MCACHE
4893 mcache_free(m_cache(MC_BIGCL), p);
4894#else
4895 mz_cl_free(ZONE_ID_CLUSTER_4K, p);
4896#endif /* CONFIG_MBUF_MCACHE */
4897}
4898
4899 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
4900__private_extern__ struct mbuf *
4901m_mbigget(struct mbuf *m, int wait)
4902{
4903 struct ext_ref *rfa = NULL;
4904
4905#if CONFIG_MBUF_MCACHE
4906 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4907 return m;
4908 }
4909#else
4910 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4911 return m;
4912 }
4913#endif /* CONFIG_MBUF_MCACHE */
4914 m->m_ext.ext_buf = m_bigalloc(wait);
4915 if (m->m_ext.ext_buf != NULL) {
4916 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4917 } else {
4918#if CONFIG_MBUF_MCACHE
4919 mcache_free(ref_cache, rfa);
4920#else
4921 mz_ref_free(rfa);
4922#endif /* CONFIG_MBUF_MCACHE */
4923 }
4924 return m;
4925}
4926
4927__private_extern__ caddr_t
4928m_16kalloc(int wait)
4929{
4930#if CONFIG_MBUF_MCACHE
4931 int mcflags = MSLEEPF(wait);
4932
4933 /* Is this due to a non-blocking retry? If so, then try harder */
4934 if (mcflags & MCR_NOSLEEP) {
4935 mcflags |= MCR_TRYHARD;
4936 }
4937
4938 return mcache_alloc(m_cache(MC_16KCL), mcflags);
4939#else
4940 return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait);
4941#endif /* CONFIG_MBUF_MCACHE */
4942}
4943
4944__private_extern__ void
4945m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4946{
4947#if CONFIG_MBUF_MCACHE
4948 mcache_free(m_cache(MC_16KCL), p);
4949#else
4950 mz_cl_free(ZONE_ID_CLUSTER_16K, p);
4951#endif /* CONFIG_MBUF_MCACHE */
4952}
4953
4954 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
4955__private_extern__ struct mbuf *
4956m_m16kget(struct mbuf *m, int wait)
4957{
4958 struct ext_ref *rfa = NULL;
4959
4960#if CONFIG_MBUF_MCACHE
4961 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4962 return m;
4963 }
4964#else
4965 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4966 return m;
4967 }
4968#endif /* CONFIG_MBUF_MCACHE */
4969 m->m_ext.ext_buf = m_16kalloc(wait);
4970 if (m->m_ext.ext_buf != NULL) {
4971 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4972 } else {
4973#if CONFIG_MBUF_MCACHE
4974 mcache_free(ref_cache, rfa);
4975#else
4976 mz_ref_free(rfa);
4977#endif /* CONFIG_MBUF_MCACHE */
4978 }
4979
4980 return m;
4981}
4982
4983/*
4984 * "Move" mbuf pkthdr from "from" to "to".
4985 * "from" must have M_PKTHDR set, and "to" must be empty.
4986 */
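/*
 * Usage sketch (illustrative): after
 *
 *     m_copy_pkthdr(to, from);
 *
 * "to" owns the packet header (including the tag chain), while "from"
 * has had its tags, classifier state and scratch area purged, as done
 * in the body below.
 */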
4987void
4988m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4989{
4990 VERIFY(from->m_flags & M_PKTHDR);
4991
4992 /* Check for scratch area overflow */
4993 m_redzone_verify(from);
4994
4995 if (to->m_flags & M_PKTHDR) {
4996 /* Check for scratch area overflow */
4997 m_redzone_verify(to);
4998 /* We will be taking over the tags of 'to' */
4999 m_tag_delete_chain(to);
5000 }
5001 to->m_pkthdr = from->m_pkthdr; /* especially tags */
5002 m_classifier_init(from, 0); /* purge classifier info */
5003 m_tag_init(from, 1); /* purge all tags from src */
5004 m_scratch_init(from); /* clear src scratch area */
5005 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5006 if ((to->m_flags & M_EXT) == 0) {
5007 to->m_data = (uintptr_t)to->m_pktdat;
5008 }
5009 m_redzone_init(to); /* setup red zone on dst */
5010}
5011
5012/*
5013 * Duplicate "from"'s mbuf pkthdr in "to".
5014 * "from" must have M_PKTHDR set, and "to" must be empty.
5015 * In particular, this does a deep copy of the packet tags.
5016 */
5017int
5018m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
5019{
5020 VERIFY(from->m_flags & M_PKTHDR);
5021
5022 /* Check for scratch area overflow */
5023 m_redzone_verify(from);
5024
5025 if (to->m_flags & M_PKTHDR) {
5026 /* Check for scratch area overflow */
5027 m_redzone_verify(to);
5028 /* We will be taking over the tags of 'to' */
5029 m_tag_delete_chain(to);
5030 }
5031 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5032 if ((to->m_flags & M_EXT) == 0) {
5033 to->m_data = (uintptr_t)to->m_pktdat;
5034 }
5035 to->m_pkthdr = from->m_pkthdr;
5036 /* clear TX completion flag so the callback is not called in the copy */
5037 to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
5038 m_redzone_init(to); /* setup red zone on dst */
5039 m_tag_init(to, 0); /* preserve dst static tags */
5040 return m_tag_copy_chain(to, from, how);
5041}
5042
5043void
5044m_copy_pftag(struct mbuf *to, struct mbuf *from)
5045{
5046 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
5047#if PF_ECN
5048 m_pftag(to)->pftag_hdr = NULL;
5049 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
5050#endif /* PF_ECN */
5051}
5052
5053void
5054m_copy_necptag(struct mbuf *to, struct mbuf *from)
5055{
5056 memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
5057}
5058
5059void
5060m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
5061{
5062 VERIFY(m->m_flags & M_PKTHDR);
5063
5064 m->m_pkthdr.pkt_proto = 0;
5065 m->m_pkthdr.pkt_flowsrc = 0;
5066 m->m_pkthdr.pkt_flowid = 0;
5067 m->m_pkthdr.pkt_ext_flags = 0;
5068 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
5069 /* preserve service class and interface info for loopback packets */
5070 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
5071 (void) m_set_service_class(m, MBUF_SC_BE);
5072 }
5073 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
5074 m->m_pkthdr.pkt_ifainfo = 0;
5075 }
5076 /*
5077 * Preserve timestamp if requested
5078 */
5079 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
5080 m->m_pkthdr.pkt_timestamp = 0;
5081 }
5082}
5083
5084void
5085m_copy_classifier(struct mbuf *to, struct mbuf *from)
5086{
5087 VERIFY(to->m_flags & M_PKTHDR);
5088 VERIFY(from->m_flags & M_PKTHDR);
5089
5090 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
5091 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
5092 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
5093 to->m_pkthdr.pkt_mpriv_srcid = from->m_pkthdr.pkt_mpriv_srcid;
5094 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
5095 to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
5096 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
5097 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
5098}
5099
5100/*
5101 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5102 * if wantall is not set, return however many are available. Set up the
5103 * first num_with_pkthdrs mbuf hdrs as packet headers; these are chained
5104 * on the m_nextpkt field. Any mbufs requested beyond this are chained
5105 * onto the last packet header's m_next field. The size of the cluster
5106 * attached to each mbuf is controlled by the parameter bufsize.
5107 */
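/*
 * For instance, a request with *num_needed == 4 and num_with_pkthdrs == 2
 * would come back shaped roughly like this (sketch only):
 *
 *     pkt1 (pkthdr) -m_nextpkt-> pkt2 (pkthdr) -m_next-> m3 -m_next-> m4
 *
 * i.e. the first num_with_pkthdrs elements are chained via m_nextpkt and
 * any remaining ones hang off the last packet header via m_next.
 */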
5108__private_extern__ struct mbuf *
5109m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
5110 int wait, int wantall, size_t bufsize)
5111{
5112 struct mbuf *m = NULL;
5113 struct mbuf **np, *top;
5114 unsigned int pnum, needed = *num_needed;
5115#if CONFIG_MBUF_MCACHE
5116 mcache_obj_t *mp_list = NULL;
5117 int mcflags = MSLEEPF(wait);
5118 mcache_t *cp;
5119#else
5120 zstack_t mp_list = {};
5121 mbuf_class_t class = MC_MBUF_CL;
5122#endif /* CONFIG_MBUF_MCACHE */
5123 u_int16_t flag;
5124 struct ext_ref *rfa;
5125 void *cl;
5126
5127 ASSERT(bufsize == m_maxsize(MC_CL) ||
5128 bufsize == m_maxsize(MC_BIGCL) ||
5129 bufsize == m_maxsize(MC_16KCL));
5130
5131 /*
5132 * Caller must first check for njcl because this
5133 * routine is internal and not exposed/used via KPI.
5134 */
5135 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
5136
5137 top = NULL;
5138 np = &top;
5139 pnum = 0;
5140
5141 /*
5142 * The caller doesn't want all the requested buffers; only some.
5143 * Try hard to get what we can, but don't block. This effectively
5144 * overrides MCR_SLEEP, since this thread will not go to sleep
5145 * if we can't get all the buffers.
5146 */
5147#if CONFIG_MBUF_MCACHE
5148 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5149 mcflags |= MCR_TRYHARD;
5150 }
5151
5152 /* Allocate the composite mbuf + cluster elements from the cache */
5153 if (bufsize == m_maxsize(MC_CL)) {
5154 cp = m_cache(MC_MBUF_CL);
5155 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5156 cp = m_cache(MC_MBUF_BIGCL);
5157 } else {
5158 cp = m_cache(MC_MBUF_16KCL);
5159 }
5160 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
5161#else
5162 if (!wantall || (wait & Z_NOWAIT)) {
5163 wait &= ~Z_NOWAIT;
5164 wait |= Z_NOPAGEWAIT;
5165 }
5166
5167 /* Allocate the composite mbuf + cluster elements from the cache */
5168 if (bufsize == m_maxsize(MC_CL)) {
5169 class = MC_MBUF_CL;
5170 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5171 class = MC_MBUF_BIGCL;
5172 } else {
5173 class = MC_MBUF_16KCL;
5174 }
5175 mp_list = mz_composite_alloc_n(class, needed, wait);
5176 needed = zstack_count(mp_list);
5177#endif /* CONFIG_MBUF_MCACHE */
5178
5179 for (pnum = 0; pnum < needed; pnum++) {
5180#if CONFIG_MBUF_MCACHE
5181 m = (struct mbuf *)mp_list;
5182 mp_list = mp_list->obj_next;
5183#else
5184 m = zstack_pop(&mp_list);
5185#endif /* CONFIG_MBUF_MCACHE */
5186
5187 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5188 cl = m->m_ext.ext_buf;
5189 rfa = m_get_rfa(m);
5190
5191 ASSERT(cl != NULL && rfa != NULL);
5192 VERIFY(MBUF_IS_COMPOSITE(m));
5193
5194 flag = MEXT_FLAGS(m);
5195
5196 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
5197 if (bufsize == m_maxsize(MC_16KCL)) {
5198 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5199 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5200 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5201 } else {
5202 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5203 }
5204
5205 if (num_with_pkthdrs > 0) {
5206 --num_with_pkthdrs;
5207 }
5208
5209 *np = m;
5210 if (num_with_pkthdrs > 0) {
5211 np = &m->m_nextpkt;
5212 } else {
5213 np = &m->m_next;
5214 }
5215 }
5216#if CONFIG_MBUF_MCACHE
5217 ASSERT(pnum != *num_needed || mp_list == NULL);
5218 if (mp_list != NULL) {
5219 mcache_free_ext(cp, mp_list);
5220 }
5221#else
5222 ASSERT(pnum != *num_needed || zstack_empty(mp_list));
5223 if (!zstack_empty(mp_list)) {
5224 mz_composite_free_n(class, mp_list);
5225 }
5226#endif /* CONFIG_MBUF_MCACHE */
5227 if (pnum > 0) {
5228 mtype_stat_add(MT_DATA, pnum);
5229 mtype_stat_sub(MT_FREE, pnum);
5230 }
5231
5232 if (wantall && (pnum != *num_needed)) {
5233 if (top != NULL) {
5234 m_freem_list(top);
5235 }
5236 return NULL;
5237 }
5238
5239 if (pnum > *num_needed) {
5240 printf("%s: File a radar related to <rdar://10146739>. \
5241 needed = %u, pnum = %u, num_needed = %u \n",
5242 __func__, needed, pnum, *num_needed);
5243 }
5244 *num_needed = pnum;
5245
5246 return top;
5247}
5248
5249/*
5250 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
5251 * wantall is not set, return however many are available. The size of
5252 * each packet in the list is controlled by the parameter packetlen. Each
5253 * packet may itself be a chain of mbufs linked by m_next; each mbuf in
5254 * that chain is called a segment. If maxsegments is not NULL and the
5255 * value it points to is not zero, it specifies the maximum number of
5256 * segments allowed per chain. If maxsegments is NULL or the value it
5257 * points to is zero, the caller places no restriction on the number of
5258 * segments. The actual number of segments per chain is returned in the
5259 * value pointed to by maxsegments.
5260 */
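/*
 * Sizing example (sketch only, assuming the usual 2KB/4KB cluster sizes):
 * a request for *numlist == 8 packets of packetlen == 3000 bytes with
 * wantsize == 0 selects bufsize = m_maxsize(MC_BIGCL) and nsegs == 1, so
 * each list element is a single composite mbuf + 4KB cluster. A packetlen
 * of 100 bytes would instead be satisfied with plain mbufs (bufsize <=
 * MHLEN, nsegs == 1).
 */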
5261__private_extern__ struct mbuf *
5262m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
5263 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
5264{
5265 struct mbuf **np, *top, *first = NULL;
5266 size_t bufsize, r_bufsize;
5267 unsigned int num = 0;
5268 unsigned int nsegs = 0;
5269 unsigned int needed = 0, resid;
5270#if CONFIG_MBUF_MCACHE
5271 int mcflags = MSLEEPF(wait);
5272 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
5273 mcache_t *cp = NULL, *rcp = NULL;
5274#else
5275 zstack_t mp_list = {}, rmp_list = {};
5276 mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL;
5277#endif /* CONFIG_MBUF_MCACHE */
5278
5279 if (*numlist == 0) {
5280 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
5281 return NULL;
5282 }
5283
5284 top = NULL;
5285 np = &top;
5286
5287 if (wantsize == 0) {
5288 if (packetlen <= MINCLSIZE) {
5289 bufsize = packetlen;
5290 } else if (packetlen > m_maxsize(MC_CL)) {
5291 /* Use 4KB if jumbo cluster pool isn't available */
5292 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
5293 bufsize = m_maxsize(MC_BIGCL);
5294 } else {
5295 bufsize = m_maxsize(MC_16KCL);
5296 }
5297 } else {
5298 bufsize = m_maxsize(MC_CL);
5299 }
5300 } else if (wantsize == m_maxsize(MC_CL) ||
5301 wantsize == m_maxsize(MC_BIGCL) ||
5302 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
5303 bufsize = wantsize;
5304 } else {
5305 *numlist = 0;
5306 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
5307 return NULL;
5308 }
5309
5310 if (bufsize <= MHLEN) {
5311 nsegs = 1;
5312 } else if (bufsize <= MINCLSIZE) {
5313 if (maxsegments != NULL && *maxsegments == 1) {
5314 bufsize = m_maxsize(MC_CL);
5315 nsegs = 1;
5316 } else {
5317 nsegs = 2;
5318 }
5319 } else if (bufsize == m_maxsize(MC_16KCL)) {
5320 VERIFY(njcl > 0);
5321 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
5322 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5323 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
5324 } else {
5325 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
5326 }
5327 if (maxsegments != NULL) {
5328 if (*maxsegments && nsegs > *maxsegments) {
5329 *maxsegments = nsegs;
5330 *numlist = 0;
5331 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
5332 return NULL;
5333 }
5334 *maxsegments = nsegs;
5335 }
5336
5337 /*
5338 * The caller doesn't want all the requested buffers; only some.
5339 * Try hard to get what we can, but don't block. This effectively
5340 * overrides MCR_SLEEP, since this thread will not go to sleep
5341 * if we can't get all the buffers.
5342 */
5343#if CONFIG_MBUF_MCACHE
5344 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5345 mcflags |= MCR_TRYHARD;
5346 }
5347#else
5348 if (!wantall || (wait & Z_NOWAIT)) {
5349 wait &= ~Z_NOWAIT;
5350 wait |= Z_NOPAGEWAIT;
5351 }
5352#endif /* !CONFIG_MBUF_MCACHE */
5353
5354 /*
5355 * Simple case where all elements in the lists/chains are mbufs.
5356 * Unless bufsize is greater than MHLEN, each segment chain is made
5357 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
5358 * of 2 mbufs; the second one is used for the residual data, i.e.
5359 * the remaining data that cannot fit into the first mbuf.
5360 */
5361 if (bufsize <= MINCLSIZE) {
5362 /* Allocate the elements in one shot from the mbuf cache */
5363 ASSERT(bufsize <= MHLEN || nsegs == 2);
5364#if CONFIG_MBUF_MCACHE
5365 cp = m_cache(MC_MBUF);
5366 needed = mcache_alloc_ext(cp, &mp_list,
5367 (*numlist) * nsegs, mcflags);
5368#else
5369 class = MC_MBUF;
5370 mp_list = mz_alloc_n((*numlist) * nsegs, wait);
5371 needed = zstack_count(mp_list);
5372#endif /* CONFIG_MBUF_MCACHE */
5373
5374 /*
5375 * The number of elements must be even if we are to use an
5376 * mbuf (instead of a cluster) to store the residual data.
5377 * If we couldn't allocate the requested number of mbufs,
5378 * trim the number down (if it's odd) in order to avoid
5379 * creating a partial segment chain.
5380 */
5381 if (bufsize > MHLEN && (needed & 0x1)) {
5382 needed--;
5383 }
5384
5385 while (num < needed) {
5386 struct mbuf *m = NULL;
5387
5388#if CONFIG_MBUF_MCACHE
5389 m = (struct mbuf *)mp_list;
5390 mp_list = mp_list->obj_next;
5391#else
5392 m = zstack_pop(stack: &mp_list);
5393#endif /* CONFIG_MBUF_MCACHE */
5394 ASSERT(m != NULL);
5395
5396 MBUF_INIT(m, 1, MT_DATA);
5397 num++;
5398 if (bufsize > MHLEN) {
5399 /* A second mbuf for this segment chain */
5400#if CONFIG_MBUF_MCACHE
5401 m->m_next = (struct mbuf *)mp_list;
5402 mp_list = mp_list->obj_next;
5403#else
5404 m->m_next = zstack_pop(stack: &mp_list);
5405#endif /* CONFIG_MBUF_MCACHE */
5406
5407 ASSERT(m->m_next != NULL);
5408
5409 MBUF_INIT(m->m_next, 0, MT_DATA);
5410 num++;
5411 }
5412 *np = m;
5413 np = &m->m_nextpkt;
5414 }
5415#if CONFIG_MBUF_MCACHE
5416 ASSERT(num != *numlist || mp_list == NULL);
5417#else
5418 ASSERT(num != *numlist || zstack_empty(mp_list));
5419#endif /* CONFIG_MBUF_MCACHE */
5420
5421 if (num > 0) {
5422 mtype_stat_add(MT_DATA, num);
5423 mtype_stat_sub(MT_FREE, num);
5424 }
5425 num /= nsegs;
5426
5427 /* We've got them all; return to caller */
5428 if (num == *numlist) {
5429 return top;
5430 }
5431
5432 goto fail;
5433 }
5434
5435 /*
5436 * Complex cases where elements are made up of one or more composite
5437 * mbufs + cluster, depending on packetlen. Each N-segment chain can
5438 * be illustrated as follows:
5439 *
5440 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
5441 *
5442 * Every composite mbuf + cluster element comes from the intermediate
5443 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
5444 * the last composite element will come from the MC_MBUF_CL cache,
5445 * unless the residual data is larger than 2KB where we use the
5446 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
5447 * data is defined as extra data beyond the first element that cannot
5448 * fit into the previous element, i.e. there is no residual data if
5449 * the chain only has 1 segment.
5450 */
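	/*
	 * Illustrative example (assuming the usual 2 KB/4 KB/16 KB cluster
	 * sizes and njcl > 0): for wantsize == 0 and packetlen == 20000,
	 * bufsize is the 16 KB cluster size, so
	 * nsegs == ((20000 - 1) >> M16KCLSHIFT) + 1 == 2.  The residual is
	 * 20000 % 16384 == 3616 bytes, which is larger than a 2 KB cluster
	 * but fits in a 4 KB one, so r_bufsize becomes m_maxsize(MC_BIGCL)
	 * and each chain ends up as one MC_MBUF_16KCL element followed by
	 * one MC_MBUF_BIGCL element for the tail.
	 */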
5451 r_bufsize = bufsize;
5452 resid = packetlen > bufsize ? packetlen % bufsize : 0;
5453 if (resid > 0) {
5454 /* There is residual data; figure out the cluster size */
5455 if (wantsize == 0 && packetlen > MINCLSIZE) {
5456 /*
5457 * Caller didn't request that all of the segments
5458 * in the chain use the same cluster size; use the
5459			 * smallest cluster size that can hold the residual.
5460 */
5461 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
5462 r_bufsize = m_maxsize(MC_16KCL);
5463 } else if (resid > m_maxsize(MC_CL)) {
5464 r_bufsize = m_maxsize(MC_BIGCL);
5465 } else {
5466 r_bufsize = m_maxsize(MC_CL);
5467 }
5468 } else {
5469 /* Use the same cluster size as the other segments */
5470 resid = 0;
5471 }
5472 }
5473
5474 needed = *numlist;
5475 if (resid > 0) {
5476 /*
5477 * Attempt to allocate composite mbuf + cluster elements for
5478 * the residual data in each chain; record the number of such
5479 * elements that can be allocated so that we know how many
5480 * segment chains we can afford to create.
5481 */
5482#if CONFIG_MBUF_MCACHE
5483 if (r_bufsize <= m_maxsize(MC_CL)) {
5484 rcp = m_cache(MC_MBUF_CL);
5485 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5486 rcp = m_cache(MC_MBUF_BIGCL);
5487 } else {
5488 rcp = m_cache(MC_MBUF_16KCL);
5489 }
5490 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
5491#else
5492 if (r_bufsize <= m_maxsize(MC_CL)) {
5493 rclass = MC_MBUF_CL;
5494 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5495 rclass = MC_MBUF_BIGCL;
5496 } else {
5497 rclass = MC_MBUF_16KCL;
5498 }
5499 rmp_list = mz_composite_alloc_n(class: rclass, n: *numlist, flags: wait);
5500 needed = zstack_count(stack: rmp_list);
5501#endif /* CONFIG_MBUF_MCACHE */
5502 if (needed == 0) {
5503 goto fail;
5504 }
5505
5506		/* nsegs is temporarily reduced for the calculation; it is restored below */
5507 ASSERT(nsegs > 1);
5508 nsegs--;
5509 }
5510
5511 /*
5512 * Attempt to allocate the rest of the composite mbuf + cluster
5513 * elements for the number of segment chains that we need.
5514 */
5515#if CONFIG_MBUF_MCACHE
5516 if (bufsize <= m_maxsize(MC_CL)) {
5517 cp = m_cache(MC_MBUF_CL);
5518 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5519 cp = m_cache(MC_MBUF_BIGCL);
5520 } else {
5521 cp = m_cache(MC_MBUF_16KCL);
5522 }
5523 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
5524#else
5525 if (bufsize <= m_maxsize(MC_CL)) {
5526 class = MC_MBUF_CL;
5527 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5528 class = MC_MBUF_BIGCL;
5529 } else {
5530 class = MC_MBUF_16KCL;
5531 }
5532 mp_list = mz_composite_alloc_n(class, n: needed * nsegs, flags: wait);
5533 needed = zstack_count(stack: mp_list);
5534#endif /* CONFIG_MBUF_MCACHE */
5535
5536 /* Round it down to avoid creating a partial segment chain */
5537 needed = (needed / nsegs) * nsegs;
5538 if (needed == 0) {
5539 goto fail;
5540 }
5541
5542 if (resid > 0) {
5543 /*
5544 * We're about to construct the chain(s); take into account
5545 * the number of segments we have created above to hold the
5546 * residual data for each chain, as well as restore the
5547 * original count of segments per chain.
5548 */
5549 ASSERT(nsegs > 0);
5550 needed += needed / nsegs;
5551 nsegs++;
5552 }
5553
5554 for (;;) {
5555 struct mbuf *m = NULL;
5556 u_int16_t flag;
5557 struct ext_ref *rfa;
5558 void *cl;
5559 int pkthdr;
5560 m_ext_free_func_t m_free_func;
5561
5562 ++num;
5563
5564 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
5565#if CONFIG_MBUF_MCACHE
5566 m = (struct mbuf *)mp_list;
5567 mp_list = mp_list->obj_next;
5568#else
5569 m = zstack_pop(stack: &mp_list);
5570#endif /* CONFIG_MBUF_MCACHE */
5571 } else {
5572#if CONFIG_MBUF_MCACHE
5573 m = (struct mbuf *)rmp_list;
5574 rmp_list = rmp_list->obj_next;
5575#else
5576 m = zstack_pop(stack: &rmp_list);
5577#endif /* CONFIG_MBUF_MCACHE */
5578 }
5579 m_free_func = m_get_ext_free(m);
5580 ASSERT(m != NULL);
5581 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5582 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
5583 m_free_func == m_16kfree);
5584
5585 cl = m->m_ext.ext_buf;
5586 rfa = m_get_rfa(m);
5587
5588 ASSERT(cl != NULL && rfa != NULL);
5589 VERIFY(MBUF_IS_COMPOSITE(m));
5590
5591 flag = MEXT_FLAGS(m);
5592
5593 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
5594 if (pkthdr) {
5595 first = m;
5596 }
5597 MBUF_INIT(m, pkthdr, MT_DATA);
5598 if (m_free_func == m_16kfree) {
5599 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5600 } else if (m_free_func == m_bigfree) {
5601 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5602 } else {
5603 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5604 }
5605
5606 *np = m;
5607 if ((num % nsegs) == 0) {
5608 np = &first->m_nextpkt;
5609 } else {
5610 np = &m->m_next;
5611 }
5612
5613 if (num == needed) {
5614 break;
5615 }
5616 }
5617
5618 if (num > 0) {
5619 mtype_stat_add(MT_DATA, num);
5620 mtype_stat_sub(MT_FREE, num);
5621 }
5622
5623 num /= nsegs;
5624
5625 /* We've got them all; return to caller */
5626 if (num == *numlist) {
5627#if CONFIG_MBUF_MCACHE
5628 ASSERT(mp_list == NULL && rmp_list == NULL);
5629#else
5630 ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list));
5631#endif /* CONFIG_MBUF_MCACHE */
5632 return top;
5633 }
5634
5635fail:
5636 /* Free up what's left of the above */
5637#if CONFIG_MBUF_MCACHE
5638 if (mp_list != NULL) {
5639 mcache_free_ext(cp, mp_list);
5640 }
5641 if (rmp_list != NULL) {
5642 mcache_free_ext(rcp, rmp_list);
5643 }
5644#else
5645 if (!zstack_empty(stack: mp_list)) {
5646 if (class == MC_MBUF) {
5647 /* No need to elide, these mbufs came from the cache. */
5648 mz_free_n(list: mp_list);
5649 } else {
5650 mz_composite_free_n(class, list: mp_list);
5651 }
5652 }
5653 if (!zstack_empty(stack: rmp_list)) {
5654 mz_composite_free_n(class: rclass, list: rmp_list);
5655 }
5656#endif /* CONFIG_MBUF_MCACHE */
5657 if (wantall && top != NULL) {
5658 m_freem_list(top);
5659 *numlist = 0;
5660 return NULL;
5661 }
5662 *numlist = num;
5663 return top;
5664}
5665
5666/*
5667 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate
5668 * packets on the receive ring.
5669 */
5670__private_extern__ struct mbuf *
5671m_getpacket_how(int wait)
5672{
5673 unsigned int num_needed = 1;
5674
5675 return m_getpackets_internal(num_needed: &num_needed, num_with_pkthdrs: 1, wait, wantall: 1,
5676 m_maxsize(MC_CL));
5677}
5678
5679/*
5680 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate
5681 * packets on the receive ring.
5682 */
5683struct mbuf *
5684m_getpacket(void)
5685{
5686 unsigned int num_needed = 1;
5687
5688 return m_getpackets_internal(num_needed: &num_needed, num_with_pkthdrs: 1, M_WAIT, wantall: 1,
5689 m_maxsize(MC_CL));
5690}
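/*
 * Illustrative usage sketch (not taken from a specific driver): a receive
 * ring refill path might do something like
 *
 *	struct mbuf *m = m_getpacket_how(M_DONTWAIT);
 *	if (m == NULL)
 *		<leave the ring slot empty and retry later>
 *
 * Both m_getpacket() and m_getpacket_how() return a single packet header
 * mbuf backed by a 2 KB cluster (m_maxsize(MC_CL)); the difference is that
 * m_getpacket() always uses M_WAIT.
 */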
5691
5692/*
5693 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5694 * if this can't be met, return whatever number is available. Set up the
5695 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
5696 * are chained on the m_nextpkt field. Any packets requested beyond this are
5697 * chained onto the last packet header's m_next field.
5698 */
5699struct mbuf *
5700m_getpackets(int num_needed, int num_with_pkthdrs, int how)
5701{
5702 unsigned int n = num_needed;
5703
5704 return m_getpackets_internal(num_needed: &n, num_with_pkthdrs, wait: how, wantall: 0,
5705 m_maxsize(MC_CL));
5706}
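/*
 * Illustrative example: m_getpackets(4, 2, M_DONTWAIT) asks for 4
 * cluster-backed mbufs, the first 2 of which are initialized as packet
 * headers and chained via m_nextpkt; the remaining ones are appended to
 * the last packet header via m_next.  Fewer may be returned if the pools
 * are short, since wantall is 0 on this path.
 */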
5707
5708/*
5709 * Return a list of mbuf hdrs set up as packet hdrs chained together
5710 * on the m_nextpkt field
5711 */
5712struct mbuf *
5713m_getpackethdrs(int num_needed, int how)
5714{
5715 struct mbuf *m;
5716 struct mbuf **np, *top;
5717
5718 top = NULL;
5719 np = &top;
5720
5721 while (num_needed--) {
5722 m = _M_RETRYHDR(how, MT_DATA);
5723 if (m == NULL) {
5724 break;
5725 }
5726
5727 *np = m;
5728 np = &m->m_nextpkt;
5729 }
5730
5731 return top;
5732}
5733
5734/*
5735 * Free an mbuf list (m_nextpkt) while following m_next. Returns the number
5736 * of packets freed. Used by the drivers.
5737 */
5738int
5739m_freem_list(struct mbuf *m)
5740{
5741 struct mbuf *nextpkt;
5742#if CONFIG_MBUF_MCACHE
5743 mcache_obj_t *mp_list = NULL;
5744 mcache_obj_t *mcl_list = NULL;
5745 mcache_obj_t *mbc_list = NULL;
5746 mcache_obj_t *m16k_list = NULL;
5747 mcache_obj_t *m_mcl_list = NULL;
5748 mcache_obj_t *m_mbc_list = NULL;
5749 mcache_obj_t *m_m16k_list = NULL;
5750 mcache_obj_t *ref_list = NULL;
5751#else
5752 zstack_t mp_list = {}, mcl_list = {}, mbc_list = {},
5753 m16k_list = {}, m_mcl_list = {},
5754 m_mbc_list = {}, m_m16k_list = {}, ref_list = {};
5755#endif /* CONFIG_MBUF_MCACHE */
5756 int pktcount = 0;
5757 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
5758
5759 while (m != NULL) {
5760 pktcount++;
5761
5762 nextpkt = m->m_nextpkt;
5763 m->m_nextpkt = NULL;
5764
5765 while (m != NULL) {
5766 struct mbuf *next = m->m_next;
5767#if CONFIG_MBUF_MCACHE
5768 mcache_obj_t *o, *rfa;
5769#else
5770 void *cl = NULL;
5771#endif /* CONFIG_MBUF_MCACHE */
5772 if (m->m_type == MT_FREE) {
5773 panic("m_free: freeing an already freed mbuf");
5774 }
5775
5776 if (m->m_flags & M_PKTHDR) {
5777 /* Check for scratch area overflow */
5778 m_redzone_verify(m);
5779 /* Free the aux data and tags if there is any */
5780 m_tag_delete_chain(m);
5781 m_do_tx_compl_callback(m, NULL);
5782 }
5783
5784 if (!(m->m_flags & M_EXT)) {
5785 mt_free++;
5786 goto simple_free;
5787 }
5788
5789 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
5790 m = next;
5791 continue;
5792 }
5793
5794 mt_free++;
5795
5796#if CONFIG_MBUF_MCACHE
5797 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
5798#else
5799 cl = m->m_ext.ext_buf;
5800#endif /* CONFIG_MBUF_MCACHE */
5801 /*
5802 * Make sure that we don't touch any ext_ref
5803 * member after we decrement the reference count
5804 * since that may lead to use-after-free
5805 * when we do not hold the last reference.
5806 */
5807 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
5808 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
5809 const uint16_t minref = MEXT_MINREF(m);
5810 const uint16_t refcnt = m_decref(m);
5811 if (refcnt == minref && !composite) {
5812#if CONFIG_MBUF_MCACHE
5813 if (m_free_func == NULL) {
5814 o->obj_next = mcl_list;
5815 mcl_list = o;
5816 } else if (m_free_func == m_bigfree) {
5817 o->obj_next = mbc_list;
5818 mbc_list = o;
5819 } else if (m_free_func == m_16kfree) {
5820 o->obj_next = m16k_list;
5821 m16k_list = o;
5822 } else {
5823 (*(m_free_func))((caddr_t)o,
5824 m->m_ext.ext_size,
5825 m_get_ext_arg(m));
5826 }
5827 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
5828 rfa->obj_next = ref_list;
5829 ref_list = rfa;
5830#else
5831 if (m_free_func == NULL) {
5832 zstack_push(stack: &mcl_list, elem: cl);
5833 } else if (m_free_func == m_bigfree) {
5834 zstack_push(stack: &mbc_list, elem: cl);
5835 } else if (m_free_func == m_16kfree) {
5836 zstack_push(stack: &m16k_list, elem: cl);
5837 } else {
5838 (*(m_free_func))((caddr_t)cl,
5839 m->m_ext.ext_size,
5840 m_get_ext_arg(m));
5841 }
5842 zstack_push(stack: &ref_list, elem: m_get_rfa(m));
5843#endif /* CONFIG_MBUF_MCACHE */
5844 m_set_ext(m, NULL, NULL, NULL);
5845 } else if (refcnt == minref && composite) {
5846 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
5847 /*
5848 * Amortize the costs of atomic operations
5849 * by doing them at the end, if possible.
5850 */
5851 if (m->m_type == MT_DATA) {
5852 mt_data++;
5853 } else if (m->m_type == MT_HEADER) {
5854 mt_header++;
5855 } else if (m->m_type == MT_SONAME) {
5856 mt_soname++;
5857 } else if (m->m_type == MT_TAG) {
5858 mt_tag++;
5859 } else {
5860 mtype_stat_dec(m->m_type);
5861 }
5862
5863 m->m_type = MT_FREE;
5864 m->m_flags = M_EXT;
5865 m->m_len = 0;
5866 m->m_next = m->m_nextpkt = NULL;
5867
5868 /*
5869 * MEXT_FLAGS is safe to access here
5870 * since we are now sure that we held
5871 * the last reference to ext_ref.
5872 */
5873 MEXT_FLAGS(m) &= ~EXTF_READONLY;
5874
5875 /* "Free" into the intermediate cache */
5876#if CONFIG_MBUF_MCACHE
5877 o = (mcache_obj_t *)m;
5878 if (m_free_func == NULL) {
5879 o->obj_next = m_mcl_list;
5880 m_mcl_list = o;
5881 } else if (m_free_func == m_bigfree) {
5882 o->obj_next = m_mbc_list;
5883 m_mbc_list = o;
5884 } else {
5885 VERIFY(m_free_func == m_16kfree);
5886 o->obj_next = m_m16k_list;
5887 m_m16k_list = o;
5888 }
5889#else
5890 if (m_free_func == NULL) {
5891 zstack_push(stack: &m_mcl_list, elem: m);
5892 } else if (m_free_func == m_bigfree) {
5893 zstack_push(stack: &m_mbc_list, elem: m);
5894 } else {
5895 VERIFY(m_free_func == m_16kfree);
5896 zstack_push(stack: &m_m16k_list, elem: m);
5897 }
5898#endif /* CONFIG_MBUF_MCACHE */
5899 m = next;
5900 continue;
5901 }
5902simple_free:
5903 /*
5904 * Amortize the costs of atomic operations
5905 * by doing them at the end, if possible.
5906 */
5907 if (m->m_type == MT_DATA) {
5908 mt_data++;
5909 } else if (m->m_type == MT_HEADER) {
5910 mt_header++;
5911 } else if (m->m_type == MT_SONAME) {
5912 mt_soname++;
5913 } else if (m->m_type == MT_TAG) {
5914 mt_tag++;
5915 } else if (m->m_type != MT_FREE) {
5916 mtype_stat_dec(m->m_type);
5917 }
5918
5919 m->m_type = MT_FREE;
5920 m->m_flags = m->m_len = 0;
5921 m->m_next = m->m_nextpkt = NULL;
5922
5923#if CONFIG_MBUF_MCACHE
5924 ((mcache_obj_t *)m)->obj_next = mp_list;
5925 mp_list = (mcache_obj_t *)m;
5926#else
5927 m_elide(m);
5928 zstack_push(stack: &mp_list, elem: m);
5929#endif /* CONFIG_MBUF_MCACHE */
5930
5931 m = next;
5932 }
5933
5934 m = nextpkt;
5935 }
5936
5937 if (mt_free > 0) {
5938 mtype_stat_add(MT_FREE, mt_free);
5939 }
5940 if (mt_data > 0) {
5941 mtype_stat_sub(MT_DATA, mt_data);
5942 }
5943 if (mt_header > 0) {
5944 mtype_stat_sub(MT_HEADER, mt_header);
5945 }
5946 if (mt_soname > 0) {
5947 mtype_stat_sub(MT_SONAME, mt_soname);
5948 }
5949 if (mt_tag > 0) {
5950 mtype_stat_sub(MT_TAG, mt_tag);
5951 }
5952#if CONFIG_MBUF_MCACHE
5953 if (mp_list != NULL) {
5954 mcache_free_ext(m_cache(MC_MBUF), mp_list);
5955 }
5956 if (mcl_list != NULL) {
5957 mcache_free_ext(m_cache(MC_CL), mcl_list);
5958 }
5959 if (mbc_list != NULL) {
5960 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
5961 }
5962 if (m16k_list != NULL) {
5963 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
5964 }
5965 if (m_mcl_list != NULL) {
5966 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
5967 }
5968 if (m_mbc_list != NULL) {
5969 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
5970 }
5971 if (m_m16k_list != NULL) {
5972 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
5973 }
5974 if (ref_list != NULL) {
5975 mcache_free_ext(ref_cache, ref_list);
5976 }
5977#else
5978 if (!zstack_empty(stack: mp_list)) {
5979 /* mbufs elided above. */
5980 mz_free_n(list: mp_list);
5981 }
5982 if (!zstack_empty(stack: mcl_list)) {
5983 zfree_nozero_n(ZONE_ID_CLUSTER_2K, mcl_list);
5984 }
5985 if (!zstack_empty(stack: mbc_list)) {
5986 zfree_nozero_n(ZONE_ID_CLUSTER_4K, mbc_list);
5987 }
5988 if (!zstack_empty(stack: m16k_list)) {
5989 zfree_nozero_n(ZONE_ID_CLUSTER_16K, m16k_list);
5990 }
5991 if (!zstack_empty(stack: m_mcl_list)) {
5992 mz_composite_free_n(class: MC_MBUF_CL, list: m_mcl_list);
5993 }
5994 if (!zstack_empty(stack: m_mbc_list)) {
5995 mz_composite_free_n(class: MC_MBUF_BIGCL, list: m_mbc_list);
5996 }
5997 if (!zstack_empty(stack: m_m16k_list)) {
5998 mz_composite_free_n(class: MC_MBUF_16KCL, list: m_m16k_list);
5999 }
6000 if (!zstack_empty(stack: ref_list)) {
6001 zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list);
6002 }
6003#endif /* CONFIG_MBUF_MCACHE */
6004
6005 return pktcount;
6006}
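/*
 * Illustrative usage: a transmit-completion path holding a chain of
 * finished packets linked through m_nextpkt can release them in one call
 * and learn how many packets were freed, e.g.
 *
 *	int freed = m_freem_list(done_head);
 *
 * where done_head is a hypothetical driver-private list head.  Compared
 * with calling m_freem() per packet, the batched path above defers the
 * per-type statistics updates and frees the mbufs, clusters and ext_ref
 * structures back to their pools in bulk.
 */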
6007
6008void
6009m_freem(struct mbuf *m)
6010{
6011 while (m != NULL) {
6012 m = m_free(m);
6013 }
6014}
6015
6016/*
6017 * Mbuffer utility routines.
6018 */
6019/*
6020 * Set the m_data pointer of a newly allocated mbuf to place an object of the
6021 * specified size at the end of the mbuf, longword aligned.
6022 *
6023 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
6024 * separate macros, each asserting that it was called at the proper moment.
6025 * This required callers to themselves test the storage type and call the
6026 * right one. Rather than require callers to be aware of those layout
6027 * decisions, we centralize here.
6028 */
6029void
6030m_align(struct mbuf *m, int len)
6031{
6032 int adjust = 0;
6033
6034 /* At this point data must point to start */
6035 VERIFY(m->m_data == (uintptr_t)M_START(m));
6036 VERIFY(len >= 0);
6037 VERIFY(len <= M_SIZE(m));
6038 adjust = M_SIZE(m) - len;
6039 m->m_data += adjust & ~(sizeof(long) - 1);
6040}
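/*
 * Illustrative example (assuming a plain mbuf, where M_SIZE(m) is MLEN):
 * for len == 14, adjust == MLEN - 14, and m_data is advanced by that
 * amount rounded down to a multiple of sizeof(long).  The 14-byte object
 * therefore ends up longword aligned at the tail of the buffer, leaving
 * the leading space free for headers prepended later.
 */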
6041
6042/*
6043 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
6044 * copy junk along. Does not adjust packet header length.
6045 */
6046struct mbuf *
6047m_prepend(struct mbuf *m, int len, int how)
6048{
6049 struct mbuf *mn;
6050
6051 _MGET(mn, how, m->m_type);
6052 if (mn == NULL) {
6053 m_freem(m);
6054 return NULL;
6055 }
6056 if (m->m_flags & M_PKTHDR) {
6057 M_COPY_PKTHDR(mn, m);
6058 m->m_flags &= ~M_PKTHDR;
6059 }
6060 mn->m_next = m;
6061 m = mn;
6062 if (m->m_flags & M_PKTHDR) {
6063 VERIFY(len <= MHLEN);
6064 MH_ALIGN(m, len);
6065 } else {
6066 VERIFY(len <= MLEN);
6067 M_ALIGN(m, len);
6068 }
6069 m->m_len = len;
6070 return m;
6071}
6072
6073/*
6074 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
6075 * chain, copy junk along, and adjust length.
6076 */
6077struct mbuf *
6078m_prepend_2(struct mbuf *m, int len, int how, int align)
6079{
6080 if (M_LEADINGSPACE(m) >= len &&
6081 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
6082 m->m_data -= len;
6083 m->m_len += len;
6084 } else {
6085 m = m_prepend(m, len, how);
6086 }
6087 if ((m) && (m->m_flags & M_PKTHDR)) {
6088 m->m_pkthdr.len += len;
6089 }
6090 return m;
6091}
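/*
 * Illustrative usage: prepending a 14-byte Ethernet header in an output
 * path might look like
 *
 *	m = m_prepend_2(m, ETHER_HDR_LEN, M_DONTWAIT, 0);
 *	if (m == NULL)
 *		<drop the packet>
 *
 * If there is enough leading space in the first mbuf the data pointer is
 * simply moved back; otherwise a fresh mbuf is prepended.  In either case
 * m_pkthdr.len is grown by len.
 */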
6092
6093/*
6094 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
6095 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
6096 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
6097 *
6098 * The last mbuf and offset accessed are passed in and adjusted on return to
6099 * avoid having to iterate over the entire mbuf chain each time.
6100 */
6101struct mbuf *
6102m_copym_mode(struct mbuf *m, int off0, int len0, int wait,
6103 struct mbuf **m_lastm, int *m_off, uint32_t mode)
6104{
6105 struct mbuf *n, *mhdr = NULL, **np;
6106 int off = off0, len = len0;
6107 struct mbuf *top;
6108 int copyhdr = 0;
6109
6110 if (off < 0 || len < 0) {
6111 panic("m_copym: invalid offset %d or len %d", off, len);
6112 }
6113
6114 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
6115 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
6116
6117 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
6118 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
6119 mhdr = m;
6120 copyhdr = 1;
6121 }
6122
6123 if (m_lastm != NULL && *m_lastm != NULL) {
6124 if (off0 >= *m_off) {
6125 m = *m_lastm;
6126 off = off0 - *m_off;
6127 }
6128 }
6129
6130 while (off >= m->m_len) {
6131 off -= m->m_len;
6132 m = m->m_next;
6133 }
6134 np = &top;
6135 top = NULL;
6136
6137 while (len > 0) {
6138 if (m == NULL) {
6139 if (len != M_COPYALL) {
6140 panic("m_copym: len != M_COPYALL");
6141 }
6142 break;
6143 }
6144
6145 if (copyhdr) {
6146 n = _M_RETRYHDR(wait, m->m_type);
6147 } else {
6148 n = _M_RETRY(wait, m->m_type);
6149 }
6150 *np = n;
6151
6152 if (n == NULL) {
6153 goto nospace;
6154 }
6155
6156 if (copyhdr != 0) {
6157 if ((mode == M_COPYM_MOVE_HDR) ||
6158 (mode == M_COPYM_MUST_MOVE_HDR)) {
6159 M_COPY_PKTHDR(n, mhdr);
6160 } else if ((mode == M_COPYM_COPY_HDR) ||
6161 (mode == M_COPYM_MUST_COPY_HDR)) {
6162 if (m_dup_pkthdr(to: n, from: mhdr, how: wait) == 0) {
6163 goto nospace;
6164 }
6165 }
6166 if (len == M_COPYALL) {
6167 n->m_pkthdr.len -= off0;
6168 } else {
6169 n->m_pkthdr.len = len;
6170 }
6171 copyhdr = 0;
6172 /*
6173			 * If the packet header mbuf is empty or lies entirely
6174			 * before the starting offset, there is no data to copy
			 * from it; move on to the data mbufs
6175 */
6176 if (mhdr != m) {
6177 np = &n->m_next;
6178 continue;
6179 }
6180 }
6181 n->m_len = MIN(len, (m->m_len - off));
6182 if (m->m_flags & M_EXT) {
6183 n->m_ext = m->m_ext;
6184 m_incref(m);
6185 n->m_data = m->m_data + off;
6186 n->m_flags |= M_EXT;
6187 } else {
6188 /*
6189 * Limit to the capacity of the destination
6190 */
6191 if (n->m_flags & M_PKTHDR) {
6192 n->m_len = MIN(n->m_len, MHLEN);
6193 } else {
6194 n->m_len = MIN(n->m_len, MLEN);
6195 }
6196
6197 if (MTOD(n, char *) + n->m_len > ((char *)n) + _MSIZE) {
6198 panic("%s n %p copy overflow",
6199 __func__, n);
6200 }
6201
6202 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
6203 n: (unsigned)n->m_len);
6204 }
6205 if (len != M_COPYALL) {
6206 len -= n->m_len;
6207 }
6208
6209 if (len == 0) {
6210 if (m_lastm != NULL) {
6211 *m_lastm = m;
6212 *m_off = off0 + len0 - (off + n->m_len);
6213 }
6214 }
6215 off = 0;
6216 m = m->m_next;
6217 np = &n->m_next;
6218 }
6219
6220 return top;
6221nospace:
6222 m_freem(m: top);
6223
6224 return NULL;
6225}
6226
6227
6228struct mbuf *
6229m_copym(struct mbuf *m, int off0, int len, int wait)
6230{
6231 return m_copym_mode(m, off0, len0: len, wait, NULL, NULL, M_COPYM_MOVE_HDR);
6232}
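/*
 * Illustrative usage: n = m_copym(m, 0, M_COPYALL, M_DONTWAIT) produces a
 * copy of the whole chain.  For M_EXT mbufs the cluster itself is not
 * duplicated; the copy shares it by bumping the reference count
 * (m_incref), so the result should be treated as read-only until
 * m_makewritable() or m_dup() is used.
 */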
6233
6234/*
6235 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
6236 * within this routine also.
6237 *
6238 * The last mbuf and offset accessed are passed in and adjusted on return to
6239 * avoid having to iterate over the entire mbuf chain each time.
6240 */
6241struct mbuf *
6242m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
6243 struct mbuf **m_lastm, int *m_off, uint32_t mode)
6244{
6245 struct mbuf *m = m0, *n, **np = NULL;
6246 int off = off0, len = len0;
6247 struct mbuf *top = NULL;
6248#if CONFIG_MBUF_MCACHE
6249 int mcflags = MSLEEPF(wait);
6250 mcache_obj_t *list = NULL;
6251#else
6252 zstack_t list = {};
6253#endif /* CONFIG_MBUF_MCACHE */
6254 int copyhdr = 0;
6255 int type = 0;
6256 int needed = 0;
6257
6258 if (off == 0 && (m->m_flags & M_PKTHDR)) {
6259 copyhdr = 1;
6260 }
6261
6262 if (m_lastm != NULL && *m_lastm != NULL) {
6263 if (off0 >= *m_off) {
6264 m = *m_lastm;
6265 off = off0 - *m_off;
6266 }
6267 }
6268
6269 while (off >= m->m_len) {
6270 off -= m->m_len;
6271 m = m->m_next;
6272 }
6273
6274 n = m;
6275 while (len > 0) {
6276 needed++;
6277 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
6278 n = n->m_next;
6279 }
6280 needed++;
6281 len = len0;
6282
6283#if CONFIG_MBUF_MCACHE
6284 /*
6285 * If the caller doesn't want to be put to sleep, mark it with
6286 * MCR_TRYHARD so that we may reclaim buffers from other places
6287 * before giving up.
6288 */
6289 if (mcflags & MCR_NOSLEEP) {
6290 mcflags |= MCR_TRYHARD;
6291 }
6292
6293 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
6294 mcflags) != needed) {
6295 goto nospace;
6296 }
6297#else
6298 list = mz_alloc_n(count: needed, flags: wait);
6299 if (zstack_count(stack: list) != needed) {
6300 goto nospace;
6301 }
6302#endif /* CONFIG_MBUF_MCACHE */
6303
6304 needed = 0;
6305 while (len > 0) {
6306#if CONFIG_MBUF_MCACHE
6307 n = (struct mbuf *)list;
6308 list = list->obj_next;
6309#else
6310 n = zstack_pop(stack: &list);
6311#endif /* CONFIG_MBUF_MCACHE */
6312 ASSERT(n != NULL && m != NULL);
6313
6314 type = (top == NULL) ? MT_HEADER : m->m_type;
6315 MBUF_INIT(n, (top == NULL), type);
6316
6317 if (top == NULL) {
6318 top = n;
6319 np = &top->m_next;
6320 continue;
6321 } else {
6322 needed++;
6323 *np = n;
6324 }
6325
6326 if (copyhdr) {
6327 if ((mode == M_COPYM_MOVE_HDR) ||
6328 (mode == M_COPYM_MUST_MOVE_HDR)) {
6329 M_COPY_PKTHDR(n, m);
6330 } else if ((mode == M_COPYM_COPY_HDR) ||
6331 (mode == M_COPYM_MUST_COPY_HDR)) {
6332 if (m_dup_pkthdr(to: n, from: m, how: wait) == 0) {
6333#if !CONFIG_MBUF_MCACHE
6334 m_elide(m: n);
6335#endif
6336 goto nospace;
6337 }
6338 }
6339 n->m_pkthdr.len = len;
6340 copyhdr = 0;
6341 }
6342 n->m_len = MIN(len, (m->m_len - off));
6343
6344 if (m->m_flags & M_EXT) {
6345 n->m_ext = m->m_ext;
6346 m_incref(m);
6347 n->m_data = m->m_data + off;
6348 n->m_flags |= M_EXT;
6349 } else {
6350 if (m_mtod_end(m: n) > m_mtod_upper_bound(m: n)) {
6351 panic("%s n %p copy overflow",
6352 __func__, n);
6353 }
6354
6355 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
6356 n: (unsigned)n->m_len);
6357 }
6358 len -= n->m_len;
6359
6360 if (len == 0) {
6361 if (m_lastm != NULL) {
6362 *m_lastm = m;
6363 *m_off = off0 + len0 - (off + n->m_len);
6364 }
6365 break;
6366 }
6367 off = 0;
6368 m = m->m_next;
6369 np = &n->m_next;
6370 }
6371
6372 mtype_stat_inc(MT_HEADER);
6373 mtype_stat_add(type, needed);
6374 mtype_stat_sub(MT_FREE, needed + 1);
6375
6376#if CONFIG_MBUF_MCACHE
6377 ASSERT(list == NULL);
6378#else
6379 ASSERT(zstack_empty(list));
6380#endif /* CONFIG_MBUF_MCACHE */
6381
6382 return top;
6383
6384nospace:
6385#if CONFIG_MBUF_MCACHE
6386 if (list != NULL) {
6387 mcache_free_ext(m_cache(MC_MBUF), list);
6388 }
6389#else
6390 if (!zstack_empty(stack: list)) {
6391 /* No need to elide, these mbufs came from the cache. */
6392 mz_free_n(list);
6393 }
6394#endif /* CONFIG_MBUF_MCACHE */
6395 if (top != NULL) {
6396 m_freem(m: top);
6397 }
6398 return NULL;
6399}
6400
6401/*
6402 * Copy data from an mbuf chain starting "off" bytes from the beginning,
6403 * continuing for "len" bytes, into the indicated buffer.
6404 */
6405void
6406m_copydata(struct mbuf *m, int off, int len, void *vp)
6407{
6408 int off0 = off, len0 = len;
6409 struct mbuf *m0 = m;
6410 unsigned count;
6411 char *cp = vp;
6412
6413 if (__improbable(off < 0 || len < 0)) {
6414 panic("%s: invalid offset %d or len %d", __func__, off, len);
6415 /* NOTREACHED */
6416 }
6417
6418 while (off > 0) {
6419 if (__improbable(m == NULL)) {
6420 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6421 __func__, m0, off0, len0);
6422 /* NOTREACHED */
6423 }
6424 if (off < m->m_len) {
6425 break;
6426 }
6427 off -= m->m_len;
6428 m = m->m_next;
6429 }
6430 while (len > 0) {
6431 if (__improbable(m == NULL)) {
6432 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6433 __func__, m0, off0, len0);
6434 /* NOTREACHED */
6435 }
6436 count = MIN(m->m_len - off, len);
6437 bcopy(MTOD(m, caddr_t) + off, dst: cp, n: count);
6438 len -= count;
6439 cp += count;
6440 off = 0;
6441 m = m->m_next;
6442 }
6443}
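/*
 * Illustrative usage: pulling a fixed-size header out of a chain into a
 * local buffer without modifying the chain, e.g.
 *
 *	struct tcphdr th;
 *	m_copydata(m, hdr_off, sizeof(th), &th);
 *
 * where hdr_off is a hypothetical offset known to lie within the chain;
 * the routine panics if the chain is shorter than off + len.
 */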
6444
6445/*
6446 * Concatenate mbuf chain n to m. Both chains must be of the same type
6447 * (e.g. MT_DATA). Any m_pkthdr is not updated.
6448 */
6449void
6450m_cat(struct mbuf *m, struct mbuf *n)
6451{
6452 while (m->m_next) {
6453 m = m->m_next;
6454 }
6455 while (n) {
6456 if ((m->m_flags & M_EXT) ||
6457 m->m_data + m->m_len + n->m_len >= (uintptr_t)&m->m_dat[MLEN]) {
6458 /* just join the two chains */
6459 m->m_next = n;
6460 return;
6461 }
6462 /* splat the data from one into the other */
6463 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6464 n: (u_int)n->m_len);
6465 m->m_len += n->m_len;
6466 n = m_free(m: n);
6467 }
6468}
6469
6470void
6471m_adj(struct mbuf *mp, int req_len)
6472{
6473 int len = req_len;
6474 struct mbuf *m;
6475 int count;
6476
6477 if ((m = mp) == NULL) {
6478 return;
6479 }
6480 if (len >= 0) {
6481 /*
6482 * Trim from head.
6483 */
6484 while (m != NULL && len > 0) {
6485 if (m->m_len <= len) {
6486 len -= m->m_len;
6487 m->m_len = 0;
6488 m = m->m_next;
6489 } else {
6490 m->m_len -= len;
6491 m->m_data += len;
6492 len = 0;
6493 }
6494 }
6495 m = mp;
6496 if (m->m_flags & M_PKTHDR) {
6497 m->m_pkthdr.len -= (req_len - len);
6498 }
6499 } else {
6500 /*
6501 * Trim from tail. Scan the mbuf chain,
6502 * calculating its length and finding the last mbuf.
6503 * If the adjustment only affects this mbuf, then just
6504 * adjust and return. Otherwise, rescan and truncate
6505 * after the remaining size.
6506 */
6507 len = -len;
6508 count = 0;
6509 for (;;) {
6510 count += m->m_len;
6511 if (m->m_next == (struct mbuf *)0) {
6512 break;
6513 }
6514 m = m->m_next;
6515 }
6516 if (m->m_len >= len) {
6517 m->m_len -= len;
6518 m = mp;
6519 if (m->m_flags & M_PKTHDR) {
6520 m->m_pkthdr.len -= len;
6521 }
6522 return;
6523 }
6524 count -= len;
6525 if (count < 0) {
6526 count = 0;
6527 }
6528 /*
6529 * Correct length for chain is "count".
6530 * Find the mbuf with last data, adjust its length,
6531 * and toss data from remaining mbufs on chain.
6532 */
6533 m = mp;
6534 if (m->m_flags & M_PKTHDR) {
6535 m->m_pkthdr.len = count;
6536 }
6537 for (; m; m = m->m_next) {
6538 if (m->m_len >= count) {
6539 m->m_len = count;
6540 break;
6541 }
6542 count -= m->m_len;
6543 }
6544 while ((m = m->m_next)) {
6545 m->m_len = 0;
6546 }
6547 }
6548}
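/*
 * Illustrative usage: m_adj(m, ETHER_HDR_LEN) trims a link-layer header
 * from the front of the packet, while m_adj(m, -ETHER_CRC_LEN) trims a
 * trailing CRC from the tail; in both cases m_pkthdr.len is adjusted when
 * the first mbuf carries a packet header.
 */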
6549
6550/*
6551 * Rearrange an mbuf chain so that len bytes are contiguous
6552 * and in the data area of an mbuf (so that mtod
6553 * will work for a structure of size len). Returns the resulting
6554 * mbuf chain on success, frees it and returns null on failure.
6555 * If there is room, it will add up to max_protohdr-len extra bytes to the
6556 * contiguous region in an attempt to avoid being called next time.
6557 */
6558struct mbuf *
6559m_pullup(struct mbuf *n, int len)
6560{
6561 struct mbuf *m;
6562 int count;
6563 int space;
6564
6565 /* check invalid arguments */
6566 if (n == NULL) {
6567 panic("%s: n == NULL", __func__);
6568 }
6569 if (len < 0) {
6570 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
6571 __func__, len);
6572 goto bad;
6573 }
6574 if (len > MLEN) {
6575 os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
6576 __func__, len);
6577 goto bad;
6578 }
6579 if ((n->m_flags & M_EXT) == 0 &&
6580 m_mtod_current(m: n) >= m_mtod_upper_bound(m: n)) {
6581 os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
6582 __func__);
6583 goto bad;
6584 }
6585
6586 /*
6587 * If first mbuf has no cluster, and has room for len bytes
6588 * without shifting current data, pullup into it,
6589 * otherwise allocate a new mbuf to prepend to the chain.
6590 */
6591 if ((n->m_flags & M_EXT) == 0 &&
6592 len < m_mtod_upper_bound(m: n) - m_mtod_current(m: n) && n->m_next != NULL) {
6593 if (n->m_len >= len) {
6594 return n;
6595 }
6596 m = n;
6597 n = n->m_next;
6598 len -= m->m_len;
6599 } else {
6600 if (len > MHLEN) {
6601 goto bad;
6602 }
6603 _MGET(m, M_DONTWAIT, n->m_type);
6604 if (m == 0) {
6605 goto bad;
6606 }
6607 m->m_len = 0;
6608 if (n->m_flags & M_PKTHDR) {
6609 M_COPY_PKTHDR(m, n);
6610 n->m_flags &= ~M_PKTHDR;
6611 }
6612 }
6613 space = m_mtod_upper_bound(m) - m_mtod_end(m);
6614 do {
6615 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
6616 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6617 n: (unsigned)count);
6618 len -= count;
6619 m->m_len += count;
6620 n->m_len -= count;
6621 space -= count;
6622 if (n->m_len != 0) {
6623 n->m_data += count;
6624 } else {
6625 n = m_free(m: n);
6626 }
6627 } while (len > 0 && n != NULL);
6628 if (len > 0) {
6629 (void) m_free(m);
6630 goto bad;
6631 }
6632 m->m_next = n;
6633 return m;
6634bad:
6635 m_freem(m: n);
6636 return 0;
6637}
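/*
 * Illustrative usage: the classic pattern before dereferencing a header
 * through mtod() is
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *
 * On failure m_pullup() has already freed the chain, so the old pointer
 * must not be touched; requests larger than MLEN always fail.
 */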
6638
6639/*
6640 * Like m_pullup(), except a new mbuf is always allocated, and we allow
6641 * the amount of empty space before the data in the new mbuf to be specified
6642 * (in the event that the caller expects to prepend later).
6643 */
6644__private_extern__ struct mbuf *
6645m_copyup(struct mbuf *n, int len, int dstoff)
6646{
6647 struct mbuf *m;
6648 int count, space;
6649
6650 VERIFY(len >= 0 && dstoff >= 0);
6651
6652 if (len > (MHLEN - dstoff)) {
6653 goto bad;
6654 }
6655 MGET(m, M_DONTWAIT, n->m_type);
6656 if (m == NULL) {
6657 goto bad;
6658 }
6659 m->m_len = 0;
6660 if (n->m_flags & M_PKTHDR) {
6661 m_copy_pkthdr(to: m, from: n);
6662 n->m_flags &= ~M_PKTHDR;
6663 }
6664 m->m_data += dstoff;
6665 space = m_mtod_upper_bound(m) - m_mtod_end(m);
6666 do {
6667 count = min(a: min(a: max(a: len, b: max_protohdr), b: space), b: n->m_len);
6668 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
6669 n: (unsigned)count);
6670 len -= count;
6671 m->m_len += count;
6672 n->m_len -= count;
6673 space -= count;
6674 if (n->m_len) {
6675 n->m_data += count;
6676 } else {
6677 n = m_free(m: n);
6678 }
6679 } while (len > 0 && n);
6680 if (len > 0) {
6681 (void) m_free(m);
6682 goto bad;
6683 }
6684 m->m_next = n;
6685 return m;
6686bad:
6687 m_freem(m: n);
6688
6689 return NULL;
6690}
6691
6692/*
6693 * Partition an mbuf chain into two pieces, returning the tail --
6694 * all but the first len0 bytes. In case of failure, it returns NULL and
6695 * attempts to restore the chain to its original state.
6696 */
6697struct mbuf *
6698m_split(struct mbuf *m0, int len0, int wait)
6699{
6700 return m_split0(m0, len0, wait, 1);
6701}
6702
6703static struct mbuf *
6704m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
6705{
6706 struct mbuf *m, *n;
6707 unsigned len = len0, remain;
6708
6709 /*
6710 * First iterate to the mbuf which contains the first byte of
6711 * data at offset len0
6712 */
6713 for (m = m0; m && len > m->m_len; m = m->m_next) {
6714 len -= m->m_len;
6715 }
6716 if (m == NULL) {
6717 return NULL;
6718 }
6719 /*
6720 * len effectively is now the offset in the current
6721 * mbuf where we have to perform split.
6722 *
6723 * remain becomes the tail length.
6724 * Note that len can also be == m->m_len
6725 */
6726 remain = m->m_len - len;
6727
6728 /*
6729	 * If the split point falls exactly at the end of the current mbuf
6730	 * (remain == 0), just point the second chain at the next mbuf
6731	 * onwards and return after making the necessary adjustments
6732 */
6733 if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
6734 _MGETHDR(n, wait, m0->m_type);
6735 if (n == NULL) {
6736 return NULL;
6737 }
6738 n->m_next = m->m_next;
6739 m->m_next = NULL;
6740 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
6741 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
6742 m0->m_pkthdr.len = len0;
6743 return n;
6744 }
6745 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
6746 _MGETHDR(n, wait, m0->m_type);
6747 if (n == NULL) {
6748 return NULL;
6749 }
6750 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
6751 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
6752 m0->m_pkthdr.len = len0;
6753
6754 /*
6755		 * If the current mbuf points to external storage,
6756		 * then that storage can be shared by having the last mbuf
6757		 * of the head chain and the first mbuf of the tail chain
6758		 * point at different data offsets
6759 */
6760 if (m->m_flags & M_EXT) {
6761 goto extpacket;
6762 }
6763 if (remain > MHLEN) {
6764 /* m can't be the lead packet */
6765 MH_ALIGN(n, 0);
6766 n->m_next = m_split(m0: m, len0: len, wait);
6767 if (n->m_next == NULL) {
6768 (void) m_free(m: n);
6769 return NULL;
6770 } else {
6771 return n;
6772 }
6773 } else {
6774 MH_ALIGN(n, remain);
6775 }
6776 } else if (remain == 0) {
6777 n = m->m_next;
6778 m->m_next = NULL;
6779 return n;
6780 } else {
6781 _MGET(n, wait, m->m_type);
6782 if (n == NULL) {
6783 return NULL;
6784 }
6785
6786 if ((m->m_flags & M_EXT) == 0) {
6787 VERIFY(remain <= MLEN);
6788 M_ALIGN(n, remain);
6789 }
6790 }
6791extpacket:
6792 if (m->m_flags & M_EXT) {
6793 n->m_flags |= M_EXT;
6794 n->m_ext = m->m_ext;
6795 m_incref(m);
6796 n->m_data = m->m_data + len;
6797 } else {
6798 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), n: remain);
6799 }
6800 n->m_len = remain;
6801 m->m_len = len;
6802 n->m_next = m->m_next;
6803 m->m_next = NULL;
6804 return n;
6805}
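/*
 * Illustrative usage: tail = m_split(m, len0, M_DONTWAIT) leaves the first
 * len0 bytes in m and returns the remainder as a separate chain; when m
 * carries a packet header, m_pkthdr.len is divided accordingly and the
 * tail gets its own header.  A NULL return means the split failed; per the
 * note above m_split(), the chain is restored to its original state.
 */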
6806
6807/*
6808 * Routine to copy from device local memory into mbufs.
6809 */
6810struct mbuf *
6811m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
6812 void (*copy)(const void *, void *, size_t))
6813{
6814 struct mbuf *m;
6815 struct mbuf *top = NULL, **mp = &top;
6816 int off = off0, len;
6817 char *cp;
6818 char *epkt;
6819
6820 cp = buf;
6821 epkt = cp + totlen;
6822 if (off) {
6823 /*
6824 * If 'off' is non-zero, packet is trailer-encapsulated,
6825 * so we have to skip the type and length fields.
6826 */
6827 cp += off + 2 * sizeof(u_int16_t);
6828 totlen -= 2 * sizeof(u_int16_t);
6829 }
6830 _MGETHDR(m, M_DONTWAIT, MT_DATA);
6831 if (m == NULL) {
6832 return NULL;
6833 }
6834 m->m_pkthdr.rcvif = ifp;
6835 m->m_pkthdr.len = totlen;
6836 m->m_len = MHLEN;
6837
6838 while (totlen > 0) {
6839 if (top != NULL) {
6840 _MGET(m, M_DONTWAIT, MT_DATA);
6841 if (m == NULL) {
6842 m_freem(m: top);
6843 return NULL;
6844 }
6845 m->m_len = MLEN;
6846 }
6847 len = MIN(totlen, epkt - cp);
6848 if (len >= MINCLSIZE) {
6849 MCLGET(m, M_DONTWAIT);
6850 if (m->m_flags & M_EXT) {
6851 m->m_len = len = MIN(len, m_maxsize(MC_CL));
6852 } else {
6853 /* give up when it's out of cluster mbufs */
6854 if (top != NULL) {
6855 m_freem(m: top);
6856 }
6857 m_freem(m);
6858 return NULL;
6859 }
6860 } else {
6861 /*
6862 * Place initial small packet/header at end of mbuf.
6863 */
6864 if (len < m->m_len) {
6865 if (top == NULL &&
6866 len + max_linkhdr <= m->m_len) {
6867 m->m_data += max_linkhdr;
6868 }
6869 m->m_len = len;
6870 } else {
6871 len = m->m_len;
6872 }
6873 }
6874 if (copy) {
6875 copy(cp, MTOD(m, caddr_t), (unsigned)len);
6876 } else {
6877 bcopy(src: cp, MTOD(m, caddr_t), n: (unsigned)len);
6878 }
6879 cp += len;
6880 *mp = m;
6881 mp = &m->m_next;
6882 totlen -= len;
6883 if (cp == epkt) {
6884 cp = buf;
6885 }
6886 }
6887 return top;
6888}
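/*
 * Illustrative usage: a driver copying a received frame out of device
 * memory might call
 *
 *	m = m_devget(frame_base, frame_len, 0, ifp, NULL);
 *
 * where frame_base/frame_len describe a hypothetical contiguous buffer.
 * Passing a NULL copy routine makes m_devget() fall back to bcopy(), and
 * the resulting chain has rcvif and pkthdr.len already filled in.
 */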
6889
6890#if CONFIG_MBUF_MCACHE
6891#ifndef MBUF_GROWTH_NORMAL_THRESH
6892#define MBUF_GROWTH_NORMAL_THRESH 25
6893#endif
6894
6895/*
6896 * Cluster freelist allocation check.
6897 */
6898static int
6899m_howmany(int num, size_t bufsize)
6900{
6901 int i = 0, j = 0;
6902 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
6903 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
6904 u_int32_t sumclusters, freeclusters;
6905 u_int32_t percent_pool, percent_kmem;
6906 u_int32_t mb_growth, mb_growth_thresh;
6907
6908 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
6909 bufsize == m_maxsize(MC_16KCL));
6910
6911 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6912
6913 /* Numbers in 2K cluster units */
6914 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
6915 m_clusters = m_total(MC_CL);
6916 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
6917 m_16kclusters = m_total(MC_16KCL);
6918 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
6919
6920 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
6921 m_clfree = m_infree(MC_CL);
6922 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
6923 m_16kclfree = m_infree(MC_16KCL);
6924 freeclusters = m_mbfree + m_clfree + m_bigclfree;
6925
6926 /* Bail if we've maxed out the mbuf memory map */
6927 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
6928 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
6929 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
6930 mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
6931 sumclusters, nclusters,
6932 (m_16kclusters << NCLPJCLSHIFT), njcl);
6933 return 0;
6934 }
6935
6936 if (bufsize == m_maxsize(MC_BIGCL)) {
6937 /* Under minimum */
6938 if (m_bigclusters < m_minlimit(MC_BIGCL)) {
6939 return m_minlimit(MC_BIGCL) - m_bigclusters;
6940 }
6941
6942 percent_pool =
6943 ((sumclusters - freeclusters) * 100) / sumclusters;
6944 percent_kmem = (sumclusters * 100) / nclusters;
6945
6946 /*
6947 * If a light/normal user, grow conservatively (75%)
6948 * If a heavy user, grow aggressively (50%)
6949 */
6950 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
6951 mb_growth = MB_GROWTH_NORMAL;
6952 } else {
6953 mb_growth = MB_GROWTH_AGGRESSIVE;
6954 }
6955
6956 if (percent_kmem < 5) {
6957 /* For initial allocations */
6958 i = num;
6959 } else {
6960 /* Return if >= MBIGCL_LOWAT clusters available */
6961 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
6962 m_total(MC_BIGCL) >=
6963 MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
6964 return 0;
6965 }
6966
6967 /* Ensure at least num clusters are accessible */
6968 if (num >= m_infree(MC_BIGCL)) {
6969 i = num - m_infree(MC_BIGCL);
6970 }
6971 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
6972 j = num - (m_total(MC_BIGCL) -
6973 m_minlimit(MC_BIGCL));
6974 }
6975
6976 i = MAX(i, j);
6977
6978 /*
6979 * Grow pool if percent_pool > 75 (normal growth)
6980 * or percent_pool > 50 (aggressive growth).
6981 */
6982 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
6983 if (percent_pool > mb_growth_thresh) {
6984 j = ((sumclusters + num) >> mb_growth) -
6985 freeclusters;
6986 }
6987 i = MAX(i, j);
6988 }
6989
6990 /* Check to ensure we didn't go over limits */
6991 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
6992 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
6993 }
6994 if ((i << 1) + sumclusters >= nclusters) {
6995 i = (nclusters - sumclusters) >> 1;
6996 }
6997 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
6998 VERIFY(sumclusters + (i << 1) <= nclusters);
6999 } else { /* 16K CL */
7000 VERIFY(njcl > 0);
7001 /* Ensure at least num clusters are available */
7002 if (num >= m_16kclfree) {
7003 i = num - m_16kclfree;
7004 }
7005
7006 /* Always grow 16KCL pool aggressively */
7007 if (((m_16kclusters + num) >> 1) > m_16kclfree) {
7008 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
7009 }
7010 i = MAX(i, j);
7011
7012 /* Check to ensure we don't go over limit */
7013 if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
7014 i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
7015 }
7016 }
7017 return i;
7018}
7019#endif /* CONFIG_MBUF_MCACHE */
7020/*
7021 * Return the number of bytes in the mbuf chain, m.
7022 */
7023unsigned int
7024m_length(struct mbuf *m)
7025{
7026 struct mbuf *m0;
7027 unsigned int pktlen;
7028
7029 if (m->m_flags & M_PKTHDR) {
7030 return m->m_pkthdr.len;
7031 }
7032
7033 pktlen = 0;
7034 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
7035 pktlen += m0->m_len;
7036 }
7037 return pktlen;
7038}
7039
7040/*
7041 * Copy data from a buffer back into the indicated mbuf chain,
7042 * starting "off" bytes from the beginning, extending the mbuf
7043 * chain if necessary.
7044 */
7045void
7046m_copyback(struct mbuf *m0, int off, int len, const void *cp)
7047{
7048#if DEBUG
7049 struct mbuf *origm = m0;
7050 int error;
7051#endif /* DEBUG */
7052
7053 if (m0 == NULL) {
7054 return;
7055 }
7056
7057#if DEBUG
7058 error =
7059#endif /* DEBUG */
7060 m_copyback0(&m0, off, len, cp,
7061 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
7062
7063#if DEBUG
7064 if (error != 0 || (m0 != NULL && origm != m0)) {
7065 panic("m_copyback");
7066 }
7067#endif /* DEBUG */
7068}
7069
7070struct mbuf *
7071m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
7072{
7073 int error;
7074
7075 /* don't support chain expansion */
7076 VERIFY(off + len <= m_length(m0));
7077
7078 error = m_copyback0(&m0, off, len, cp,
7079 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
7080 if (error) {
7081 /*
7082 * no way to recover from partial success.
7083 * just free the chain.
7084 */
7085 m_freem(m: m0);
7086 return NULL;
7087 }
7088 return m0;
7089}
7090
7091/*
7092 * m_makewritable: ensure the specified range is writable.
7093 */
7094int
7095m_makewritable(struct mbuf **mp, int off, int len, int how)
7096{
7097 int error;
7098#if DEBUG
7099 struct mbuf *n;
7100 int origlen, reslen;
7101
7102 origlen = m_length(*mp);
7103#endif /* DEBUG */
7104
7105#if 0 /* M_COPYALL is large enough */
7106 if (len == M_COPYALL) {
7107 len = m_length(*mp) - off; /* XXX */
7108 }
7109#endif
7110
7111 error = m_copyback0(mp, off, len, NULL,
7112 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
7113
7114#if DEBUG
7115 reslen = 0;
7116 for (n = *mp; n; n = n->m_next) {
7117 reslen += n->m_len;
7118 }
7119 if (origlen != reslen) {
7120 panic("m_makewritable: length changed");
7121 }
7122 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
7123 panic("m_makewritable: inconsist");
7124 }
7125#endif /* DEBUG */
7126
7127 return error;
7128}
7129
7130static int
7131m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
7132 int how)
7133{
7134 int mlen;
7135 struct mbuf *m, *n;
7136 struct mbuf **mp;
7137 int totlen = 0;
7138 const char *cp = vp;
7139
7140 VERIFY(mp0 != NULL);
7141 VERIFY(*mp0 != NULL);
7142 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
7143 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
7144
7145 /*
7146 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
7147 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
7148 */
7149
7150 VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);
7151
7152 mp = mp0;
7153 m = *mp;
7154 while (off > (mlen = m->m_len)) {
7155 off -= mlen;
7156 totlen += mlen;
7157 if (m->m_next == NULL) {
7158 int tspace;
7159extend:
7160 if (!(flags & M_COPYBACK0_EXTEND)) {
7161 goto out;
7162 }
7163
7164 /*
7165 * try to make some space at the end of "m".
7166 */
7167
7168 mlen = m->m_len;
7169 if (off + len >= MINCLSIZE &&
7170 !(m->m_flags & M_EXT) && m->m_len == 0) {
7171 MCLGET(m, how);
7172 }
7173 tspace = M_TRAILINGSPACE(m);
7174 if (tspace > 0) {
7175 tspace = MIN(tspace, off + len);
7176 VERIFY(tspace > 0);
7177 bzero(mtod(m, char *) + m->m_len,
7178 MIN(off, tspace));
7179 m->m_len += tspace;
7180 off += mlen;
7181 totlen -= mlen;
7182 continue;
7183 }
7184
7185 /*
7186 * need to allocate an mbuf.
7187 */
7188
7189 if (off + len >= MINCLSIZE) {
7190 n = m_getcl(wait: how, type: m->m_type, flags: 0);
7191 } else {
7192 n = _M_GET(how, m->m_type);
7193 }
7194 if (n == NULL) {
7195 goto out;
7196 }
7197 n->m_len = 0;
7198 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
7199 bzero(mtod(n, char *), MIN(n->m_len, off));
7200 m->m_next = n;
7201 }
7202 mp = &m->m_next;
7203 m = m->m_next;
7204 }
7205 while (len > 0) {
7206 mlen = m->m_len - off;
7207 if (mlen != 0 && m_mclhasreference(m)) {
7208 char *datap;
7209 int eatlen;
7210
7211 /*
7212 * this mbuf is read-only.
7213 * allocate a new writable mbuf and try again.
7214 */
7215
7216#if DIAGNOSTIC
7217 if (!(flags & M_COPYBACK0_COW)) {
7218 panic("m_copyback0: read-only");
7219 }
7220#endif /* DIAGNOSTIC */
7221
7222 /*
7223 * if we're going to write into the middle of
7224 * a mbuf, split it first.
7225 */
7226 if (off > 0 && len < mlen) {
7227 n = m_split0(m0: m, len0: off, wait: how, copyhdr: 0);
7228 if (n == NULL) {
7229 goto enobufs;
7230 }
7231 m->m_next = n;
7232 mp = &m->m_next;
7233 m = n;
7234 off = 0;
7235 continue;
7236 }
7237
7238 /*
7239 * XXX TODO coalesce into the trailingspace of
7240 * the previous mbuf when possible.
7241 */
7242
7243 /*
7244 * allocate a new mbuf. copy packet header if needed.
7245 */
7246 n = _M_GET(how, m->m_type);
7247 if (n == NULL) {
7248 goto enobufs;
7249 }
7250 if (off == 0 && (m->m_flags & M_PKTHDR)) {
7251 M_COPY_PKTHDR(n, m);
7252 n->m_len = MHLEN;
7253 } else {
7254 if (len >= MINCLSIZE) {
7255 MCLGET(n, M_DONTWAIT);
7256 }
7257 n->m_len =
7258 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
7259 }
7260 if (n->m_len > len) {
7261 n->m_len = len;
7262 }
7263
7264 /*
7265			 * free the region which has been overwritten,
7266			 * copying data from the old mbufs if requested.
7267 */
7268 if (flags & M_COPYBACK0_PRESERVE) {
7269 datap = mtod(n, char *);
7270 } else {
7271 datap = NULL;
7272 }
7273 eatlen = n->m_len;
7274 VERIFY(off == 0 || eatlen >= mlen);
7275 if (off > 0) {
7276 VERIFY(len >= mlen);
7277 m->m_len = off;
7278 m->m_next = n;
7279 if (datap) {
7280 m_copydata(m, off, len: mlen, vp: datap);
7281 datap += mlen;
7282 }
7283 eatlen -= mlen;
7284 mp = &m->m_next;
7285 m = m->m_next;
7286 }
7287 while (m != NULL && m_mclhasreference(m) &&
7288 n->m_type == m->m_type && eatlen > 0) {
7289 mlen = MIN(eatlen, m->m_len);
7290 if (datap) {
7291 m_copydata(m, off: 0, len: mlen, vp: datap);
7292 datap += mlen;
7293 }
7294 m->m_data += mlen;
7295 m->m_len -= mlen;
7296 eatlen -= mlen;
7297 if (m->m_len == 0) {
7298 *mp = m = m_free(m);
7299 }
7300 }
7301 if (eatlen > 0) {
7302 n->m_len -= eatlen;
7303 }
7304 n->m_next = m;
7305 *mp = m = n;
7306 continue;
7307 }
7308 mlen = MIN(mlen, len);
7309 if (flags & M_COPYBACK0_COPYBACK) {
7310 bcopy(src: cp, mtod(m, caddr_t) + off, n: (unsigned)mlen);
7311 cp += mlen;
7312 }
7313 len -= mlen;
7314 mlen += off;
7315 off = 0;
7316 totlen += mlen;
7317 if (len == 0) {
7318 break;
7319 }
7320 if (m->m_next == NULL) {
7321 goto extend;
7322 }
7323 mp = &m->m_next;
7324 m = m->m_next;
7325 }
7326out:
7327 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
7328 VERIFY(flags & M_COPYBACK0_EXTEND);
7329 m->m_pkthdr.len = totlen;
7330 }
7331
7332 return 0;
7333
7334enobufs:
7335 return ENOBUFS;
7336}
7337
7338uint64_t
7339mcl_to_paddr(char *addr)
7340{
7341#if CONFIG_MBUF_MCACHE
7342 vm_offset_t base_phys;
7343
7344 if (!MBUF_IN_MAP(addr)) {
7345 return 0;
7346 }
7347 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
7348
7349 if (base_phys == 0) {
7350 return 0;
7351 }
7352 return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
7353#else
7354 extern addr64_t kvtophys(vm_offset_t va);
7355
7356 return kvtophys(va: (vm_offset_t)addr);
7357#endif /* CONFIG_MBUF_MCACHE */
7358}
7359
7360/*
7361 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
7362 * And really copy the thing. That way, we don't "precompute" checksums
7363 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
7364 * small packets, don't dup into a cluster. That way received packets
7365 * don't take up too much room in the sockbuf (cf. sbspace()).
7366 */
7367struct mbuf *
7368m_dup(struct mbuf *m, int how)
7369{
7370 struct mbuf *n, **np;
7371 struct mbuf *top;
7372 int copyhdr = 0;
7373
7374 np = &top;
7375 top = NULL;
7376 if (m->m_flags & M_PKTHDR) {
7377 copyhdr = 1;
7378 }
7379
7380 /*
7381 * Quick check: if we have one mbuf and its data fits in an
7382 * mbuf with packet header, just copy and go.
7383 */
7384 if (m->m_next == NULL) {
7385 /* Then just move the data into an mbuf and be done... */
7386 if (copyhdr) {
7387 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
7388 if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
7389 return NULL;
7390 }
7391 n->m_len = m->m_len;
7392 m_dup_pkthdr(to: n, from: m, how);
7393 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), n: m->m_len);
7394 return n;
7395 }
7396 } else if (m->m_len <= MLEN) {
7397 if ((n = _M_GET(how, m->m_type)) == NULL) {
7398 return NULL;
7399 }
7400 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), n: m->m_len);
7401 n->m_len = m->m_len;
7402 return n;
7403 }
7404 }
7405 while (m != NULL) {
7406#if BLUE_DEBUG
7407 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
7408 m->m_data);
7409#endif
7410 if (copyhdr) {
7411 n = _M_GETHDR(how, m->m_type);
7412 } else {
7413 n = _M_GET(how, m->m_type);
7414 }
7415 if (n == NULL) {
7416 goto nospace;
7417 }
7418 if (m->m_flags & M_EXT) {
7419 if (m->m_len <= m_maxsize(MC_CL)) {
7420 MCLGET(n, how);
7421 } else if (m->m_len <= m_maxsize(MC_BIGCL)) {
7422 n = m_mbigget(m: n, wait: how);
7423 } else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
7424 n = m_m16kget(m: n, wait: how);
7425 }
7426 if (!(n->m_flags & M_EXT)) {
7427 (void) m_free(m: n);
7428 goto nospace;
7429 }
7430 } else {
7431 VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
7432 (copyhdr == 0 && m->m_len <= MLEN));
7433 }
7434 *np = n;
7435 if (copyhdr) {
7436 /* Don't use M_COPY_PKTHDR: preserve m_data */
7437 m_dup_pkthdr(to: n, from: m, how);
7438 copyhdr = 0;
7439 if (!(n->m_flags & M_EXT)) {
7440 n->m_data = (uintptr_t)n->m_pktdat;
7441 }
7442 }
7443 n->m_len = m->m_len;
7444 /*
7445		 * Get the dup on the same boundary as the original
7446 * Assume that the two mbufs have the same offset to data area
7447 * (up to word boundaries)
7448 */
7449 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), n: (unsigned)n->m_len);
7450 m = m->m_next;
7451 np = &n->m_next;
7452#if BLUE_DEBUG
7453 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
7454 n->m_data);
7455#endif
7456 }
7457
7458 return top;
7459
7460nospace:
7461 m_freem(m: top);
7462 return NULL;
7463}
7464
7465#define MBUF_MULTIPAGES(m) \
7466 (((m)->m_flags & M_EXT) && \
7467 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
7468 && (m)->m_len > PAGE_SIZE) || \
7469 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
7470 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
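/*
 * Illustrative example (assuming 4 KB pages): an M_EXT mbuf whose m_data
 * starts 512 bytes before a page boundary with m_len == 2048 spans two
 * pages, so MBUF_MULTIPAGES() is true and m_normalize()/m_expand() below
 * will carve it into one 512-byte mbuf and one 1536-byte mbuf, each
 * confined to a single page and all sharing the same cluster via
 * m_incref().
 */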
7471
7472static struct mbuf *
7473m_expand(struct mbuf *m, struct mbuf **last)
7474{
7475 struct mbuf *top = NULL;
7476 struct mbuf **nm = &top;
7477 uintptr_t data0, data;
7478 unsigned int len0, len;
7479
7480 VERIFY(MBUF_MULTIPAGES(m));
7481 VERIFY(m->m_next == NULL);
7482 data0 = (uintptr_t)m->m_data;
7483 len0 = m->m_len;
7484 *last = top;
7485
7486 for (;;) {
7487 struct mbuf *n;
7488
7489 data = data0;
7490 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
7491 len = PAGE_SIZE;
7492 } else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
7493 P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
7494 len = P2ROUNDUP(data, PAGE_SIZE) - data;
7495 } else {
7496 len = len0;
7497 }
7498
7499 VERIFY(len > 0);
7500 VERIFY(m->m_flags & M_EXT);
7501 m->m_data = data;
7502 m->m_len = len;
7503
7504 *nm = *last = m;
7505 nm = &m->m_next;
7506 m->m_next = NULL;
7507
7508 data0 += len;
7509 len0 -= len;
7510 if (len0 == 0) {
7511 break;
7512 }
7513
7514 n = _M_RETRY(M_DONTWAIT, MT_DATA);
7515 if (n == NULL) {
7516 m_freem(m: top);
7517 top = *last = NULL;
7518 break;
7519 }
7520
7521 n->m_ext = m->m_ext;
7522 m_incref(m);
7523 n->m_flags |= M_EXT;
7524 m = n;
7525 }
7526 return top;
7527}
7528
7529struct mbuf *
7530m_normalize(struct mbuf *m)
7531{
7532 struct mbuf *top = NULL;
7533 struct mbuf **nm = &top;
7534 boolean_t expanded = FALSE;
7535
7536 while (m != NULL) {
7537 struct mbuf *n;
7538
7539 n = m->m_next;
7540 m->m_next = NULL;
7541
7542 /* Does the data cross one or more page boundaries? */
7543 if (MBUF_MULTIPAGES(m)) {
7544 struct mbuf *last;
7545 if ((m = m_expand(m, last: &last)) == NULL) {
7546 m_freem(m: n);
7547 m_freem(m: top);
7548 top = NULL;
7549 break;
7550 }
7551 *nm = m;
7552 nm = &last->m_next;
7553 expanded = TRUE;
7554 } else {
7555 *nm = m;
7556 nm = &m->m_next;
7557 }
7558 m = n;
7559 }
7560 if (expanded) {
7561 os_atomic_inc(&mb_normalized, relaxed);
7562 }
7563 return top;
7564}
7565
7566/*
7567 * Append the specified data to the indicated mbuf chain,
7568 * Extend the mbuf chain if the new data does not fit in
7569 * existing space.
7570 *
7571 * Return 1 if able to complete the job; otherwise 0.
7572 */
7573int
7574m_append(struct mbuf *m0, int len, caddr_t cp)
7575{
7576 struct mbuf *m, *n;
7577 int remainder, space;
7578
7579 for (m = m0; m->m_next != NULL; m = m->m_next) {
7580 ;
7581 }
7582 remainder = len;
7583 space = M_TRAILINGSPACE(m);
7584 if (space > 0) {
7585 /*
7586 * Copy into available space.
7587 */
7588 if (space > remainder) {
7589 space = remainder;
7590 }
7591 bcopy(src: cp, mtod(m, caddr_t) + m->m_len, n: space);
7592 m->m_len += space;
7593 cp += space;
7594 remainder -= space;
7595 }
7596 while (remainder > 0) {
7597 /*
7598 * Allocate a new mbuf; could check space
7599 * and allocate a cluster instead.
7600 */
7601 n = m_get(M_WAITOK, m->m_type);
7602 if (n == NULL) {
7603 break;
7604 }
7605 n->m_len = min(MLEN, remainder);
7606 bcopy(cp, mtod(n, caddr_t), n->m_len);
7607 cp += n->m_len;
7608 remainder -= n->m_len;
7609 m->m_next = n;
7610 m = n;
7611 }
7612 if (m0->m_flags & M_PKTHDR) {
7613 m0->m_pkthdr.len += len - remainder;
7614 }
7615 return remainder == 0;
7616}
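
/*
 * Illustrative sketch (not part of the build): appending a small trailer
 * to a packet with m_append().  The helper and trailer are hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *m0)
{
	uint8_t trailer[4] = { 0xde, 0xad, 0xbe, 0xef };

	/* m_append() returns 1 on success, 0 if not all data could be added. */
	if (m_append(m0, (int)sizeof(trailer), (caddr_t)trailer) == 0) {
		return ENOBUFS;
	}
	/* If m0 has M_PKTHDR, m_pkthdr.len has already been adjusted above. */
	return 0;
}
#endif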
7617
7618struct mbuf *
7619m_last(struct mbuf *m)
7620{
7621 while (m->m_next != NULL) {
7622 m = m->m_next;
7623 }
7624 return m;
7625}
7626
7627unsigned int
7628m_fixhdr(struct mbuf *m0)
7629{
7630 u_int len;
7631
7632 VERIFY(m0->m_flags & M_PKTHDR);
7633
7634 len = m_length2(m0, NULL);
7635 m0->m_pkthdr.len = len;
7636 return len;
7637}
7638
7639unsigned int
7640m_length2(struct mbuf *m0, struct mbuf **last)
7641{
7642 struct mbuf *m;
7643 u_int len;
7644
7645 len = 0;
7646 for (m = m0; m != NULL; m = m->m_next) {
7647 len += m->m_len;
7648 if (m->m_next == NULL) {
7649 break;
7650 }
7651 }
7652 if (last != NULL) {
7653 *last = m;
7654 }
7655 return len;
7656}
7657
7658/*
7659 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
7660 * and clusters. If allocation fails and this cannot be completed, NULL will
7661 * be returned, but the passed in chain will be unchanged. Upon success,
7662 * the original chain will be freed, and the new chain will be returned.
7663 *
7664 * If a non-packet header is passed in, the original mbuf (or chain) will
7665 * be returned unharmed.
7666 *
7667 * If offset is specified, the first mbuf in the chain will have a leading
7668 * space of the amount stated by the "off" parameter.
7669 *
7670 * This routine requires that the m_pkthdr.header field of the original
7671 * mbuf chain is cleared by the caller.
7672 */
7673struct mbuf *
7674m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
7675{
7676 struct mbuf *m_new = NULL, *m_final = NULL;
7677 int progress = 0, length, pktlen;
7678
7679 if (!(m0->m_flags & M_PKTHDR)) {
7680 return m0;
7681 }
7682
7683 VERIFY(off < MHLEN);
7684 m_fixhdr(m0); /* Needed sanity check */
7685
7686 pktlen = m0->m_pkthdr.len + off;
7687 if (pktlen > MHLEN) {
7688 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
7689 } else {
7690 m_final = m_gethdr(how, MT_DATA);
7691 }
7692
7693 if (m_final == NULL) {
7694 goto nospace;
7695 }
7696
7697 if (off > 0) {
7698 pktlen -= off;
7699 m_final->m_data += off;
7700 }
7701
7702 /*
7703 * Caller must have handled the contents pointed to by this
7704 * pointer before coming here, as otherwise it will point to
7705 * the original mbuf which will get freed upon success.
7706 */
7707 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
7708
7709 if (m_dup_pkthdr(m_final, m0, how) == 0) {
7710 goto nospace;
7711 }
7712
7713 m_new = m_final;
7714
7715 while (progress < pktlen) {
7716 length = pktlen - progress;
7717 if (length > MCLBYTES) {
7718 length = MCLBYTES;
7719 }
7720 length -= ((m_new == m_final) ? off : 0);
7721 if (length < 0) {
7722 goto nospace;
7723 }
7724
7725 if (m_new == NULL) {
7726 if (length > MLEN) {
7727 m_new = m_getcl(how, MT_DATA, 0);
7728 } else {
7729 m_new = m_get(how, MT_DATA);
7730 }
7731 if (m_new == NULL) {
7732 goto nospace;
7733 }
7734 }
7735
7736 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
7737 progress += length;
7738 m_new->m_len = length;
7739 if (m_new != m_final) {
7740 m_cat(m_final, m_new);
7741 }
7742 m_new = NULL;
7743 }
7744 m_freem(m0);
7745 m0 = m_final;
7746 return m0;
7747nospace:
7748 if (m_final) {
7749 m_freem(m_final);
7750 }
7751 return NULL;
7752}
7753
7754struct mbuf *
7755m_defrag(struct mbuf *m0, int how)
7756{
7757 return m_defrag_offset(m0, 0, how);
7758}
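
/*
 * Illustrative sketch (not part of the build): compacting a chain while
 * reserving leading space for a header to be prepended later.  The helper
 * name is hypothetical; as noted in the block comment above, the caller
 * must clear m_pkthdr.pkt_hdr first.
 */
#if 0
static struct mbuf *
example_compact(struct mbuf *m)
{
	struct mbuf *n;

	m->m_pkthdr.pkt_hdr = NULL;
	/* Reserve 16 bytes of leading space in the first mbuf (16 < MHLEN). */
	n = m_defrag_offset(m, 16, M_DONTWAIT);
	if (n == NULL) {
		/* Allocation failed; the original chain is unchanged. */
		return m;
	}
	return n;
}
#endif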
7759
7760void
7761m_mchtype(struct mbuf *m, int t)
7762{
7763 mtype_stat_inc(t);
7764 mtype_stat_dec(m->m_type);
7765 (m)->m_type = t;
7766}
7767
7768void *__unsafe_indexable
7769m_mtod(struct mbuf *m)
7770{
7771 return m_mtod_current(m);
7772}
7773
7774void
7775m_mcheck(struct mbuf *m)
7776{
7777 _MCHECK(m);
7778}
7779
7780/*
7781 * Return a pointer to mbuf/offset of location in mbuf chain.
7782 */
7783struct mbuf *
7784m_getptr(struct mbuf *m, int loc, int *off)
7785{
7786 while (loc >= 0) {
7787 /* Normal end of search. */
7788 if (m->m_len > loc) {
7789 *off = loc;
7790 return m;
7791 } else {
7792 loc -= m->m_len;
7793 if (m->m_next == NULL) {
7794 if (loc == 0) {
7795 /* Point at the end of valid data. */
7796 *off = m->m_len;
7797 return m;
7798 }
7799 return NULL;
7800 }
7801 m = m->m_next;
7802 }
7803 }
7804 return NULL;
7805}
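
/*
 * Illustrative sketch (not part of the build): reading byte `loc` of a
 * chain via m_getptr().  The helper name is hypothetical.
 */
#if 0
static int
example_peek_byte(struct mbuf *m, int loc, uint8_t *out)
{
	int off;
	struct mbuf *p = m_getptr(m, loc, &off);

	/* m_getptr() may legally point at the end of valid data. */
	if (p == NULL || off >= p->m_len) {
		return ERANGE;
	}
	*out = *(mtod(p, uint8_t *) + off);
	return 0;
}
#endif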
7806
7807#if CONFIG_MBUF_MCACHE
7808/*
7809 * Inform the corresponding mcache(s) that there's a waiter below.
7810 */
7811static void
7812mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
7813{
7814 mcache_waiter_inc(m_cache(class));
7815 if (comp) {
7816 if (class == MC_CL) {
7817 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7818 } else if (class == MC_BIGCL) {
7819 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7820 } else if (class == MC_16KCL) {
7821 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
7822 } else {
7823 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7824 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7825 }
7826 }
7827}
7828
7829/*
7830 * Inform the corresponding mcache(s) that there's no more waiter below.
7831 */
7832static void
7833mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
7834{
7835 mcache_waiter_dec(m_cache(class));
7836 if (comp) {
7837 if (class == MC_CL) {
7838 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7839 } else if (class == MC_BIGCL) {
7840 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7841 } else if (class == MC_16KCL) {
7842 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
7843 } else {
7844 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7845 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7846 }
7847 }
7848}
7849
7850static bool mbuf_watchdog_defunct_active = false;
7851
7852#endif /* CONFIG_MBUF_MCACHE */
7853
7854static uint32_t
7855mbuf_watchdog_socket_space(struct socket *so)
7856{
7857 uint32_t space = 0;
7858
7859 if (so == NULL) {
7860 return 0;
7861 }
7862
7863 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
7864
7865#if INET
7866 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
7867 SOCK_PROTO(so) == IPPROTO_TCP) {
7868 space += tcp_reass_qlen_space(so);
7869 }
7870#endif /* INET */
7871
7872 return space;
7873}
7874
7875struct mbuf_watchdog_defunct_args {
7876 struct proc *top_app;
7877 uint32_t top_app_space_used;
7878 bool non_blocking;
7879};
7880
7881static bool
7882proc_fd_trylock(proc_t p)
7883{
7884 return lck_mtx_try_lock(&p->p_fd.fd_lock);
7885}
7886
7887static int
7888mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
7889{
7890 struct fileproc *fp = NULL;
7891 struct mbuf_watchdog_defunct_args *args =
7892 (struct mbuf_watchdog_defunct_args *)arg;
7893 uint32_t space_used = 0;
7894
7895 /*
7896 * Non-blocking is only used when dumping the mbuf usage from the watchdog
7897 */
7898 if (args->non_blocking) {
7899 if (!proc_fd_trylock(p)) {
7900 return PROC_RETURNED;
7901 }
7902 } else {
7903 proc_fdlock(p);
7904 }
7905 fdt_foreach(fp, p) {
7906 struct fileglob *fg = fp->fp_glob;
7907 struct socket *so = NULL;
7908
7909 if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
7910 continue;
7911 }
7912 so = fg_get_data(fg);
7913 /*
7914 * We calculate the space without the socket
7915 * lock because we don't want to be blocked
7916 * by another process that called send() and
7917 * is stuck waiting for mbufs.
7918 *
7919 * These variables are 32-bit so we don't have
7920 * to worry about incomplete reads.
7921 */
7922 space_used += mbuf_watchdog_socket_space(so);
7923 }
7924 proc_fdunlock(p);
7925 if (space_used > args->top_app_space_used) {
7926 if (args->top_app != NULL) {
7927 proc_rele(args.top_app);
7928 }
7929 args->top_app = p;
7930 args->top_app_space_used = space_used;
7931
7932 return PROC_CLAIMED;
7933 } else {
7934 return PROC_RETURNED;
7935 }
7936}
7937
7938extern char *proc_name_address(void *p);
7939
7940static void
7941mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
7942{
7943#pragma unused(arg0, arg1)
7944 struct mbuf_watchdog_defunct_args args = {};
7945 struct fileproc *fp = NULL;
7946
7947 args.non_blocking = false;
7948 proc_iterate(PROC_ALLPROCLIST,
7949 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
7950
7951 /*
7952 * Defunct all sockets from this app.
7953 */
7954 if (args.top_app != NULL) {
7955#if CONFIG_MBUF_MCACHE
7956 /* Restart the watchdog count. */
7957 lck_mtx_lock(mbuf_mlock);
7958 microuptime(&mb_wdtstart);
7959 lck_mtx_unlock(mbuf_mlock);
7960#endif
7961 os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
7962 __func__,
7963 proc_name_address(args.top_app),
7964 proc_pid(args.top_app));
7965 proc_fdlock(args.top_app);
7966 fdt_foreach(fp, args.top_app) {
7967 struct fileglob *fg = fp->fp_glob;
7968 struct socket *so = NULL;
7969
7970 if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
7971 continue;
7972 }
7973 so = (struct socket *)fp_get_data(fp);
7974 if (!socket_try_lock(so)) {
7975 continue;
7976 }
7977 if (sosetdefunct(args.top_app, so,
7978 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
7979 TRUE) == 0) {
7980 sodefunct(args.top_app, so,
7981 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
7982 }
7983 socket_unlock(so, 0);
7984 }
7985 proc_fdunlock(args.top_app);
7986 proc_rele(args.top_app);
7987 mbstat.m_forcedefunct++;
7988#if !CONFIG_MBUF_MCACHE
7989 zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
7990 zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
7991 zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
7992 zone_drain(zone_by_id(ZONE_ID_MBUF));
7993 zone_drain(zone_by_id(ZONE_ID_CLUSTER_2K));
7994 zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K));
7995 zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K));
7996 zone_drain(zone_by_id(ZONE_ID_MBUF_REF));
7997#endif
7998 }
7999#if CONFIG_MBUF_MCACHE
8000 mbuf_watchdog_defunct_active = false;
8001#endif
8002}
8003
8004#if !CONFIG_MBUF_MCACHE
8005static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted");
8006static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp);
8007static uint32_t mbuf_exhausted_mask;
8008
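/*
 * When any zone backing a class in MBUF_EXHAUSTED_DRAIN_MASK becomes
 * exhausted, mbuf_zone_exhausted_start() schedules the composite-cache
 * drain after MB_WDT_MAXTIME/10; when any zone in
 * MBUF_EXHAUSTED_DEFUNCT_MASK becomes exhausted, it schedules the
 * socket-defuncting pass after MB_WDT_MAXTIME/2.  Leaving the exhausted
 * state cancels the corresponding thread call.
 */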
8009#define MBUF_EXHAUSTED_DRAIN_MASK (\
8010 (1u << MC_MBUF) | \
8011 (1u << MC_CL) | \
8012 (1u << MC_BIGCL) | \
8013 (1u << MC_16KCL))
8014
8015#define MBUF_EXHAUSTED_DEFUNCT_MASK (\
8016 (1u << MC_MBUF) | \
8017 (1u << MC_MBUF_CL) | \
8018 (1u << MC_MBUF_BIGCL) | \
8019 (1u << MC_MBUF_16KCL))
8020
8021static void
8022mbuf_watchdog_drain_composite(thread_call_param_t arg0, thread_call_param_t arg1)
8023{
8024#pragma unused(arg0, arg1)
8025 zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
8026 zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
8027 zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
8028}
8029
8030static void
8031mbuf_zone_exhausted_start(uint32_t bit)
8032{
8033 uint64_t deadline;
8034 uint32_t mask;
8035
8036 mask = mbuf_exhausted_mask;
8037 mbuf_exhausted_mask = mask | bit;
8038
8039 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8040 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8041 clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 10,
8042 NSEC_PER_MSEC, &deadline);
8043 thread_call_enter_delayed(mbuf_drain_tcall, deadline);
8044 }
8045
8046 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8047 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8048 clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 2,
8049 NSEC_PER_MSEC, &deadline);
8050 thread_call_enter_delayed(mbuf_defunct_tcall, deadline);
8051 }
8052}
8053
8054static void
8055mbuf_zone_exhausted_end(uint32_t bit)
8056{
8057 uint32_t mask;
8058
8059 mask = (mbuf_exhausted_mask &= ~bit);
8060
8061 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8062 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8063 thread_call_cancel(mbuf_drain_tcall);
8064 }
8065
8066 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8067 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8068 thread_call_cancel(mbuf_defunct_tcall);
8069 }
8070}
8071
8072static void
8073mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted)
8074{
8075 uint32_t bit;
8076
8077 if (zid < m_class_to_zid(MBUF_CLASS_MIN) ||
8078 zid > m_class_to_zid(MBUF_CLASS_MAX)) {
8079 return;
8080 }
8081
8082 bit = 1u << m_class_from_zid(zid);
8083
8084 lck_ticket_lock_nopreempt(&mbuf_exhausted_lock, &mbuf_exhausted_grp);
8085
8086 if (exhausted) {
8087 mbuf_zone_exhausted_start(bit);
8088 } else {
8089 mbuf_zone_exhausted_end(bit);
8090 }
8091
8092 lck_ticket_unlock_nopreempt(&mbuf_exhausted_lock);
8093}
8094EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted);
8095#endif /* !CONFIG_MBUF_MCACHE */
8096
8097#if CONFIG_MBUF_MCACHE
8098/*
8099 * Called during slab (blocking and non-blocking) allocation. If there
8100 * is at least one waiter, and the time since the first waiter is blocked
8101 * is greater than the watchdog timeout, panic the system.
8102 */
8103static void
8104mbuf_watchdog(void)
8105{
8106 struct timeval now;
8107 unsigned int since;
8108 static thread_call_t defunct_tcall = NULL;
8109
8110 if (mb_waiters == 0 || !mb_watchdog) {
8111 return;
8112 }
8113
8114 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8115
8116 microuptime(&now);
8117 since = now.tv_sec - mb_wdtstart.tv_sec;
8118
8119 if (mbuf_watchdog_defunct_active) {
8120 /*
8121 * Don't panic the system while we are trying
8122 * to find sockets to defunct.
8123 */
8124 return;
8125 }
8126 if (since >= MB_WDT_MAXTIME) {
8127 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
8128 mb_waiters, since, mbuf_dump());
8129 /* NOTREACHED */
8130 }
8131 /*
8132 * Check if we are about to panic the system due
8133 * to lack of mbufs and start defuncting sockets
8134 * from processes that use too many sockets.
8135 *
8136 * We're always called with the mbuf_mlock held,
8137 * so that also protects mbuf_watchdog_defunct_active.
8138 */
8139 if (since >= MB_WDT_MAXTIME / 2) {
8140 /*
8141 * Start a thread to defunct sockets
8142 * from apps that are over-using their socket
8143 * buffers.
8144 */
8145 if (defunct_tcall == NULL) {
8146 defunct_tcall =
8147 thread_call_allocate_with_options(mbuf_watchdog_defunct,
8148 NULL,
8149 THREAD_CALL_PRIORITY_KERNEL,
8150 THREAD_CALL_OPTIONS_ONCE);
8151 }
8152 if (defunct_tcall != NULL) {
8153 mbuf_watchdog_defunct_active = true;
8154 thread_call_enter(defunct_tcall);
8155 }
8156 }
8157}
8158
8159/*
8160 * Called during blocking allocation. Returns TRUE if one or more objects
8161 * are available at the per-CPU caches layer and that allocation should be
8162 * retried at that level.
8163 */
8164static boolean_t
8165mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
8166{
8167 boolean_t mcache_retry = FALSE;
8168
8169 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8170
8171 /* Check if there's anything at the cache layer */
8172 if (mbuf_cached_above(class, wait)) {
8173 mcache_retry = TRUE;
8174 goto done;
8175 }
8176
8177 /* Nothing? Then try hard to get it from somewhere */
8178 m_reclaim(class, num, (wait & MCR_COMP));
8179
8180 /* We tried hard and got something? */
8181 if (m_infree(class) > 0) {
8182 mbstat.m_wait++;
8183 goto done;
8184 } else if (mbuf_cached_above(class, wait)) {
8185 mbstat.m_wait++;
8186 mcache_retry = TRUE;
8187 goto done;
8188 } else if (wait & MCR_TRYHARD) {
8189 mcache_retry = TRUE;
8190 goto done;
8191 }
8192
8193 /*
8194 * There's really nothing for us right now; inform the
8195 * cache(s) that there is a waiter below and go to sleep.
8196 */
8197 mbuf_waiter_inc(class, (wait & MCR_COMP));
8198
8199 VERIFY(!(wait & MCR_NOSLEEP));
8200
8201 /*
8202 * If this is the first waiter, arm the watchdog timer. Otherwise
8203 * check if we need to panic the system due to watchdog timeout.
8204 */
8205 if (mb_waiters == 0) {
8206 microuptime(&mb_wdtstart);
8207 } else {
8208 mbuf_watchdog();
8209 }
8210
8211 mb_waiters++;
8212 m_region_expand(class) += m_total(class) + num;
8213 /* wake up the worker thread */
8214 if (mbuf_worker_ready &&
8215 mbuf_worker_needs_wakeup) {
8216 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
8217 mbuf_worker_needs_wakeup = FALSE;
8218 }
8219 mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
8220 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
8221 mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));
8222
8223 /* We are now up; stop getting notified until next round */
8224 mbuf_waiter_dec(class, (wait & MCR_COMP));
8225
8226 /* We waited and got something */
8227 if (m_infree(class) > 0) {
8228 mbstat.m_wait++;
8229 goto done;
8230 } else if (mbuf_cached_above(class, wait)) {
8231 mbstat.m_wait++;
8232 mcache_retry = TRUE;
8233 }
8234done:
8235 return mcache_retry;
8236}
8237
8238__attribute__((noreturn))
8239static void
8240mbuf_worker_thread(void)
8241{
8242 int mbuf_expand;
8243
8244 while (1) {
8245 lck_mtx_lock(mbuf_mlock);
8246 mbwdog_logger("worker thread running");
8247 mbuf_worker_run_cnt++;
8248 mbuf_expand = 0;
8249 /*
8250 * Allocations are based on page size, so if we have depleted
8251 * the reserved spaces, try to free mbufs from the major classes.
8252 */
8253#if PAGE_SIZE == 4096
8254 uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
8255 uint32_t m_clusters = m_total(MC_CL);
8256 uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
8257 uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
8258 if (sumclusters >= nclusters) {
8259 mbwdog_logger("reclaiming bigcl");
8260 mbuf_drain_locked(TRUE);
8261 m_reclaim(MC_BIGCL, 4, FALSE);
8262 }
8263#else
8264 uint32_t m_16kclusters = m_total(MC_16KCL);
8265 if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
8266 mbwdog_logger("reclaiming 16kcl");
8267 mbuf_drain_locked(TRUE);
8268 m_reclaim(MC_16KCL, 4, FALSE);
8269 }
8270#endif
8271 if (m_region_expand(MC_CL) > 0) {
8272 int n;
8273 mb_expand_cl_cnt++;
8274 /* Adjust to current number of cluster in use */
8275 n = m_region_expand(MC_CL) -
8276 (m_total(MC_CL) - m_infree(MC_CL));
8277 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
8278 n = m_maxlimit(MC_CL) - m_total(MC_CL);
8279 }
8280 if (n > 0) {
8281 mb_expand_cl_total += n;
8282 }
8283 m_region_expand(MC_CL) = 0;
8284
8285 if (n > 0) {
8286 mbwdog_logger("expanding MC_CL by %d", n);
8287 freelist_populate(MC_CL, n, M_WAIT);
8288 }
8289 }
8290 if (m_region_expand(MC_BIGCL) > 0) {
8291 int n;
8292 mb_expand_bigcl_cnt++;
8293 /* Adjust to current number of 4 KB cluster in use */
8294 n = m_region_expand(MC_BIGCL) -
8295 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
8296 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
8297 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
8298 }
8299 if (n > 0) {
8300 mb_expand_bigcl_total += n;
8301 }
8302 m_region_expand(MC_BIGCL) = 0;
8303
8304 if (n > 0) {
8305 mbwdog_logger("expanding MC_BIGCL by %d", n);
8306 freelist_populate(MC_BIGCL, n, M_WAIT);
8307 }
8308 }
8309 if (m_region_expand(MC_16KCL) > 0) {
8310 int n;
8311 mb_expand_16kcl_cnt++;
8312 /* Adjust to current number of 16 KB cluster in use */
8313 n = m_region_expand(MC_16KCL) -
8314 (m_total(MC_16KCL) - m_infree(MC_16KCL));
8315 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
8316 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
8317 }
8318 if (n > 0) {
8319 mb_expand_16kcl_total += n;
8320 }
8321 m_region_expand(MC_16KCL) = 0;
8322
8323 if (n > 0) {
8324 mbwdog_logger("expanding MC_16KCL by %d", n);
8325 (void) freelist_populate(MC_16KCL, n, M_WAIT);
8326 }
8327 }
8328
8329 /*
8330 * Because we can run out of memory before filling the mbuf
8331 * map, we should not allocate more clusters than there are
8332 * mbufs -- otherwise we could have a large number of useless
8333 * clusters allocated.
8334 */
8335 mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
8336 m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
8337 m_total(MC_16KCL));
8338 uint32_t total_mbufs = m_total(MC_MBUF);
8339 uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
8340 m_total(MC_16KCL);
8341 if (total_mbufs < total_clusters) {
8342 mbwdog_logger("expanding MC_MBUF by %d",
8343 total_clusters - total_mbufs);
8344 }
8345 while (total_mbufs < total_clusters) {
8346 mb_expand_cnt++;
8347 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
8348 break;
8349 }
8350 total_mbufs = m_total(MC_MBUF);
8351 total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
8352 m_total(MC_16KCL);
8353 }
8354
8355 mbuf_worker_needs_wakeup = TRUE;
8356 /*
8357 * If there's a deadlock and we're not sending / receiving
8358 * packets, net_uptime() won't be updated. Update it here
8359 * so we are sure it's correct.
8360 */
8361 net_update_uptime();
8362 mbuf_worker_last_runtime = net_uptime();
8363 assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
8364 THREAD_UNINT);
8365 mbwdog_logger("worker thread sleeping");
8366 lck_mtx_unlock(mbuf_mlock);
8367 (void) thread_block((thread_continue_t)mbuf_worker_thread);
8368 }
8369}
8370
8371__attribute__((noreturn))
8372static void
8373mbuf_worker_thread_init(void)
8374{
8375 mbuf_worker_ready++;
8376 mbuf_worker_thread();
8377}
8378
8379static mcl_slab_t *
8380slab_get(void *buf)
8381{
8382 mcl_slabg_t *slg;
8383 unsigned int ix, k;
8384
8385 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8386
8387 VERIFY(MBUF_IN_MAP(buf));
8388 ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
8389 VERIFY(ix < maxslabgrp);
8390
8391 if ((slg = slabstbl[ix]) == NULL) {
8392 /*
8393 * In the current implementation, we never shrink the slabs
8394 * table; if we attempt to reallocate a cluster group when
8395 * it's already allocated, panic since this is a sign of a
8396 * memory corruption (slabstbl[ix] got nullified).
8397 */
8398 ++slabgrp;
8399 VERIFY(ix < slabgrp);
8400 /*
8401 * Slabs expansion can only be done single threaded; when
8402 * we get here, it must be as a result of m_clalloc() which
8403 * is serialized and therefore mb_clalloc_busy must be set.
8404 */
8405 VERIFY(mb_clalloc_busy);
8406 lck_mtx_unlock(mbuf_mlock);
8407
8408 /* This is a new buffer; create the slabs group for it */
8409 slg = zalloc_permanent_type(mcl_slabg_t);
8410 slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
8411 ZALIGN(mcl_slab_t));
8412
8413 lck_mtx_lock(mbuf_mlock);
8414 /*
8415 * No other thread could have gone into m_clalloc() after
8416 * we dropped the lock above, so verify that it's true.
8417 */
8418 VERIFY(mb_clalloc_busy);
8419
8420 slabstbl[ix] = slg;
8421
8422 /* Chain each slab in the group to its forward neighbor */
8423 for (k = 1; k < NSLABSPMB; k++) {
8424 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
8425 }
8426 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
8427
8428 /* And chain the last slab in the previous group to this */
8429 if (ix > 0) {
8430 VERIFY(slabstbl[ix - 1]->
8431 slg_slab[NSLABSPMB - 1].sl_next == NULL);
8432 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
8433 &slg->slg_slab[0];
8434 }
8435 }
8436
8437 ix = MTOPG(buf) % NSLABSPMB;
8438 VERIFY(ix < NSLABSPMB);
8439
8440 return &slg->slg_slab[ix];
8441}
8442
8443static void
8444slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
8445 void *base, void *head, unsigned int len, int refcnt, int chunks)
8446{
8447 sp->sl_class = class;
8448 sp->sl_flags = flags;
8449 sp->sl_base = base;
8450 sp->sl_head = head;
8451 sp->sl_len = len;
8452 sp->sl_refcnt = refcnt;
8453 sp->sl_chunks = chunks;
8454 slab_detach(sp);
8455}
8456
8457static void
8458slab_insert(mcl_slab_t *sp, mbuf_class_t class)
8459{
8460 VERIFY(slab_is_detached(sp));
8461 m_slab_cnt(class)++;
8462 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
8463 sp->sl_flags &= ~SLF_DETACHED;
8464
8465 /*
8466 * If a buffer spans multiple contiguous pages, clear the detached
8467 * flag on the trailing slabs too
8468 */
8469 if (class == MC_16KCL) {
8470 int k;
8471 for (k = 1; k < NSLABSP16KB; k++) {
8472 sp = sp->sl_next;
8473 /* Next slab must already be present */
8474 VERIFY(sp != NULL && slab_is_detached(sp));
8475 sp->sl_flags &= ~SLF_DETACHED;
8476 }
8477 }
8478}
8479
8480static void
8481slab_remove(mcl_slab_t *sp, mbuf_class_t class)
8482{
8483 int k;
8484 VERIFY(!slab_is_detached(sp));
8485 VERIFY(m_slab_cnt(class) > 0);
8486 m_slab_cnt(class)--;
8487 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
8488 slab_detach(sp);
8489 if (class == MC_16KCL) {
8490 for (k = 1; k < NSLABSP16KB; k++) {
8491 sp = sp->sl_next;
8492 /* Next slab must already be present */
8493 VERIFY(sp != NULL);
8494 VERIFY(!slab_is_detached(sp));
8495 slab_detach(sp);
8496 }
8497 }
8498}
8499
8500static boolean_t
8501slab_inrange(mcl_slab_t *sp, void *buf)
8502{
8503 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
8504 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
8505}
8506
8507#undef panic
8508
8509static void
8510slab_nextptr_panic(mcl_slab_t *sp, void *addr)
8511{
8512 int i;
8513 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
8514 uintptr_t buf = (uintptr_t)sp->sl_base;
8515
8516 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
8517 void *next = ((mcache_obj_t *)buf)->obj_next;
8518 if (next != addr) {
8519 continue;
8520 }
8521 if (!mclverify) {
8522 if (next != NULL && !MBUF_IN_MAP(next)) {
8523 mcache_t *cp = m_cache(sp->sl_class);
8524 panic("%s: %s buffer %p in slab %p modified "
8525 "after free at offset 0: %p out of range "
8526 "[%p-%p)\n", __func__, cp->mc_name,
8527 (void *)buf, sp, next, mbutl, embutl);
8528 /* NOTREACHED */
8529 }
8530 } else {
8531 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
8532 (mcache_obj_t *)buf);
8533 mcl_audit_verify_nextptr(next, mca);
8534 }
8535 }
8536}
8537
8538static void
8539slab_detach(mcl_slab_t *sp)
8540{
8541 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
8542 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
8543 sp->sl_flags |= SLF_DETACHED;
8544}
8545
8546static boolean_t
8547slab_is_detached(mcl_slab_t *sp)
8548{
8549 return (intptr_t)sp->sl_link.tqe_next == -1 &&
8550 (intptr_t)sp->sl_link.tqe_prev == -1 &&
8551 (sp->sl_flags & SLF_DETACHED);
8552}
8553
8554static void
8555mcl_audit_init(void *buf, mcache_audit_t **mca_list,
8556 mcache_obj_t **con_list, size_t con_size, unsigned int num)
8557{
8558 mcache_audit_t *mca, *mca_tail;
8559 mcache_obj_t *con = NULL;
8560 boolean_t save_contents = (con_list != NULL);
8561 unsigned int i, ix;
8562
8563 ASSERT(num <= NMBPG);
8564 ASSERT(con_list == NULL || con_size != 0);
8565
8566 ix = MTOPG(buf);
8567 VERIFY(ix < maxclaudit);
8568
8569 /* Make sure we haven't been here before */
8570 for (i = 0; i < num; i++) {
8571 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
8572 }
8573
8574 mca = mca_tail = *mca_list;
8575 if (save_contents) {
8576 con = *con_list;
8577 }
8578
8579 for (i = 0; i < num; i++) {
8580 mcache_audit_t *next;
8581
8582 next = mca->mca_next;
8583 bzero(mca, sizeof(*mca));
8584 mca->mca_next = next;
8585 mclaudit[ix].cl_audit[i] = mca;
8586
8587 /* Attach the contents buffer if requested */
8588 if (save_contents) {
8589 mcl_saved_contents_t *msc =
8590 (mcl_saved_contents_t *)(void *)con;
8591
8592 VERIFY(msc != NULL);
8593 VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
8594 VERIFY(con_size == sizeof(*msc));
8595 mca->mca_contents_size = con_size;
8596 mca->mca_contents = msc;
8597 con = con->obj_next;
8598 bzero(mca->mca_contents, mca->mca_contents_size);
8599 }
8600
8601 mca_tail = mca;
8602 mca = mca->mca_next;
8603 }
8604
8605 if (save_contents) {
8606 *con_list = con;
8607 }
8608
8609 *mca_list = mca_tail->mca_next;
8610 mca_tail->mca_next = NULL;
8611}
8612
8613static void
8614mcl_audit_free(void *buf, unsigned int num)
8615{
8616 unsigned int i, ix;
8617 mcache_audit_t *mca, *mca_list;
8618
8619 ix = MTOPG(buf);
8620 VERIFY(ix < maxclaudit);
8621
8622 if (mclaudit[ix].cl_audit[0] != NULL) {
8623 mca_list = mclaudit[ix].cl_audit[0];
8624 for (i = 0; i < num; i++) {
8625 mca = mclaudit[ix].cl_audit[i];
8626 mclaudit[ix].cl_audit[i] = NULL;
8627 if (mca->mca_contents) {
8628 mcache_free(mcl_audit_con_cache,
8629 mca->mca_contents);
8630 }
8631 }
8632 mcache_free_ext(mcache_audit_cache,
8633 (mcache_obj_t *)mca_list);
8634 }
8635}
8636
8637/*
8638 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
8639 * the corresponding audit structure for that buffer.
8640 */
8641static mcache_audit_t *
8642mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
8643{
8644 mcache_audit_t *mca = NULL;
8645 int ix = MTOPG(mobj), m_idx = 0;
8646 unsigned char *page_addr;
8647
8648 VERIFY(ix < maxclaudit);
8649 VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
8650
8651 page_addr = PGTOM(ix);
8652
8653 switch (class) {
8654 case MC_MBUF:
8655 /*
8656 * For the mbuf case, find the index of the page
8657 * used by the mbuf and use that index to locate the
8658 * base address of the page. Then find out the
8659 * mbuf index relative to the page base and use
8660 * it to locate the audit structure.
8661 */
8662 m_idx = MBPAGEIDX(page_addr, mobj);
8663 VERIFY(m_idx < (int)NMBPG);
8664 mca = mclaudit[ix].cl_audit[m_idx];
8665 break;
8666
8667 case MC_CL:
8668 /*
8669 * Same thing as above, but for 2KB clusters in a page.
8670 */
8671 m_idx = CLPAGEIDX(page_addr, mobj);
8672 VERIFY(m_idx < (int)NCLPG);
8673 mca = mclaudit[ix].cl_audit[m_idx];
8674 break;
8675
8676 case MC_BIGCL:
8677 m_idx = BCLPAGEIDX(page_addr, mobj);
8678 VERIFY(m_idx < (int)NBCLPG);
8679 mca = mclaudit[ix].cl_audit[m_idx];
8680 break;
8681 case MC_16KCL:
8682 /*
8683 * Same as above, but only return the first element.
8684 */
8685 mca = mclaudit[ix].cl_audit[0];
8686 break;
8687
8688 default:
8689 VERIFY(0);
8690 /* NOTREACHED */
8691 }
8692
8693 return mca;
8694}
8695
8696static void
8697mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
8698 boolean_t alloc)
8699{
8700 struct mbuf *m = addr;
8701 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
8702
8703 VERIFY(mca->mca_contents != NULL &&
8704 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
8705
8706 if (mclverify) {
8707 mcl_audit_verify_nextptr(next, mca);
8708 }
8709
8710 if (!alloc) {
8711 /* Save constructed mbuf fields */
8712 mcl_audit_save_mbuf(m, mca);
8713 if (mclverify) {
8714 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
8715 m_maxsize(MC_MBUF));
8716 }
8717 ((mcache_obj_t *)m)->obj_next = next;
8718 return;
8719 }
8720
8721 /* Check if the buffer has been corrupted while in freelist */
8722 if (mclverify) {
8723 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
8724 }
8725 /* Restore constructed mbuf fields */
8726 mcl_audit_restore_mbuf(m, mca, composite);
8727}
8728
8729static void
8730mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
8731{
8732 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
8733
8734 if (composite) {
8735 struct mbuf *next = m->m_next;
8736 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
8737 MBUF_IS_COMPOSITE(ms));
8738 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
8739 /*
8740 * We could have hand-picked the mbuf fields and restore
8741 * them individually, but that will be a maintenance
8742 * headache. Instead, restore everything that was saved;
8743 * the mbuf layer will recheck and reinitialize anyway.
8744 */
8745 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
8746 m->m_next = next;
8747 } else {
8748 /*
8749 * For a regular mbuf (no cluster attached) there's nothing
8750 * to restore other than the type field, which is expected
8751 * to be MT_FREE.
8752 */
8753 m->m_type = ms->m_type;
8754 }
8755 _MCHECK(m);
8756}
8757
8758static void
8759mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
8760{
8761 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
8762 _MCHECK(m);
8763 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
8764}
8765
8766static void
8767mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
8768 boolean_t save_next)
8769{
8770 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
8771
8772 if (!alloc) {
8773 if (mclverify) {
8774 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
8775 }
8776 if (save_next) {
8777 mcl_audit_verify_nextptr(next, mca);
8778 ((mcache_obj_t *)addr)->obj_next = next;
8779 }
8780 } else if (mclverify) {
8781 /* Check if the buffer has been corrupted while in freelist */
8782 mcl_audit_verify_nextptr(next, mca);
8783 mcache_audit_free_verify_set(mca, addr, 0, size);
8784 }
8785}
8786
8787static void
8788mcl_audit_scratch(mcache_audit_t *mca)
8789{
8790 void *stack[MCACHE_STACK_DEPTH + 1];
8791 mcl_scratch_audit_t *msa;
8792 struct timeval now;
8793
8794 VERIFY(mca->mca_contents != NULL);
8795 msa = MCA_SAVED_SCRATCH_PTR(mca);
8796
8797 msa->msa_pthread = msa->msa_thread;
8798 msa->msa_thread = current_thread();
8799 bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
8800 msa->msa_pdepth = msa->msa_depth;
8801 bzero(stack, sizeof(stack));
8802 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
8803 bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));
8804
8805 msa->msa_ptstamp = msa->msa_tstamp;
8806 microuptime(&now);
8807 /* tstamp is in ms relative to base_ts */
8808 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
8809 if ((now.tv_sec - mb_start.tv_sec) > 0) {
8810 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
8811 }
8812}
8813
8814__abortlike
8815static void
8816mcl_audit_mcheck_panic(struct mbuf *m)
8817{
8818 char buf[DUMP_MCA_BUF_SIZE];
8819 mcache_audit_t *mca;
8820
8821 MRANGE(m);
8822 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
8823
8824 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
8825 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
8826 /* NOTREACHED */
8827}
8828
8829__abortlike
8830static void
8831mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
8832{
8833 char buf[DUMP_MCA_BUF_SIZE];
8834 panic("mcl_audit: buffer %p modified after free at offset 0: "
8835 "%p out of range [%p-%p)\n%s\n",
8836 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
8837 /* NOTREACHED */
8838}
8839
8840static void
8841mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
8842{
8843 if (next != NULL && !MBUF_IN_MAP(next) &&
8844 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
8845 mcl_audit_verify_nextptr_panic(next, mca);
8846 }
8847}
8848
8849static uintptr_t
8850hash_mix(uintptr_t x)
8851{
8852#ifndef __LP64__
8853 x += ~(x << 15);
8854 x ^= (x >> 10);
8855 x += (x << 3);
8856 x ^= (x >> 6);
8857 x += ~(x << 11);
8858 x ^= (x >> 16);
8859#else
8860 x += ~(x << 32);
8861 x ^= (x >> 22);
8862 x += ~(x << 13);
8863 x ^= (x >> 8);
8864 x += (x << 3);
8865 x ^= (x >> 15);
8866 x += ~(x << 27);
8867 x ^= (x >> 31);
8868#endif
8869 return x;
8870}
8871
8872static uint32_t
8873hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
8874{
8875 uintptr_t hash = 0;
8876 uintptr_t mask = max_size - 1;
8877
8878 while (depth) {
8879 hash += bt[--depth];
8880 }
8881
8882 hash = hash_mix(hash) & mask;
8883
8884 assert(hash < max_size);
8885
8886 return (uint32_t) hash;
8887}
8888
8889static uint32_t
8890hashaddr(uintptr_t pt, uint32_t max_size)
8891{
8892 uintptr_t hash = 0;
8893 uintptr_t mask = max_size - 1;
8894
8895 hash = hash_mix(pt) & mask;
8896
8897 assert(hash < max_size);
8898
8899 return (uint32_t) hash;
8900}
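
/*
 * Both hashbacktrace() and hashaddr() reduce the mixed value with
 * `& (max_size - 1)`, which only acts as a modulus when max_size is a
 * power of two, so callers are expected to pass power-of-two table sizes.
 */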
8901
8902/* This function turns on mbuf leak detection */
8903static void
8904mleak_activate(void)
8905{
8906 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
8907 PE_parse_boot_argn("mleak_sample_factor",
8908 &mleak_table.mleak_sample_factor,
8909 sizeof(mleak_table.mleak_sample_factor));
8910
8911 if (mleak_table.mleak_sample_factor == 0) {
8912 mclfindleak = 0;
8913 }
8914
8915 if (mclfindleak == 0) {
8916 return;
8917 }
8918
8919 vm_size_t alloc_size =
8920 mleak_alloc_buckets * sizeof(struct mallocation);
8921 vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);
8922
8923 mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
8924 mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
8925 mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
8926 ZALIGN(mleak_stat_t));
8927
8928 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
8929#ifdef __LP64__
8930 mleak_stat->ml_isaddr64 = 1;
8931#endif /* __LP64__ */
8932}
8933
8934static void
8935mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
8936{
8937 int temp;
8938
8939 if (mclfindleak == 0) {
8940 return;
8941 }
8942
8943 if (!alloc) {
8944 return mleak_free(addr);
8945 }
8946
8947 temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed);
8948
8949 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
8950 uintptr_t bt[MLEAK_STACK_DEPTH];
8951 unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
8952 mleak_log(bt, addr, logged, num);
8953 }
8954}
8955
8956/*
8957 * This function records the allocation in the mleak_allocations table
8958 * and the backtrace in the mleak_traces table.  If the allocation slot
8959 * is in use, the old record is replaced; if the trace slot is in use by
8960 * a different trace, we bail out (or bump the refcount if it matches).
8961 */
8962static boolean_t
8963mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
8964{
8965 struct mallocation *allocation;
8966 struct mtrace *trace;
8967 uint32_t trace_index;
8968
8969 /* Quit if someone else modifying the tables */
8970 if (!lck_mtx_try_lock_spin(mleak_lock)) {
8971 mleak_table.total_conflicts++;
8972 return FALSE;
8973 }
8974
8975 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
8976 mleak_alloc_buckets)];
8977 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
8978 trace = &mleak_traces[trace_index];
8979
8980 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
8981 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
8982
8983 allocation->hitcount++;
8984 trace->hitcount++;
8985
8986 /*
8987 * If the allocation bucket we want is occupied
8988 * and the occupier has the same trace, just bail.
8989 */
8990 if (allocation->element != NULL &&
8991 trace_index == allocation->trace_index) {
8992 mleak_table.alloc_collisions++;
8993 lck_mtx_unlock(mleak_lock);
8994 return TRUE;
8995 }
8996
8997 /*
8998 * Store the backtrace in the traces array;
8999 * Size of zero = trace bucket is free.
9000 */
9001 if (trace->allocs > 0 &&
9002 bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
9003 /* Different, unique trace, but the same hash! Bail out. */
9004 trace->collisions++;
9005 mleak_table.trace_collisions++;
9006 lck_mtx_unlock(mleak_lock);
9007 return TRUE;
9008 } else if (trace->allocs > 0) {
9009 /* Same trace, already added, so increment refcount */
9010 trace->allocs++;
9011 } else {
9012 /* Found an unused trace bucket, so record the trace here */
9013 if (trace->depth != 0) {
9014 /* this slot previously used but not currently in use */
9015 mleak_table.trace_overwrites++;
9016 }
9017 mleak_table.trace_recorded++;
9018 trace->allocs = 1;
9019 memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
9020 trace->depth = depth;
9021 trace->collisions = 0;
9022 }
9023
9024 /* Step 2: Store the allocation record in the allocations array */
9025 if (allocation->element != NULL) {
9026 /*
9027 * Replace an existing allocation. No need to preserve
9028 * because only a subset of the allocations are being
9029 * recorded anyway.
9030 */
9031 mleak_table.alloc_collisions++;
9032 } else if (allocation->trace_index != 0) {
9033 mleak_table.alloc_overwrites++;
9034 }
9035 allocation->element = addr;
9036 allocation->trace_index = trace_index;
9037 allocation->count = num;
9038 mleak_table.alloc_recorded++;
9039 mleak_table.outstanding_allocs++;
9040
9041 lck_mtx_unlock(mleak_lock);
9042 return TRUE;
9043}
9044
9045static void
9046mleak_free(mcache_obj_t *addr)
9047{
9048 while (addr != NULL) {
9049 struct mallocation *allocation = &mleak_allocations
9050 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
9051
9052 if (allocation->element == addr &&
9053 allocation->trace_index < mleak_trace_buckets) {
9054 lck_mtx_lock_spin(mleak_lock);
9055 if (allocation->element == addr &&
9056 allocation->trace_index < mleak_trace_buckets) {
9057 struct mtrace *trace;
9058 trace = &mleak_traces[allocation->trace_index];
9059 /* allocs = 0 means trace bucket is unused */
9060 if (trace->allocs > 0) {
9061 trace->allocs--;
9062 }
9063 if (trace->allocs == 0) {
9064 trace->depth = 0;
9065 }
9066 /* NULL element means alloc bucket is unused */
9067 allocation->element = NULL;
9068 mleak_table.outstanding_allocs--;
9069 }
9070 lck_mtx_unlock(mleak_lock);
9071 }
9072 addr = addr->obj_next;
9073 }
9074}
9075
9076static void
9077mleak_sort_traces()
9078{
9079 int i, j, k;
9080 struct mtrace *swap;
9081
9082 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
9083 mleak_top_trace[i] = NULL;
9084 }
9085
9086 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
9087 if (mleak_traces[i].allocs <= 0) {
9088 continue;
9089 }
9090
9091 mleak_top_trace[j] = &mleak_traces[i];
9092 for (k = j; k > 0; k--) {
9093 if (mleak_top_trace[k]->allocs <=
9094 mleak_top_trace[k - 1]->allocs) {
9095 break;
9096 }
9097
9098 swap = mleak_top_trace[k - 1];
9099 mleak_top_trace[k - 1] = mleak_top_trace[k];
9100 mleak_top_trace[k] = swap;
9101 }
9102 j++;
9103 }
9104
9105 j--;
9106 for (; i < mleak_trace_buckets; i++) {
9107 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
9108 continue;
9109 }
9110
9111 mleak_top_trace[j] = &mleak_traces[i];
9112
9113 for (k = j; k > 0; k--) {
9114 if (mleak_top_trace[k]->allocs <=
9115 mleak_top_trace[k - 1]->allocs) {
9116 break;
9117 }
9118
9119 swap = mleak_top_trace[k - 1];
9120 mleak_top_trace[k - 1] = mleak_top_trace[k];
9121 mleak_top_trace[k] = swap;
9122 }
9123 }
9124}
9125
9126static void
9127mleak_update_stats()
9128{
9129 mleak_trace_stat_t *mltr;
9130 int i;
9131
9132 VERIFY(mleak_stat != NULL);
9133#ifdef __LP64__
9134 VERIFY(mleak_stat->ml_isaddr64);
9135#else
9136 VERIFY(!mleak_stat->ml_isaddr64);
9137#endif /* !__LP64__ */
9138 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
9139
9140 mleak_sort_traces();
9141
9142 mltr = &mleak_stat->ml_trace[0];
9143 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
9144 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
9145 int j;
9146
9147 if (mleak_top_trace[i] == NULL ||
9148 mleak_top_trace[i]->allocs == 0) {
9149 continue;
9150 }
9151
9152 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
9153 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
9154 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
9155 mltr->mltr_depth = mleak_top_trace[i]->depth;
9156
9157 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
9158 for (j = 0; j < mltr->mltr_depth; j++) {
9159 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
9160 }
9161
9162 mltr++;
9163 }
9164}
9165
9166static struct mbtypes {
9167 int mt_type;
9168 const char *mt_name;
9169} mbtypes[] = {
9170 { MT_DATA, "data" },
9171 { MT_OOBDATA, "oob data" },
9172 { MT_CONTROL, "ancillary data" },
9173 { MT_HEADER, "packet headers" },
9174 { MT_SOCKET, "socket structures" },
9175 { MT_PCB, "protocol control blocks" },
9176 { MT_RTABLE, "routing table entries" },
9177 { MT_HTABLE, "IMP host table entries" },
9178 { MT_ATABLE, "address resolution tables" },
9179 { MT_FTABLE, "fragment reassembly queue headers" },
9180 { MT_SONAME, "socket names and addresses" },
9181 { MT_SOOPTS, "socket options" },
9182 { MT_RIGHTS, "access rights" },
9183 { MT_IFADDR, "interface addresses" },
9184 { MT_TAG, "packet tags" },
9185 { 0, NULL }
9186};
9187
9188#define MBUF_DUMP_BUF_CHK() { \
9189 clen -= k; \
9190 if (clen < 1) \
9191 goto done; \
9192 c += k; \
9193}
9194
9195static char *
9196mbuf_dump(void)
9197{
9198 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
9199 totreturned = 0;
9200 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
9201 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
9202 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
9203 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
9204 uint8_t seen[256];
9205 struct mbtypes *mp;
9206 mb_class_stat_t *sp;
9207 mleak_trace_stat_t *mltr;
9208 char *c = mbuf_dump_buf;
9209 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
9210 struct mbuf_watchdog_defunct_args args = {};
9211
9212 mbuf_dump_buf[0] = '\0';
9213
9214 /* synchronize all statistics in the mbuf table */
9215 mbuf_stat_sync();
9216 mbuf_mtypes_sync(TRUE);
9217
9218 sp = &mb_stat->mbs_class[0];
9219 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
9220 u_int32_t mem;
9221
9222 if (m_class(i) == MC_MBUF) {
9223 m_mbufs = sp->mbcl_active;
9224 } else if (m_class(i) == MC_CL) {
9225 m_clfree = sp->mbcl_total - sp->mbcl_active;
9226 } else if (m_class(i) == MC_BIGCL) {
9227 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
9228 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
9229 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
9230 m_16kclusters = sp->mbcl_total;
9231 } else if (m_class(i) == MC_MBUF_CL) {
9232 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
9233 } else if (m_class(i) == MC_MBUF_BIGCL) {
9234 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
9235 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
9236 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
9237 }
9238
9239 mem = sp->mbcl_ctotal * sp->mbcl_size;
9240 totmem += mem;
9241 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
9242 sp->mbcl_size;
9243 totreturned += sp->mbcl_release_cnt;
9244 }
9245
9246 /* adjust free counts to include composite caches */
9247 m_clfree += m_mbufclfree;
9248 m_bigclfree += m_mbufbigclfree;
9249 m_16kclfree += m_mbuf16kclfree;
9250
9251 totmbufs = 0;
9252 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9253 totmbufs += mbstat.m_mtypes[mp->mt_type];
9254 }
9255 if (totmbufs > m_mbufs) {
9256 totmbufs = m_mbufs;
9257 }
9258 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
9259 MBUF_DUMP_BUF_CHK();
9260
9261 bzero(&seen, sizeof(seen));
9262 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9263 if (mbstat.m_mtypes[mp->mt_type] != 0) {
9264 seen[mp->mt_type] = 1;
9265 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
9266 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
9267 MBUF_DUMP_BUF_CHK();
9268 }
9269 }
9270 seen[MT_FREE] = 1;
9271 for (i = 0; i < nmbtypes; i++) {
9272 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
9273 k = scnprintf(c, clen, "\t%u mbufs allocated to "
9274 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
9275 MBUF_DUMP_BUF_CHK();
9276 }
9277 }
9278 if ((m_mbufs - totmbufs) > 0) {
9279 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
9280 m_mbufs - totmbufs);
9281 MBUF_DUMP_BUF_CHK();
9282 }
9283 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
9284 "%u/%u mbuf 4KB clusters in use\n",
9285 (unsigned int)(mbstat.m_clusters - m_clfree),
9286 (unsigned int)mbstat.m_clusters,
9287 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
9288 (unsigned int)mbstat.m_bigclusters);
9289 MBUF_DUMP_BUF_CHK();
9290
9291 if (njcl > 0) {
9292 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
9293 m_16kclusters - m_16kclfree, m_16kclusters,
9294 njclbytes / 1024);
9295 MBUF_DUMP_BUF_CHK();
9296 }
9297 totused = totmem - totfree;
9298 if (totmem == 0) {
9299 totpct = 0;
9300 } else if (totused < (ULONG_MAX / 100)) {
9301 totpct = (totused * 100) / totmem;
9302 } else {
9303 u_long totmem1 = totmem / 100;
9304 u_long totused1 = totused / 100;
9305 totpct = (totused1 * 100) / totmem1;
9306 }
9307 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
9308 "in use)\n", totmem / 1024, totpct);
9309 MBUF_DUMP_BUF_CHK();
9310 k = scnprintf(c, clen, "%lu KB returned to the system\n",
9311 totreturned / 1024);
9312 MBUF_DUMP_BUF_CHK();
9313
9314 net_update_uptime();
9315
9316 k = scnprintf(c, clen,
9317 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
9318 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
9319 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
9320 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
9321 mb_expand_16kcl_total);
9322 MBUF_DUMP_BUF_CHK();
9323 if (mbuf_worker_last_runtime != 0) {
9324 k = scnprintf(c, clen, "worker thread last run time: "
9325 "%llu (%llu seconds ago)\n",
9326 mbuf_worker_last_runtime,
9327 net_uptime() - mbuf_worker_last_runtime);
9328 MBUF_DUMP_BUF_CHK();
9329 }
9330 if (mbuf_drain_last_runtime != 0) {
9331 k = scnprintf(c, clen, "drain routine last run time: "
9332 "%llu (%llu seconds ago)\n",
9333 mbuf_drain_last_runtime,
9334 net_uptime() - mbuf_drain_last_runtime);
9335 MBUF_DUMP_BUF_CHK();
9336 }
9337
9338 /*
9339 * Log where the most mbufs have accumulated:
9340 * - Process socket buffers
9341 * - TCP reassembly queue
9342 * - Interface AQM queue (output) and DLIL input queue
9343 */
9344 args.non_blocking = true;
9345 proc_iterate(PROC_ALLPROCLIST,
9346 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
9347 if (args.top_app != NULL) {
9348 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
9349 args.top_app_space_used,
9350 proc_name_address(args.top_app),
9351 proc_pid(args.top_app));
9352 proc_rele(args.top_app);
9353 }
9354 MBUF_DUMP_BUF_CHK();
9355
9356#if INET
9357 k = dump_tcp_reass_qlen(c, clen);
9358 MBUF_DUMP_BUF_CHK();
9359#endif /* INET */
9360
9361#if MPTCP
9362 k = dump_mptcp_reass_qlen(c, clen);
9363 MBUF_DUMP_BUF_CHK();
9364#endif /* MPTCP */
9365
9366#if NETWORKING
9367 k = dlil_dump_top_if_qlen(c, clen);
9368 MBUF_DUMP_BUF_CHK();
9369#endif /* NETWORKING */
9370
9371 /* mbuf leak detection statistics */
9372 mleak_update_stats();
9373
9374 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
9375 MBUF_DUMP_BUF_CHK();
9376 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
9377 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
9378 mleak_table.mleak_sample_factor);
9379 MBUF_DUMP_BUF_CHK();
9380 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
9381 mleak_table.outstanding_allocs);
9382 MBUF_DUMP_BUF_CHK();
9383 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
9384 mleak_table.alloc_recorded, mleak_table.trace_recorded);
9385 MBUF_DUMP_BUF_CHK();
9386 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
9387 mleak_table.alloc_collisions, mleak_table.trace_collisions);
9388 MBUF_DUMP_BUF_CHK();
9389 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
9390 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
9391 MBUF_DUMP_BUF_CHK();
9392 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
9393 mleak_table.total_conflicts);
9394 MBUF_DUMP_BUF_CHK();
9395
9396 k = scnprintf(c, clen, "top %d outstanding traces:\n",
9397 mleak_stat->ml_cnt);
9398 MBUF_DUMP_BUF_CHK();
9399 for (i = 0; i < mleak_stat->ml_cnt; i++) {
9400 mltr = &mleak_stat->ml_trace[i];
9401 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
9402 "%llu hit(s), %llu collision(s)\n", (i + 1),
9403 mltr->mltr_allocs, mltr->mltr_hitcount,
9404 mltr->mltr_collisions);
9405 MBUF_DUMP_BUF_CHK();
9406 }
9407
9408 if (mleak_stat->ml_isaddr64) {
9409 k = scnprintf(c, clen, MB_LEAK_HDR_64);
9410 } else {
9411 k = scnprintf(c, clen, MB_LEAK_HDR_32);
9412 }
9413 MBUF_DUMP_BUF_CHK();
9414
9415 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
9416 k = scnprintf(c, clen, "%2d: ", (i + 1));
9417 MBUF_DUMP_BUF_CHK();
9418 for (j = 0; j < mleak_stat->ml_cnt; j++) {
9419 mltr = &mleak_stat->ml_trace[j];
9420 if (i < mltr->mltr_depth) {
9421 if (mleak_stat->ml_isaddr64) {
9422 k = scnprintf(c, clen, "0x%0llx ",
9423 (uint64_t)VM_KERNEL_UNSLIDE(
9424 mltr->mltr_addr[i]));
9425 } else {
9426 k = scnprintf(c, clen,
9427 "0x%08x ",
9428 (uint32_t)VM_KERNEL_UNSLIDE(
9429 mltr->mltr_addr[i]));
9430 }
9431 } else {
9432 if (mleak_stat->ml_isaddr64) {
9433 k = scnprintf(c, clen,
9434 MB_LEAK_SPACING_64);
9435 } else {
9436 k = scnprintf(c, clen,
9437 MB_LEAK_SPACING_32);
9438 }
9439 }
9440 MBUF_DUMP_BUF_CHK();
9441 }
9442 k = scnprintf(c, clen, "\n");
9443 MBUF_DUMP_BUF_CHK();
9444 }
9445
9446done:
9447 return mbuf_dump_buf;
9448}
9449
9450#undef MBUF_DUMP_BUF_CHK
9451#endif /* CONFIG_MBUF_MCACHE */
9452
9453/*
9454 * Convert between a regular and a packet header mbuf. Caller is responsible
9455 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
9456 */
9457int
9458m_reinit(struct mbuf *m, int hdr)
9459{
9460 int ret = 0;
9461
9462 if (hdr) {
9463 VERIFY(!(m->m_flags & M_PKTHDR));
9464 if (!(m->m_flags & M_EXT) &&
9465 (m->m_data != (uintptr_t)m->m_dat || m->m_len > 0)) {
9466 /*
9467 * If there's no external cluster attached and the
9468 * mbuf appears to contain user data, we cannot
9469 * safely convert this to a packet header mbuf,
9470 * as the packet header structure might overlap
9471 * with the data.
9472 */
9473 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
9474 "m_data %llx (expected %llx), "
9475 "m_len %d (expected 0)\n",
9476 __func__,
9477 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
9478 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
9479 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
9480 ret = EBUSY;
9481 } else {
9482 VERIFY((m->m_flags & M_EXT) || m->m_data == (uintptr_t)m->m_dat);
9483 m->m_flags |= M_PKTHDR;
9484 MBUF_INIT_PKTHDR(m);
9485 }
9486 } else {
9487 /* Check for scratch area overflow */
9488 m_redzone_verify(m);
9489 /* Free the aux data and tags if there is any */
9490 m_tag_delete_chain(m);
9491 m_do_tx_compl_callback(m, NULL);
9492 m->m_flags &= ~M_PKTHDR;
9493 }
9494
9495 return ret;
9496}
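
/*
 * Illustrative sketch (not part of the build): demoting a packet header
 * mbuf to a plain mbuf and promoting it back.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_toggle_pkthdr(struct mbuf *m)
{
	/* Drops tags and completion callbacks, then clears M_PKTHDR. */
	(void) m_reinit(m, 0);

	/*
	 * Converting back succeeds only if the mbuf has a cluster
	 * attached, or if m_data still points at m_dat with m_len == 0;
	 * otherwise m_reinit() returns EBUSY and leaves the mbuf alone.
	 */
	return m_reinit(m, 1);
}
#endif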
9497
9498int
9499m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
9500{
9501 ASSERT(m->m_flags & M_EXT);
9502 return os_atomic_cmpxchg(&MEXT_PRIV(m), o, n, acq_rel);
9503}
9504
9505uint32_t
9506m_ext_get_prop(struct mbuf *m)
9507{
9508 ASSERT(m->m_flags & M_EXT);
9509 return MEXT_PRIV(m);
9510}
9511
9512int
9513m_ext_paired_is_active(struct mbuf *m)
9514{
9515 return MBUF_IS_PAIRED(m) ? (MEXT_PREF(m) > MEXT_MINREF(m)) : 1;
9516}
9517
9518void
9519m_ext_paired_activate(struct mbuf *m)
9520{
9521 struct ext_ref *rfa;
9522 int hdr, type;
9523 caddr_t extbuf;
9524 m_ext_free_func_t extfree;
9525 u_int extsize;
9526
9527 VERIFY(MBUF_IS_PAIRED(m));
9528 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
9529 VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
9530
9531 hdr = (m->m_flags & M_PKTHDR);
9532 type = m->m_type;
9533 extbuf = m->m_ext.ext_buf;
9534 extfree = m_get_ext_free(m);
9535 extsize = m->m_ext.ext_size;
9536 rfa = m_get_rfa(m);
9537
9538 VERIFY(extbuf != NULL && rfa != NULL);
9539
9540 /*
9541 * Safe to reinitialize packet header tags, since it's
9542 * already taken care of at m_free() time. Similar to
9543 * what's done in m_clattach() for the cluster. Bump
9544 * up MEXT_PREF to indicate activation.
9545 */
9546 MBUF_INIT(m, hdr, type);
9547 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
9548 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
9549}
9550
9551void
9552m_scratch_init(struct mbuf *m)
9553{
9554 struct pkthdr *pkt = &m->m_pkthdr;
9555
9556 VERIFY(m->m_flags & M_PKTHDR);
9557
9558 /* See comments in <rdar://problem/14040693> */
9559 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
9560 panic_plain("Invalid attempt to modify guarded module-private "
9561 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
9562 /* NOTREACHED */
9563 }
9564
9565 bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
9566}
9567
9568/*
9569 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
9570 * xnu that intend on utilizing the module-private area should directly
9571 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
9572 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
9573 * to handing it off to another module, respectively.
9574 */
9575u_int32_t
9576m_scratch_get(struct mbuf *m, u_int8_t **p)
9577{
9578 struct pkthdr *pkt = &m->m_pkthdr;
9579
9580 VERIFY(m->m_flags & M_PKTHDR);
9581
9582 /* See comments in <rdar://problem/14040693> */
9583 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
9584 panic_plain("Invalid attempt to access guarded module-private "
9585 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
9586 /* NOTREACHED */
9587 }
9588
9589#if CONFIG_MBUF_MCACHE
9590 if (mcltrace) {
9591 mcache_audit_t *mca;
9592
9593 lck_mtx_lock(mbuf_mlock);
9594 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
9595 if (mca->mca_uflags & MB_SCVALID) {
9596 mcl_audit_scratch(mca);
9597 }
9598 lck_mtx_unlock(mbuf_mlock);
9599 }
9600#endif /* CONFIG_MBUF_MCACHE */
9601
9602 *p = (u_int8_t *)&pkt->pkt_mpriv;
9603 return sizeof(pkt->pkt_mpriv);
9604}
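
/*
 * Illustrative sketch (not part of the build) of the ownership discipline
 * described above: obtain the scratch area, then keep PKTF_PRIV_GUARDED set
 * for as long as this module owns the packet.  The helper name and cookie
 * are hypothetical.
 */
#if 0
static void
example_use_scratch(struct mbuf *m, uint32_t cookie)
{
	u_int8_t *scratch;
	u_int32_t scratch_len;

	VERIFY(m->m_flags & M_PKTHDR);

	/* Must be called before the area is guarded, or it will panic. */
	scratch_len = m_scratch_get(m, &scratch);
	VERIFY(scratch_len >= sizeof(cookie));
	bcopy(&cookie, scratch, sizeof(cookie));

	m->m_pkthdr.pkt_flags |= PKTF_PRIV_GUARDED;
	/* ... hand the packet around within this module ... */
	m->m_pkthdr.pkt_flags &= ~PKTF_PRIV_GUARDED;
}
#endif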
9605
9606void
9607m_add_crumb(struct mbuf *m, uint16_t crumb)
9608{
9609 VERIFY(m->m_flags & M_PKTHDR);
9610
9611 m->m_pkthdr.pkt_crumbs |= crumb;
9612}
9613
9614static void
9615m_redzone_init(struct mbuf *m)
9616{
9617 VERIFY(m->m_flags & M_PKTHDR);
9618 /*
9619	 * Each mbuf has a unique red zone pattern, which is an XOR
9620 * of the red zone cookie and the address of the mbuf.
9621 */
9622 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
9623}
9624
9625static void
9626m_redzone_verify(struct mbuf *m)
9627{
9628 u_int32_t mb_redzone;
9629
9630 VERIFY(m->m_flags & M_PKTHDR);
9631
9632 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
9633 if (m->m_pkthdr.redzone != mb_redzone) {
9634 panic("mbuf %p redzone violation with value 0x%x "
9635 "(instead of 0x%x, using cookie 0x%x)\n",
9636 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
9637 /* NOTREACHED */
9638 }
9639}
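/*
 * Invariant (illustrative, not compiled): for any pkthdr mbuf initialized by
 * m_redzone_init(), the following holds until the header is torn down,
 * whatever the boot-time random cookie happens to be:
 *
 *	(m->m_pkthdr.redzone ^ mb_redzone_cookie) == (u_int32_t)(uintptr_t)m
 *
 * m_redzone_verify() recomputes the right-hand side and panics on a mismatch,
 * catching writes that ran past the scratch area into the red zone.
 */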
9640
9641__private_extern__ inline void
9642m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
9643 caddr_t ext_arg)
9644{
9645 VERIFY(m->m_flags & M_EXT);
9646 if (rfa != NULL) {
9647 m_set_rfa(m, rfa);
9648 if (ext_free != NULL) {
9649 rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
9650 mb_obscure_extfree;
9651 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
9652 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9653 if (ext_arg != NULL) {
9654 m->m_ext.ext_arg =
9655 (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
9656 } else {
9657 m->m_ext.ext_arg = NULL;
9658 }
9659 } else {
9660 rfa->ext_token = 0;
9661 m->m_ext.ext_free = NULL;
9662 m->m_ext.ext_arg = NULL;
9663 }
9664 } else {
9665 /*
9666		 * If we are going to lose the cookie in ext_token by
9667 * resetting the rfa, we should use the global cookie
9668 * to obscure the ext_free and ext_arg pointers.
9669 */
9670 if (ext_free != NULL) {
9671 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
9672 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9673 if (ext_arg != NULL) {
9674 m->m_ext.ext_arg =
9675 (caddr_t)((uintptr_t)ext_arg ^
9676 mb_obscure_extfree);
9677 } else {
9678 m->m_ext.ext_arg = NULL;
9679 }
9680 } else {
9681 m->m_ext.ext_free = NULL;
9682 m->m_ext.ext_arg = NULL;
9683 }
9684 m->m_ext.ext_refflags = NULL;
9685 }
9686}
9687
9688__private_extern__ inline struct ext_ref *
9689m_get_rfa(struct mbuf *m)
9690{
9691 if (m->m_ext.ext_refflags == NULL) {
9692 return NULL;
9693 } else {
9694 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
9695 }
9696}
9697
9698static inline void
9699m_set_rfa(struct mbuf *m, struct ext_ref *rfa)
9700{
9701 if (rfa != NULL) {
9702 m->m_ext.ext_refflags =
9703 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
9704 } else {
9705 m->m_ext.ext_refflags = NULL;
9706 }
9707}
9708
9709__private_extern__ inline m_ext_free_func_t
9710m_get_ext_free(struct mbuf *m)
9711{
9712 struct ext_ref *rfa;
9713 if (m->m_ext.ext_free == NULL) {
9714 return NULL;
9715 }
9716
9717 rfa = m_get_rfa(m);
9718 if (rfa == NULL) {
9719 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
9720 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9721 } else {
9722 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
9723 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9724 }
9725}
9726
9727__private_extern__ inline caddr_t
9728m_get_ext_arg(struct mbuf *m)
9729{
9730 struct ext_ref *rfa;
9731 if (m->m_ext.ext_arg == NULL) {
9732 return NULL;
9733 }
9734
9735 rfa = m_get_rfa(m);
9736 if (rfa == NULL) {
9737 return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
9738 } else {
9739 return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
9740 rfa->ext_token);
9741 }
9742}
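/*
 * Illustrative round trip (not compiled; "my_free" and "my_arg" are
 * hypothetical). With an rfa attached, m_set_ext() stores ext_free/ext_arg
 * obscured with the per-rfa token, and the getters undo the obscuring:
 *
 *	m_set_ext(m, rfa, my_free, my_arg);
 *	VERIFY(m_get_ext_free(m) == my_free);
 *	VERIFY(m_get_ext_arg(m) == my_arg);
 *
 * The raw m->m_ext fields therefore never hold the plain pointers, which
 * makes them harder to forge or leak.
 */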
9743
9744#if CONFIG_MBUF_MCACHE
9745/*
9746 * Simple routine to avoid taking the lock when we can't run the
9747 * mbuf drain.
9748 */
9749static int
9750mbuf_drain_checks(boolean_t ignore_waiters)
9751{
9752 if (mb_drain_maxint == 0) {
9753 return 0;
9754 }
9755 if (!ignore_waiters && mb_waiters != 0) {
9756 return 0;
9757 }
9758
9759 return 1;
9760}
9761
9762/*
9763 * Called by the VM when there's memory pressure or when we've exhausted
9764 * the 4k/16k reserved space.
9765 */
9766static void
9767mbuf_drain_locked(boolean_t ignore_waiters)
9768{
9769 mbuf_class_t mc;
9770 mcl_slab_t *sp, *sp_tmp, *nsp;
9771 unsigned int num, k, interval, released = 0;
9772 unsigned long total_mem = 0, use_mem = 0;
9773 boolean_t ret, purge_caches = FALSE;
9774 ppnum_t offset;
9775 mcache_obj_t *obj;
9776 unsigned long per;
9777 static unsigned char scratch[32];
9778 static ppnum_t scratch_pa = 0;
9779
9780 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
9781 if (!mbuf_drain_checks(ignore_waiters)) {
9782 return;
9783 }
9784 if (scratch_pa == 0) {
9785 bzero(scratch, sizeof(scratch));
9786 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
9787 VERIFY(scratch_pa);
9788 } else if (mclverify) {
9789 /*
9790 * Panic if a driver wrote to our scratch memory.
9791 */
9792 for (k = 0; k < sizeof(scratch); k++) {
9793 if (scratch[k]) {
9794 panic("suspect DMA to freed address");
9795 }
9796 }
9797 }
9798 /*
9799 * Don't free memory too often as that could cause excessive
9800	 * waiting times for mbufs.  Also purge the caches if the previous
9801	 * drain ran within the last five drain intervals (mb_drain_maxint * 5).
9802 */
9803 if (mbuf_drain_last_runtime != 0) {
9804 interval = net_uptime() - mbuf_drain_last_runtime;
9805 if (interval <= mb_drain_maxint) {
9806 return;
9807 }
9808 if (interval <= mb_drain_maxint * 5) {
9809 purge_caches = TRUE;
9810 }
9811 }
9812 mbuf_drain_last_runtime = net_uptime();
9813 /*
9814 * Don't free any memory if we're using 60% or more.
9815 */
9816 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9817 total_mem += m_total(mc) * m_maxsize(mc);
9818 use_mem += m_active(mc) * m_maxsize(mc);
9819 }
9820 per = (use_mem * 100) / total_mem;
9821 if (per >= 60) {
9822 return;
9823 }
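	/*
	 * Example: with total_mem == 64 MB, use_mem == 32 MB gives per == 50
	 * and the drain continues, while use_mem == 40 MB gives per == 62 and
	 * the check above returns without freeing anything.
	 */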
9824 /*
9825	 * Purge all the caches.  This effectively disables caching
9826	 * for a few seconds, but the mbuf worker thread will re-enable
9827	 * them shortly afterwards.
9828 */
9829 if (purge_caches == TRUE) {
9830 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9831 if (m_total(mc) < m_avgtotal(mc)) {
9832 continue;
9833 }
9834 lck_mtx_unlock(mbuf_mlock);
9835 ret = mcache_purge_cache(m_cache(mc), FALSE);
9836 lck_mtx_lock(mbuf_mlock);
9837 if (ret == TRUE) {
9838 m_purge_cnt(mc)++;
9839 }
9840 }
9841 }
9842 /*
9843 * Move the objects from the composite class freelist to
9844 * the rudimentary slabs list, but keep at least 10% of the average
9845 * total in the freelist.
9846 */
9847 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9848 while (m_cobjlist(mc) &&
9849 m_total(mc) < m_avgtotal(mc) &&
9850 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9851 obj = m_cobjlist(mc);
9852 m_cobjlist(mc) = obj->obj_next;
9853 obj->obj_next = NULL;
9854 num = cslab_free(mc, obj, 1);
9855 VERIFY(num == 1);
9856 m_free_cnt(mc)++;
9857 m_infree(mc)--;
9858 /* cslab_free() handles m_total */
9859 }
9860 }
9861 /*
9862 * Free the buffers present in the slab list up to 10% of the total
9863 * average per class.
9864 *
9865 * We walk the list backwards in an attempt to reduce fragmentation.
9866 */
9867 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
9868 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
9869 /*
9870 * Process only unused slabs occupying memory.
9871 */
9872 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
9873 sp->sl_base == NULL) {
9874 continue;
9875 }
9876 if (m_total(mc) < m_avgtotal(mc) ||
9877 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9878 break;
9879 }
9880 slab_remove(sp, mc);
9881 switch (mc) {
9882 case MC_MBUF:
9883 m_infree(mc) -= NMBPG;
9884 m_total(mc) -= NMBPG;
9885 if (mclaudit != NULL) {
9886 mcl_audit_free(sp->sl_base, NMBPG);
9887 }
9888 break;
9889 case MC_CL:
9890 m_infree(mc) -= NCLPG;
9891 m_total(mc) -= NCLPG;
9892 if (mclaudit != NULL) {
9893 mcl_audit_free(sp->sl_base, NMBPG);
9894 }
9895 break;
9896 case MC_BIGCL:
9897 {
9898 m_infree(mc) -= NBCLPG;
9899 m_total(mc) -= NBCLPG;
9900 if (mclaudit != NULL) {
9901 mcl_audit_free(sp->sl_base, NMBPG);
9902 }
9903 break;
9904 }
9905 case MC_16KCL:
9906 m_infree(mc)--;
9907 m_total(mc)--;
9908 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
9909 nsp = nsp->sl_next;
9910 VERIFY(nsp->sl_refcnt == 0 &&
9911 nsp->sl_base != NULL &&
9912 nsp->sl_len == 0);
9913 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
9914 0);
9915 nsp->sl_flags = 0;
9916 }
9917 if (mclaudit != NULL) {
9918 if (sp->sl_len == PAGE_SIZE) {
9919 mcl_audit_free(sp->sl_base,
9920 NMBPG);
9921 } else {
9922 mcl_audit_free(sp->sl_base, 1);
9923 }
9924 }
9925 break;
9926 default:
9927 /*
9928 * The composite classes have their own
9929 * freelist (m_cobjlist), so we only
9930 * process rudimentary classes here.
9931 */
9932 VERIFY(0);
9933 }
9934 m_release_cnt(mc) += m_size(mc);
9935 released += m_size(mc);
9936 VERIFY(sp->sl_base != NULL &&
9937 sp->sl_len >= PAGE_SIZE);
9938 offset = MTOPG(sp->sl_base);
9939 /*
9940 * Make sure the IOMapper points to a valid, but
9941 * bogus, address. This should prevent further DMA
9942 * accesses to freed memory.
9943 */
9944 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
9945 mcl_paddr[offset] = 0;
9946 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
9947 sp->sl_len);
9948 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
9949 sp->sl_flags = 0;
9950 }
9951 }
9952 mbstat.m_drain++;
9953 mbstat.m_bigclusters = m_total(MC_BIGCL);
9954 mbstat.m_clusters = m_total(MC_CL);
9955 mbstat.m_mbufs = m_total(MC_MBUF);
9956 mbuf_stat_sync();
9957 mbuf_mtypes_sync(TRUE);
9958}
9959
9960__private_extern__ void
9961mbuf_drain(boolean_t ignore_waiters)
9962{
9963 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
9964 if (!mbuf_drain_checks(ignore_waiters)) {
9965 return;
9966 }
9967 lck_mtx_lock(mbuf_mlock);
9968 mbuf_drain_locked(ignore_waiters);
9969 lck_mtx_unlock(mbuf_mlock);
9970}
9971
9972
9973static int
9974m_drain_force_sysctl SYSCTL_HANDLER_ARGS
9975{
9976#pragma unused(arg1, arg2)
9977 int val = 0, err;
9978
9979 err = sysctl_handle_int(oidp, &val, 0, req);
9980 if (err != 0 || req->newptr == USER_ADDR_NULL) {
9981 return err;
9982 }
9983 if (val) {
9984 mbuf_drain(TRUE);
9985 }
9986
9987 return err;
9988}
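/*
 * This handler backs the kern.ipc.mb_drain_force sysctl declared below; any
 * non-zero write forces a drain even if there are threads waiting for mbufs.
 * For example, from userland (as root):
 *
 *	sysctl -w kern.ipc.mb_drain_force=1
 */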
9989
9990#if DEBUG || DEVELOPMENT
9991__printflike(3, 4)
9992static void
9993_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
9994{
9995 va_list ap;
9996 struct timeval now;
9997 char str[384], p[256];
9998 int len;
9999
10000 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
10001 if (mbwdog_logging == NULL) {
10002 /*
10003 * This might block under a mutex, which isn't really great,
10004 * but this happens once, so we'll live.
10005 */
10006 mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
10007 ZALIGN_NONE);
10008 }
10009 va_start(ap, fmt);
10010 vsnprintf(p, sizeof(p), fmt, ap);
10011 va_end(ap);
10012 microuptime(&now);
10013 len = scnprintf(str, sizeof(str),
10014 "\n%ld.%d (%d/%llx) %s:%d %s",
10015 now.tv_sec, now.tv_usec,
10016 proc_getpid(current_proc()),
10017 (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
10018 func, line, p);
10019 if (len < 0) {
10020 return;
10021 }
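	/*
	 * If appending would overflow the buffer, drop the oldest half of
	 * the log: keep the newer half, slide it to the front of the buffer
	 * and re-terminate it before appending the new entry.
	 */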
10022 if (mbwdog_logging_used + len > mbwdog_logging_size) {
10023 mbwdog_logging_used = mbwdog_logging_used / 2;
10024 memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
10025 mbwdog_logging_size - mbwdog_logging_used);
10026 mbwdog_logging[mbwdog_logging_used] = 0;
10027 }
10028 strlcat(mbwdog_logging, str, mbwdog_logging_size);
10029 mbwdog_logging_used += len;
10030}
10031
10032#endif // DEBUG || DEVELOPMENT
10033
10034static void
10035mtracelarge_register(size_t size)
10036{
10037 int i;
10038 struct mtracelarge *trace;
10039 uintptr_t bt[MLEAK_STACK_DEPTH];
10040 unsigned int depth;
10041
10042 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
10043 /* Check if this entry is already on the list. */
10044 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10045 trace = &mtracelarge_table[i];
10046 if (trace->size == size && trace->depth == depth &&
10047 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
10048 return;
10049 }
10050 }
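	/* Not seen before: replace the first entry recorded for a smaller size. */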
10051 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10052 trace = &mtracelarge_table[i];
10053 if (size > trace->size) {
10054 trace->depth = depth;
10055 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
10056 trace->size = size;
10057 break;
10058 }
10059 }
10060}
10061
10062#if DEBUG || DEVELOPMENT
10063
10064static int
10065mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
10066{
10067 char *str;
10068
10069 ifnet_head_lock_shared();
10070 lck_mtx_lock(mbuf_mlock);
10071
10072 str = mbuf_dump();
10073
10074 lck_mtx_unlock(mbuf_mlock);
10075 ifnet_head_done();
10076
10077 return sysctl_io_string(req, str, 0, 0, NULL);
10078}
10079
10080#endif /* DEBUG || DEVELOPMENT */
10081#endif /* CONFIG_MBUF_MCACHE */
10082
10083SYSCTL_DECL(_kern_ipc);
10084#if DEBUG || DEVELOPMENT
10085#if SKYWALK && CONFIG_MBUF_MCACHE
10086SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
10087 CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
10088 MC_THRESHOLD_SCALE_DOWN_FACTOR,
10089 "scale down factor for mbuf cache thresholds");
10090#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
10091#if CONFIG_MBUF_MCACHE
10092SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
10093 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
10094 0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
10095#endif /* CONFIG_MBUF_MCACHE */
10096#endif /* DEBUG || DEVELOPMENT */
10097SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
10098 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10099 0, 0, mbstat_sysctl, "S,mbstat", "");
10100SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
10101 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10102 0, 0, mb_stat_sysctl, "S,mb_stat", "");
10103#if CONFIG_MBUF_MCACHE
10104SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
10105 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10106 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
10107SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
10108 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10109 0, 0, mleak_table_sysctl, "S,mleak_table", "");
10110SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
10111 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
10112SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
10113 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
10114SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
10115 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
10116SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
10117 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
10118 m_drain_force_sysctl, "I",
10119 "Forces the mbuf garbage collection to run");
10120SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
10121 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
10122    "Minimum time interval (seconds) between mbuf garbage collection runs");
10123#endif /* CONFIG_MBUF_MCACHE */
10124SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
10125 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
10126    "Usage percentage at which memory pressure is triggered for an mbuf class");
10127#if CONFIG_MBUF_MCACHE
10128static int mb_uses_mcache = 1;
10129#else
10130static int mb_uses_mcache = 0;
10131#endif /* CONFIG_MBUF_MCACHE */
10132SYSCTL_INT(_kern_ipc, OID_AUTO, mb_uses_mcache,
10133 CTLFLAG_LOCKED, &mb_uses_mcache, 0,
10134 "Whether mbufs use mcache");
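/*
 * Illustrative userland queries (not compiled): the knobs and statistics
 * declared above live under kern.ipc, e.g.
 *
 *	sysctl kern.ipc.mb_uses_mcache
 *	sysctl kern.ipc.mb_memory_pressure_percentage
 *
 * Read-write entries (CTLFLAG_RW) can also be set with "sysctl -w".
 */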
10135