1/*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <ptrauth.h>
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/kernel.h>
77#include <sys/sysctl.h>
78#include <sys/syslog.h>
79#include <sys/protosw.h>
80#include <sys/domain.h>
81#include <sys/queue.h>
82#include <sys/proc.h>
83#include <sys/filedesc.h>
84#include <sys/file_internal.h>
85
86#include <dev/random/randomdev.h>
87
88#include <kern/kern_types.h>
89#include <kern/simple_lock.h>
90#include <kern/queue.h>
91#include <kern/sched_prim.h>
92#include <kern/backtrace.h>
93#include <kern/percpu.h>
94#include <kern/zalloc.h>
95
96#include <libkern/OSDebug.h>
97#include <libkern/libkern.h>
98
99#include <os/log.h>
100#include <os/ptrtools.h>
101
102#include <IOKit/IOMapper.h>
103
104#include <machine/limits.h>
105#include <machine/machine_routines.h>
106
107#if CONFIG_MBUF_MCACHE
108#include <sys/mcache.h>
109#endif /* CONFIG_MBUF_MCACHE */
110#include <net/ntstat.h>
111
112#if INET
113extern int dump_tcp_reass_qlen(char *, int);
114extern int tcp_reass_qlen_space(struct socket *);
115#endif /* INET */
116
117#if MPTCP
118extern int dump_mptcp_reass_qlen(char *, int);
119#endif /* MPTCP */
120
121
122#if NETWORKING
123extern int dlil_dump_top_if_qlen(char *, int);
124#endif /* NETWORKING */
125
126#if CONFIG_MBUF_MCACHE
127/*
128 * MBUF IMPLEMENTATION NOTES.
129 *
 * There are a total of 5 per-CPU caches:
131 *
132 * MC_MBUF:
133 * This is a cache of rudimentary objects of _MSIZE in size; each
134 * object represents an mbuf structure. This cache preserves only
135 * the m_type field of the mbuf during its transactions.
136 *
137 * MC_CL:
138 * This is a cache of rudimentary objects of MCLBYTES in size; each
139 * object represents a mcluster structure. This cache does not
140 * preserve the contents of the objects during its transactions.
141 *
142 * MC_BIGCL:
143 * This is a cache of rudimentary objects of MBIGCLBYTES in size; each
144 * object represents a mbigcluster structure. This cache does not
145 * preserve the contents of the objects during its transaction.
146 *
147 * MC_MBUF_CL:
148 * This is a cache of mbufs each having a cluster attached to it.
149 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
150 * fields of the mbuf related to the external cluster are preserved
151 * during transactions.
152 *
153 * MC_MBUF_BIGCL:
154 * This is a cache of mbufs each having a big cluster attached to it.
155 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
156 * fields of the mbuf related to the external cluster are preserved
157 * during transactions.
158 *
159 * OBJECT ALLOCATION:
160 *
161 * Allocation requests are handled first at the per-CPU (mcache) layer
162 * before falling back to the slab layer. Performance is optimal when
163 * the request is satisfied at the CPU layer because global data/lock
164 * never gets accessed. When the slab layer is entered for allocation,
165 * the slab freelist will be checked first for available objects before
166 * the VM backing store is invoked. Slab layer operations are serialized
167 * for all of the caches as the mbuf global lock is held most of the time.
168 * Allocation paths are different depending on the class of objects:
169 *
170 * a. Rudimentary object:
171 *
172 * { m_get_common(), m_clattach(), m_mclget(),
173 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
174 * composite object allocation }
175 * | ^
176 * | |
177 * | +-----------------------+
178 * v |
179 * mcache_alloc/mcache_alloc_ext() mbuf_slab_audit()
180 * | ^
181 * v |
182 * [CPU cache] -------> (found?) -------+
183 * | |
184 * v |
185 * mbuf_slab_alloc() |
186 * | |
187 * v |
188 * +---------> [freelist] -------> (found?) -------+
189 * | |
190 * | v
191 * | m_clalloc()
192 * | |
193 * | v
194 * +---<<---- kmem_mb_alloc()
195 *
196 * b. Composite object:
197 *
198 * { m_getpackets_internal(), m_allocpacket_internal() }
199 * | ^
200 * | |
201 * | +------ (done) ---------+
202 * v |
203 * mcache_alloc/mcache_alloc_ext() mbuf_cslab_audit()
204 * | ^
205 * v |
206 * [CPU cache] -------> (found?) -------+
207 * | |
208 * v |
209 * mbuf_cslab_alloc() |
210 * | |
211 * v |
212 * [freelist] -------> (found?) -------+
213 * | |
214 * v |
215 * (rudimentary object) |
216 * mcache_alloc/mcache_alloc_ext() ------>>-----+
217 *
218 * Auditing notes: If auditing is enabled, buffers will be subjected to
219 * integrity checks by the audit routine. This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller. As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern. It
 * will also restore any constructed data structure fields if necessary.
224 *
225 * OBJECT DEALLOCATION:
226 *
227 * Freeing an object simply involves placing it into the CPU cache; this
228 * pollutes the cache to benefit subsequent allocations. The slab layer
229 * will only be entered if the object is to be purged out of the cache.
230 * During normal operations, this happens only when the CPU layer resizes
231 * its bucket while it's adjusting to the allocation load. Deallocation
232 * paths are different depending on the class of objects:
233 *
234 * a. Rudimentary object:
235 *
236 * { m_free(), m_freem_list(), composite object deallocation }
237 * | ^
238 * | |
239 * | +------ (done) ---------+
240 * v |
241 * mcache_free/mcache_free_ext() |
242 * | |
243 * v |
244 * mbuf_slab_audit() |
245 * | |
246 * v |
247 * [CPU cache] ---> (not purging?) -----+
248 * | |
249 * v |
250 * mbuf_slab_free() |
251 * | |
252 * v |
253 * [freelist] ----------->>------------+
254 * (objects get purged to VM only on demand)
255 *
256 * b. Composite object:
257 *
258 * { m_free(), m_freem_list() }
259 * | ^
260 * | |
261 * | +------ (done) ---------+
262 * v |
263 * mcache_free/mcache_free_ext() |
264 * | |
265 * v |
266 * mbuf_cslab_audit() |
267 * | |
268 * v |
269 * [CPU cache] ---> (not purging?) -----+
270 * | |
271 * v |
272 * mbuf_cslab_free() |
273 * | |
274 * v |
275 * [freelist] ---> (not purging?) -----+
276 * | |
277 * v |
278 * (rudimentary object) |
279 * mcache_free/mcache_free_ext() ------->>------+
280 *
281 * Auditing notes: If auditing is enabled, the audit routine will save
282 * any constructed data structure fields (if necessary) before filling the
283 * contents of the buffers with DEADBEEF (free) pattern and recording the
284 * transaction. Buffers that are freed (whether at CPU or slab layer) are
285 * expected to contain the free pattern.
286 *
287 * DEBUGGING:
288 *
289 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
290 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT). Additionally,
291 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
292 * i.e. modify the boot argument parameter to "mbuf_debug=0x13". Leak
293 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
294 * "mbuf_debug=0x113". Note that debugging consumes more CPU and memory.
295 *
296 * Each object is associated with exactly one mcache_audit_t structure that
297 * contains the information related to its last buffer transaction. Given
 * the address of an object, the audit structure can be retrieved by finding
 * the position of the object relative to the base address of the cluster:
300 *
301 * +------------+ +=============+
302 * | mbuf addr | | mclaudit[i] |
303 * +------------+ +=============+
304 * | | cl_audit[0] |
305 * i = MTOBG(addr) +-------------+
306 * | +-----> | cl_audit[1] | -----> mcache_audit_t
307 * b = BGTOM(i) | +-------------+
308 * | | | ... |
309 * x = MCLIDX(b, addr) | +-------------+
310 * | | | cl_audit[7] |
311 * +-----------------+ +-------------+
312 * (e.g. x == 1)
313 *
314 * The mclaudit[] array is allocated at initialization time, but its contents
315 * get populated when the corresponding cluster is created. Because a page
 * can be turned into NMBPG mbufs, we preserve enough space for the mbufs
 * so that there is a 1-to-1 mapping between them. A page that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0], with
 * the remaining entries unused. For a 16KB cluster, only one entry from
 * the first page is allocated and used for the entire object.
321 */
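/*
 * Illustrative sketch of the lookup described above, kept under "#if 0" so
 * it is never compiled.  The helper name is hypothetical; the real per-class
 * lookup is done by mcl_audit_buf2mca() further below, using the MTOPG()/
 * PGTOM()/MBPAGEIDX() macros and the mclaudit[] array declared later in this
 * file.  Only the plain mbuf case is shown.
 */
#if 0
static mcache_audit_t *
example_mbuf_to_audit(struct mbuf *m)
{
	/* index of the page (cluster) that contains this mbuf */
	unsigned int pg = MTOPG(m);
	/* base address of that page */
	unsigned char *base = PGTOM(pg);
	/* which of the NMBPG mbuf slots within the page this mbuf occupies */
	unsigned int idx = MBPAGEIDX(base, m);

	return mclaudit[pg].cl_audit[idx];
}
#endif /* 0 */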
322#else
323/*
324 * MBUF IMPLEMENTATION NOTES (using zalloc).
325 *
326 * There are a total of 4 zones and 3 zcaches.
327 *
328 * MC_MBUF:
329 * This is a zone of rudimentary objects of _MSIZE in size; each
330 * object represents an mbuf structure. This cache preserves only
331 * the m_type field of the mbuf during its transactions.
332 *
333 * MC_CL:
334 * This is a zone of rudimentary objects of MCLBYTES in size; each
335 * object represents a mcluster structure. This cache does not
336 * preserve the contents of the objects during its transactions.
337 *
338 * MC_BIGCL:
339 * This is a zone of rudimentary objects of MBIGCLBYTES in size; each
340 * object represents a mbigcluster structure. This cache does not
341 * preserve the contents of the objects during its transaction.
342 *
343 * MC_16KCL:
344 * This is a zone of rudimentary objects of M16KCLBYTES in size; each
345 * object represents a m16kcluster structure. This cache does not
346 * preserve the contents of the objects during its transaction.
347 *
348 * MC_MBUF_CL:
349 * This is a cache of mbufs each having a cluster attached to it.
350 * It is backed by MC_MBUF and MC_CL rudimentary caches. Several
351 * fields of the mbuf related to the external cluster are preserved
352 * during transactions.
353 *
354 * MC_MBUF_BIGCL:
355 * This is a cache of mbufs each having a big cluster attached to it.
356 * It is backed by MC_MBUF and MC_BIGCL rudimentary caches. Several
357 * fields of the mbuf related to the external cluster are preserved
358 * during transactions.
359 *
360 * MC_MBUF_16KCL:
 * This is a cache of mbufs each having a 16KB cluster attached to it.
362 * It is backed by MC_MBUF and MC_16KCL rudimentary caches. Several
363 * fields of the mbuf related to the external cluster are preserved
364 * during transactions.
365 *
366 * OBJECT ALLOCATION:
367 *
368 * Allocation requests are handled first at the zalloc per-CPU layer
369 * before falling back to the zalloc depot. Performance is optimal when
370 * the request is satisfied at the CPU layer. zalloc has an additional
371 * overflow layer called the depot, not pictured in the diagram below.
372 *
373 * Allocation paths are different depending on the class of objects:
374 *
375 * a. Rudimentary object:
376 *
377 * { m_get_common(), m_clattach(), m_mclget(),
378 * m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
379 * composite object allocation }
380 * | ^
381 * | |
382 * | +------- (done) --------+
383 * v |
384 * zalloc_flags/zalloc_n() KASAN
385 * | ^
386 * v |
387 * +----> [zalloc per-CPU cache] -----> (found?) --+
388 * | | |
389 * | v |
390 * | [zalloc recirculation layer] --> (found?) ---+
391 * | |
392 * | v
393 * +--<<-- [zone backing store]
394 *
395 * b. Composite object:
396 *
397 * { m_getpackets_internal(), m_allocpacket_internal() }
398 * | ^
399 * | |
400 * | +------ (done) ---------+
401 * v |
402 * mz_composite_alloc() KASAN
403 * | ^
404 * v |
405 * zcache_alloc_n() |
406 * | |
407 * v |
408 * [zalloc per-CPU cache] --> mark_valid() ---+
409 * | |
410 * v |
411 * [zalloc recirculation layer] -> mark_valid() -+
412 * | |
413 * v |
414 * mz_composite_build() |
415 * | |
416 * v |
417 * (rudimentary objects) |
418 * zalloc_id() ---------------->>-----+
419 *
 * Auditing notes: If KASAN is enabled, buffers will be subjected to
421 * integrity checks by the AddressSanitizer.
422 *
423 * OBJECT DEALLOCATION:
424 *
425 * Freeing an object simply involves placing it into the CPU cache; this
426 * pollutes the cache to benefit subsequent allocations. The depot
427 * will only be entered if the object is to be purged out of the cache.
428 * Objects may be purged based on the overall memory pressure or
429 * during zone garbage collection.
 * To improve performance, objects are not zero-filled when freed,
 * unlike what is customary for other zalloc zones.
432 *
433 * Deallocation paths are different depending on the class of objects:
434 *
435 * a. Rudimentary object:
436 *
437 * { m_free(), m_freem_list(), composite object deallocation }
438 * | ^
439 * | |
440 * | +------ (done) ---------+
441 * v |
442 * zfree_nozero() |
443 * | |
444 * v |
445 * KASAN |
446 * | |
447 * v |
448 * [zalloc per-CPU cache] -> (not purging?) --+
449 * | |
450 * v |
451 * [zalloc recirculation layer] --->>----------+
452 *
453 *
454 * b. Composite object:
455 *
456 * { m_free(), m_freem_list() }
457 * | ^
458 * | |
459 * | +------ (done) ---------+
460 * v |
461 * mz_composite_free() |
462 * | |
463 * v |
464 * zcache_free_n() |
465 * | |
466 * v |
467 * KASAN |
468 * | |
469 * v |
470 * [zalloc per-CPU cache] -> mark_invalid() --+
471 * | |
472 * v |
473 * mz_composite_destroy() |
474 * | |
475 * v |
476 * (rudimentary object) |
477 * zfree_nozero() -------------->>------+
478 *
 * Auditing notes: If KASAN is enabled, buffers will be subjected to
480 * integrity checks by the AddressSanitizer.
481 *
482 * DEBUGGING:
483 *
484 * Debugging mbufs can be done by booting a KASAN enabled kernel.
485 */
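/*
 * Minimal usage sketch of the composite path described above, kept under
 * "#if 0" so it is never compiled.  mz_composite_alloc()/mz_composite_free()
 * are the wrappers declared further below; the surrounding function is
 * hypothetical and exists only to show the call flow.
 */
#if 0
static void
example_composite_roundtrip(void)
{
	/* mbuf + 2KB cluster; a blocking allocation goes through zcache_alloc_n() */
	struct mbuf *m = mz_composite_alloc(MC_MBUF_CL, Z_WAITOK);

	if (m != NULL) {
		/* ... use the mbuf ... */
		mz_composite_free(MC_MBUF_CL, m);
	}
}
#endif /* 0 */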
486
487#endif /* CONFIG_MBUF_MCACHE */
488
489/* TODO: should be in header file */
/* kernel translator */
491extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
492extern vm_map_t mb_map; /* special map */
493
494#if CONFIG_MBUF_MCACHE
495static uint32_t mb_kmem_contig_failed;
496static uint32_t mb_kmem_failed;
497static uint32_t mb_kmem_one_failed;
498/* Timestamp of allocation failures. */
499static uint64_t mb_kmem_contig_failed_ts;
500static uint64_t mb_kmem_failed_ts;
501static uint64_t mb_kmem_one_failed_ts;
502static uint64_t mb_kmem_contig_failed_size;
503static uint64_t mb_kmem_failed_size;
504static uint32_t mb_kmem_stats[6];
505#endif /* CONFIG_MBUF_MCACHE */
506
507/* Global lock */
508static LCK_GRP_DECLARE(mbuf_mlock_grp, "mbuf");
509static LCK_MTX_DECLARE(mbuf_mlock_data, &mbuf_mlock_grp);
510static lck_mtx_t *const mbuf_mlock = &mbuf_mlock_data;
511
512#if CONFIG_MBUF_MCACHE
513/* Back-end (common) layer */
514static uint64_t mb_expand_cnt;
515static uint64_t mb_expand_cl_cnt;
516static uint64_t mb_expand_cl_total;
517static uint64_t mb_expand_bigcl_cnt;
518static uint64_t mb_expand_bigcl_total;
519static uint64_t mb_expand_16kcl_cnt;
520static uint64_t mb_expand_16kcl_total;
521static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
522static uint32_t mbuf_worker_run_cnt;
523static uint64_t mbuf_worker_last_runtime;
524static uint64_t mbuf_drain_last_runtime;
525static int mbuf_worker_ready; /* worker thread is runnable */
526static unsigned int ncpu; /* number of CPUs */
527static ppnum_t *mcl_paddr; /* Array of cluster physical addresses */
528static ppnum_t mcl_pages; /* Size of array (# physical pages) */
529static ppnum_t mcl_paddr_base; /* Handle returned by IOMapper::iovmAlloc() */
530static mcache_t *ref_cache; /* Cache of cluster reference & flags */
531static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
532unsigned int mbuf_debug; /* patchable mbuf mcache flags */
#endif /* CONFIG_MBUF_MCACHE */
534static unsigned int mb_normalized; /* number of packets "normalized" */
535
536extern unsigned int mb_tag_mbuf;
537
538#define MB_GROWTH_AGGRESSIVE 1 /* Threshold: 1/2 of total */
539#define MB_GROWTH_NORMAL 2 /* Threshold: 3/4 of total */
540
541typedef enum {
542 MC_MBUF = 0, /* Regular mbuf */
543 MC_CL, /* Cluster */
544 MC_BIGCL, /* Large (4KB) cluster */
545 MC_16KCL, /* Jumbo (16KB) cluster */
546 MC_MBUF_CL, /* mbuf + cluster */
547 MC_MBUF_BIGCL, /* mbuf + large (4KB) cluster */
548 MC_MBUF_16KCL /* mbuf + jumbo (16KB) cluster */
549} mbuf_class_t;
550
551#define MBUF_CLASS_MIN MC_MBUF
552#define MBUF_CLASS_MAX MC_MBUF_16KCL
553#define MBUF_CLASS_LAST MC_16KCL
554#define MBUF_CLASS_VALID(c) \
555 ((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
556#define MBUF_CLASS_COMPOSITE(c) \
557 ((int)(c) > MBUF_CLASS_LAST)
558
559
560/*
561 * mbuf specific mcache allocation request flags.
562 */
563#define MCR_COMP MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
564
565/*
566 * Per-cluster slab structure.
567 *
568 * A slab is a cluster control structure that contains one or more object
569 * chunks; the available chunks are chained in the slab's freelist (sl_head).
570 * Each time a chunk is taken out of the slab, the slab's reference count
571 * gets incremented. When all chunks have been taken out, the empty slab
572 * gets removed (SLF_DETACHED) from the class's slab list. A chunk that is
573 * returned to a slab causes the slab's reference count to be decremented;
574 * it also causes the slab to be reinserted back to class's slab list, if
575 * it's not already done.
576 *
577 * Compartmentalizing of the object chunks into slabs allows us to easily
578 * merge one or more slabs together when the adjacent slabs are idle, as
579 * well as to convert or move a slab from one class to another; e.g. the
580 * mbuf cluster slab can be converted to a regular cluster slab when all
581 * mbufs in the slab have been freed.
582 *
583 * A slab may also span across multiple clusters for chunks larger than
584 * a cluster's size. In this case, only the slab of the first cluster is
585 * used. The rest of the slabs are marked with SLF_PARTIAL to indicate
586 * that they are part of the larger slab.
587 *
588 * Each slab controls a page of memory.
589 */
590typedef struct mcl_slab {
591 struct mcl_slab *sl_next; /* neighboring slab */
592 u_int8_t sl_class; /* controlling mbuf class */
593 int8_t sl_refcnt; /* outstanding allocations */
594 int8_t sl_chunks; /* chunks (bufs) in this slab */
595 u_int16_t sl_flags; /* slab flags (see below) */
596 u_int16_t sl_len; /* slab length */
597 void *sl_base; /* base of allocated memory */
598 void *sl_head; /* first free buffer */
599 TAILQ_ENTRY(mcl_slab) sl_link; /* next/prev slab on freelist */
600} mcl_slab_t;
601
602#define SLF_MAPPED 0x0001 /* backed by a mapped page */
603#define SLF_PARTIAL 0x0002 /* part of another slab */
604#define SLF_DETACHED 0x0004 /* not in slab freelist */
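/*
 * Simplified sketch (kept under "#if 0", not compiled) of how a chunk leaves
 * a slab, roughly mirroring the CONFIG_MBUF_MCACHE slab_alloc() path: pop the
 * chunk off sl_head, bump the reference count, and detach the slab from its
 * class list once it has no free chunks left.  The helper name is
 * hypothetical; mcache_obj_t and slab_remove() are declared further below
 * under CONFIG_MBUF_MCACHE, and the real code also validates pointers,
 * handles multi-page slabs and updates the per-class statistics.
 */
#if 0
static void *
example_slab_pop(mcl_slab_t *sp, mbuf_class_t class)
{
	mcache_obj_t *buf = sp->sl_head;

	if (buf == NULL) {
		return NULL;            /* slab has no free chunks */
	}
	sp->sl_head = buf->obj_next;    /* advance the slab freelist */
	buf->obj_next = NULL;
	sp->sl_refcnt++;                /* one more outstanding chunk */
	if (sp->sl_head == NULL) {
		slab_remove(sp, class); /* empty: take it off the class list */
	}
	return buf;
}
#endif /* 0 */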
605
606/*
 * The array of slabs is broken into groups of arrays, one per 1MB of kernel
608 * memory to reduce the footprint. Each group is allocated on demand
609 * whenever a new piece of memory mapped in from the VM crosses the 1MB
610 * boundary.
611 */
612#define NSLABSPMB ((1 << MBSHIFT) >> PAGE_SHIFT)
613
614typedef struct mcl_slabg {
615 mcl_slab_t *slg_slab; /* group of slabs */
616} mcl_slabg_t;
617
618/*
619 * Number of slabs needed to control a 16KB cluster object.
620 */
621#define NSLABSP16KB (M16KCLBYTES >> PAGE_SHIFT)
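/*
 * Illustrative sketch (kept under "#if 0", not compiled) of how the grouping
 * above is used: the page index of an address selects a 1MB slab group, and
 * the remainder selects the slab within that group, which is roughly what
 * slab_get() does.  The helper name is hypothetical; MTOPG() and slabstbl
 * are defined/declared further below under CONFIG_MBUF_MCACHE.
 */
#if 0
static mcl_slab_t *
example_addr_to_slab(void *addr)
{
	unsigned int pg  = MTOPG(addr);         /* page index from mbutl */
	unsigned int grp = pg / NSLABSPMB;      /* which 1MB slab group  */
	unsigned int off = pg % NSLABSPMB;      /* slab within the group */

	return &slabstbl[grp]->slg_slab[off];
}
#endif /* 0 */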
622
623#if CONFIG_MBUF_MCACHE
624/*
625 * Per-cluster audit structure.
626 */
627typedef struct {
628 mcache_audit_t **cl_audit; /* array of audits */
629} mcl_audit_t;
630
631typedef struct {
632 struct thread *msa_thread; /* thread doing transaction */
633 struct thread *msa_pthread; /* previous transaction thread */
634 uint32_t msa_tstamp; /* transaction timestamp (ms) */
635 uint32_t msa_ptstamp; /* prev transaction timestamp (ms) */
636 uint16_t msa_depth; /* pc stack depth */
637 uint16_t msa_pdepth; /* previous transaction pc stack */
638 void *msa_stack[MCACHE_STACK_DEPTH];
639 void *msa_pstack[MCACHE_STACK_DEPTH];
640} mcl_scratch_audit_t;
641
642typedef struct {
643 /*
644 * Size of data from the beginning of an mbuf that covers m_hdr,
645 * pkthdr and m_ext structures. If auditing is enabled, we allocate
646 * a shadow mbuf structure of this size inside each audit structure,
647 * and the contents of the real mbuf gets copied into it when the mbuf
648 * is freed. This allows us to pattern-fill the mbuf for integrity
649 * check, and to preserve any constructed mbuf fields (e.g. mbuf +
650 * cluster cache case). Note that we don't save the contents of
651 * clusters when they are freed; we simply pattern-fill them.
652 */
653 u_int8_t sc_mbuf[(_MSIZE - _MHLEN) + sizeof(_m_ext_t)];
654 mcl_scratch_audit_t sc_scratch __attribute__((aligned(8)));
655} mcl_saved_contents_t;
656
657#define AUDIT_CONTENTS_SIZE (sizeof (mcl_saved_contents_t))
658
659#define MCA_SAVED_MBUF_PTR(_mca) \
660 ((struct mbuf *)(void *)((mcl_saved_contents_t *) \
661 (_mca)->mca_contents)->sc_mbuf)
662#define MCA_SAVED_MBUF_SIZE \
663 (sizeof (((mcl_saved_contents_t *)0)->sc_mbuf))
664#define MCA_SAVED_SCRATCH_PTR(_mca) \
665 (&((mcl_saved_contents_t *)(_mca)->mca_contents)->sc_scratch)
666
667/*
668 * mbuf specific mcache audit flags
669 */
670#define MB_INUSE 0x01 /* object has not been returned to slab */
671#define MB_COMP_INUSE 0x02 /* object has not been returned to cslab */
672#define MB_SCVALID 0x04 /* object has valid saved contents */
673
674/*
 * Each of the following two arrays holds up to nmbclusters elements.
676 */
677static mcl_audit_t *mclaudit; /* array of cluster audit information */
678static unsigned int maxclaudit; /* max # of entries in audit table */
679static mcl_slabg_t **slabstbl; /* cluster slabs table */
680static unsigned int maxslabgrp; /* max # of entries in slabs table */
681static unsigned int slabgrp; /* # of entries in slabs table */
682#endif /* CONFIG_MBUF_MCACHE */
683
684/* Globals */
685int nclusters; /* # of clusters for non-jumbo (legacy) sizes */
686int njcl; /* # of clusters for jumbo sizes */
687int njclbytes; /* size of a jumbo cluster */
688unsigned char *mbutl; /* first mapped cluster address */
689unsigned char *embutl; /* ending virtual address of mclusters */
690int max_linkhdr; /* largest link-level header */
691int max_protohdr; /* largest protocol header */
692int max_hdr; /* largest link+protocol header */
693int max_datalen; /* MHLEN - max_hdr */
694
695#if CONFIG_MBUF_MCACHE
696static boolean_t mclverify; /* debug: pattern-checking */
697static boolean_t mcltrace; /* debug: stack tracing */
698static boolean_t mclfindleak; /* debug: leak detection */
699static boolean_t mclexpleak; /* debug: expose leak info to user space */
700
701static struct timeval mb_start; /* beginning of time */
702
703/* mbuf leak detection variables */
704static struct mleak_table mleak_table;
705static mleak_stat_t *mleak_stat;
706
707#define MLEAK_STAT_SIZE(n) \
708 __builtin_offsetof(mleak_stat_t, ml_trace[n])
709
710struct mallocation {
711 mcache_obj_t *element; /* the alloc'ed element, NULL if unused */
712 u_int32_t trace_index; /* mtrace index for corresponding backtrace */
713 u_int32_t count; /* How many objects were requested */
714 u_int64_t hitcount; /* for determining hash effectiveness */
715};
716
717struct mtrace {
718 u_int64_t collisions;
719 u_int64_t hitcount;
720 u_int64_t allocs;
721 u_int64_t depth;
722 uintptr_t addr[MLEAK_STACK_DEPTH];
723};
724
725/* Size must be a power of two for the zhash to be able to just mask off bits */
726#define MLEAK_ALLOCATION_MAP_NUM 512
727#define MLEAK_TRACE_MAP_NUM 256
728
729/*
 * Sample factor for how often to record a trace. This can be overridden
 * via the boot-arg mleak_sample_factor.
732 */
733#define MLEAK_SAMPLE_FACTOR 500
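/*
 * Illustrative sketch (kept under "#if 0", not compiled): because the bucket
 * counts above are powers of two, a bucket index can be derived from a hash
 * by masking instead of a modulo, and the sample factor simply thins out how
 * often an allocation gets recorded.  The helper names are hypothetical, and
 * the real code uses the (boot-arg adjustable) mleak_alloc_buckets /
 * mleak_table values rather than the compile-time defaults shown here.
 */
#if 0
static uint32_t
example_mleak_bucket(uint64_t hash)
{
	return (uint32_t)(hash & (MLEAK_ALLOCATION_MAP_NUM - 1));
}

static boolean_t
example_mleak_should_sample(uint64_t nth_allocation)
{
	return (nth_allocation % MLEAK_SAMPLE_FACTOR) == 0;
}
#endif /* 0 */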
734
735/*
736 * Number of top leakers recorded.
737 */
738#define MLEAK_NUM_TRACES 5
739
740#define MB_LEAK_SPACING_64 " "
741#define MB_LEAK_SPACING_32 " "
742
743
744#define MB_LEAK_HDR_32 "\n\
745 trace [1] trace [2] trace [3] trace [4] trace [5] \n\
746 ---------- ---------- ---------- ---------- ---------- \n\
747"
748
749#define MB_LEAK_HDR_64 "\n\
750 trace [1] trace [2] trace [3] \
751 trace [4] trace [5] \n\
752 ------------------ ------------------ ------------------ \
753 ------------------ ------------------ \n\
754"
755
756static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
757static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
758
759/* Hashmaps of allocations and their corresponding traces */
760static struct mallocation *mleak_allocations;
761static struct mtrace *mleak_traces;
762static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
763
764/* Lock to protect mleak tables from concurrent modification */
765static LCK_GRP_DECLARE(mleak_lock_grp, "mleak_lock");
766static LCK_MTX_DECLARE(mleak_lock_data, &mleak_lock_grp);
767static lck_mtx_t *const mleak_lock = &mleak_lock_data;
768
769/* *Failed* large allocations. */
770struct mtracelarge {
771 uint64_t size;
772 uint64_t depth;
773 uintptr_t addr[MLEAK_STACK_DEPTH];
774};
775
776#define MTRACELARGE_NUM_TRACES 5
777static struct mtracelarge mtracelarge_table[MTRACELARGE_NUM_TRACES];
778
779static void mtracelarge_register(size_t size);
780#endif /* CONFIG_MBUF_MCACHE */
781
782/* Lock to protect the completion callback table */
783static LCK_GRP_DECLARE(mbuf_tx_compl_tbl_lck_grp, "mbuf_tx_compl_tbl");
784LCK_RW_DECLARE(mbuf_tx_compl_tbl_lock, &mbuf_tx_compl_tbl_lck_grp);
785
786extern u_int32_t high_sb_max;
787
788/* The minimum number of objects that are allocated, to start. */
789#define MINCL 32
790#define MINBIGCL (MINCL >> 1)
791#define MIN16KCL (MINCL >> 2)
792
793/* Low watermarks (only map in pages once free counts go below) */
794#define MBIGCL_LOWAT MINBIGCL
795#define M16KCL_LOWAT MIN16KCL
796
797typedef struct {
798 mbuf_class_t mtbl_class; /* class type */
799#if CONFIG_MBUF_MCACHE
800 mcache_t *mtbl_cache; /* mcache for this buffer class */
801 TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
802 mcache_obj_t *mtbl_cobjlist; /* composite objects freelist */
803#endif /* CONFIG_MBUF_MCACHE */
804 mb_class_stat_t *mtbl_stats; /* statistics fetchable via sysctl */
805 u_int32_t mtbl_maxsize; /* maximum buffer size */
806 int mtbl_minlimit; /* minimum allowed */
807 int mtbl_maxlimit; /* maximum allowed */
808 u_int32_t mtbl_wantpurge; /* purge during next reclaim */
809 uint32_t mtbl_avgtotal; /* average total on iOS */
810 u_int32_t mtbl_expand; /* worker should expand the class */
811} mbuf_table_t;
812
813#define m_class(c) mbuf_table[c].mtbl_class
814#if CONFIG_MBUF_MCACHE
815#define m_cache(c) mbuf_table[c].mtbl_cache
816#define m_slablist(c) mbuf_table[c].mtbl_slablist
817#define m_cobjlist(c) mbuf_table[c].mtbl_cobjlist
818#else
819#define m_stats(c) mbuf_table[c].mtbl_stats
820#endif /* CONFIG_MBUF_MCACHE */
821#define m_maxsize(c) mbuf_table[c].mtbl_maxsize
822#define m_minlimit(c) mbuf_table[c].mtbl_minlimit
823#define m_maxlimit(c) mbuf_table[c].mtbl_maxlimit
824#define m_wantpurge(c) mbuf_table[c].mtbl_wantpurge
825#define m_cname(c) mbuf_table[c].mtbl_stats->mbcl_cname
826#define m_size(c) mbuf_table[c].mtbl_stats->mbcl_size
827#define m_total(c) mbuf_table[c].mtbl_stats->mbcl_total
828#define m_active(c) mbuf_table[c].mtbl_stats->mbcl_active
829#define m_infree(c) mbuf_table[c].mtbl_stats->mbcl_infree
830#define m_slab_cnt(c) mbuf_table[c].mtbl_stats->mbcl_slab_cnt
831#define m_alloc_cnt(c) mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
832#define m_free_cnt(c) mbuf_table[c].mtbl_stats->mbcl_free_cnt
833#define m_notified(c) mbuf_table[c].mtbl_stats->mbcl_notified
834#define m_purge_cnt(c) mbuf_table[c].mtbl_stats->mbcl_purge_cnt
835#define m_fail_cnt(c) mbuf_table[c].mtbl_stats->mbcl_fail_cnt
836#define m_ctotal(c) mbuf_table[c].mtbl_stats->mbcl_ctotal
837#define m_release_cnt(c) mbuf_table[c].mtbl_stats->mbcl_release_cnt
838#define m_region_expand(c) mbuf_table[c].mtbl_expand
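/*
 * Illustrative sketch (kept under "#if 0", not compiled): the accessor macros
 * above read and update the per-class statistics stored in mbuf_table.  For
 * example, a rough estimate of the memory attributable to one rudimentary
 * class is its element size times its total element count.  The helper name
 * is hypothetical; callers would hold mbuf_mlock, since the statistics are
 * protected by the global mbuf lock, and composite classes share memory with
 * their backing classes, so such figures must not simply be summed.
 */
#if 0
static uint64_t
example_class_footprint(mbuf_class_t class)
{
	return (uint64_t)m_size(class) * (uint64_t)m_total(class);
}
#endif /* 0 */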
839
840static mbuf_table_t mbuf_table[] = {
841#if CONFIG_MBUF_MCACHE
842 /*
843 * The caches for mbufs, regular clusters and big clusters.
844 * The average total values were based on data gathered by actual
845 * usage patterns on iOS.
846 */
847 { MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
848 NULL, NULL, 0, 0, 0, 0, 3000, 0 },
849 { MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
850 NULL, NULL, 0, 0, 0, 0, 2000, 0 },
851 { MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
852 NULL, NULL, 0, 0, 0, 0, 1000, 0 },
853 { MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
854 NULL, NULL, 0, 0, 0, 0, 200, 0 },
855 /*
856 * The following are special caches; they serve as intermediate
857 * caches backed by the above rudimentary caches. Each object
858 * in the cache is an mbuf with a cluster attached to it. Unlike
859 * the above caches, these intermediate caches do not directly
860 * deal with the slab structures; instead, the constructed
861 * cached elements are simply stored in the freelists.
862 */
863 { MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 2000, 0 },
864 { MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 1000, 0 },
865 { MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0, 200, 0 },
866#else
867 { .mtbl_class = MC_MBUF },
868 { .mtbl_class = MC_CL },
869 { .mtbl_class = MC_BIGCL },
870 { .mtbl_class = MC_16KCL },
871 { .mtbl_class = MC_MBUF_CL },
872 { .mtbl_class = MC_MBUF_BIGCL },
873 { .mtbl_class = MC_MBUF_16KCL },
874#endif /* CONFIG_MBUF_MCACHE */
875};
876
877#define NELEM(a) (sizeof (a) / sizeof ((a)[0]))
878
879#if SKYWALK && CONFIG_MBUF_MCACHE
880#define MC_THRESHOLD_SCALE_DOWN_FACTOR 2
881static unsigned int mc_threshold_scale_down_factor =
882 MC_THRESHOLD_SCALE_DOWN_FACTOR;
#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
884
885#if CONFIG_MBUF_MCACHE
886static uint32_t
887m_avgtotal(mbuf_class_t c)
888{
889#if SKYWALK
890 return if_is_fsw_transport_netagent_enabled() ?
891 (mbuf_table[c].mtbl_avgtotal / mc_threshold_scale_down_factor) :
892 mbuf_table[c].mtbl_avgtotal;
893#else /* !SKYWALK */
894 return mbuf_table[c].mtbl_avgtotal;
895#endif /* SKYWALK */
896}
897#endif /* CONFIG_MBUF_MCACHE */
898
899#if CONFIG_MBUF_MCACHE
900static void *mb_waitchan = &mbuf_table; /* wait channel for all caches */
901static int mb_waiters; /* number of waiters */
902#endif /* CONFIG_MBUF_MCACHE */
903
904#define MB_WDT_MAXTIME 10 /* # of secs before watchdog panic */
905#if CONFIG_MBUF_MCACHE
906static struct timeval mb_wdtstart; /* watchdog start timestamp */
907static char *mbuf_dump_buf;
908
909#define MBUF_DUMP_BUF_SIZE 4096
910
911/*
 * The mbuf watchdog is enabled by default. It can also be toggled via the
913 * kern.ipc.mb_watchdog sysctl.
914 * Garbage collection is enabled by default on embedded platforms.
915 * mb_drain_maxint controls the amount of time to wait (in seconds) before
916 * consecutive calls to mbuf_drain().
917 */
918static unsigned int mb_watchdog = 1;
919#if !XNU_TARGET_OS_OSX
920static unsigned int mb_drain_maxint = 60;
921#else /* XNU_TARGET_OS_OSX */
922static unsigned int mb_drain_maxint = 0;
923#endif /* XNU_TARGET_OS_OSX */
924#endif /* CONFIG_MBUF_MCACHE */
925static unsigned int mb_memory_pressure_percentage = 80;
926
927uintptr_t mb_obscure_extfree __attribute__((visibility("hidden")));
928uintptr_t mb_obscure_extref __attribute__((visibility("hidden")));
929
930/* Red zone */
931static u_int32_t mb_redzone_cookie;
932static void m_redzone_init(struct mbuf *);
933static void m_redzone_verify(struct mbuf *m);
934
935static void m_set_rfa(struct mbuf *, struct ext_ref *);
936
937#if CONFIG_MBUF_MCACHE
938/* The following are used to serialize m_clalloc() */
939static boolean_t mb_clalloc_busy;
940static void *mb_clalloc_waitchan = &mb_clalloc_busy;
941static int mb_clalloc_waiters;
942#endif /* CONFIG_MBUF_MCACHE */
943
944static void mbuf_mtypes_sync(boolean_t);
945static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
946static void mbuf_stat_sync(void);
947static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
948#if CONFIG_MBUF_MCACHE
949static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
950static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
951static char *mbuf_dump(void);
952#endif /* CONFIG_MBUF_MCACHE */
953static void mbuf_table_init(void);
954static inline void m_incref(struct mbuf *);
955static inline u_int16_t m_decref(struct mbuf *);
956static void mbuf_watchdog_defunct(thread_call_param_t, thread_call_param_t);
957#if CONFIG_MBUF_MCACHE
958static int m_clalloc(const u_int32_t, const int, const u_int32_t);
959static void mbuf_worker_thread_init(void);
960static mcache_obj_t *slab_alloc(mbuf_class_t, int);
961static void slab_free(mbuf_class_t, mcache_obj_t *);
962static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
963 unsigned int, int);
964static void mbuf_slab_free(void *, mcache_obj_t *, int);
965static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
966static void mbuf_slab_notify(void *, u_int32_t);
967static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
968 unsigned int);
969static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
970static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
971 unsigned int, int);
972static void mbuf_cslab_free(void *, mcache_obj_t *, int);
973static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
974static int freelist_populate(mbuf_class_t, unsigned int, int);
975static void freelist_init(mbuf_class_t);
976static boolean_t mbuf_cached_above(mbuf_class_t, int);
977static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
978static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
979static int m_howmany(int, size_t);
980static void mbuf_worker_thread(void);
981static void mbuf_watchdog(void);
982static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
983
984static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
985 size_t, unsigned int);
986static void mcl_audit_free(void *, unsigned int);
987static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
988static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
989static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
990 boolean_t);
991static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
992static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
993static void mcl_audit_scratch(mcache_audit_t *);
994static void mcl_audit_mcheck_panic(struct mbuf *);
995static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
996
997static void mleak_activate(void);
998static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
999static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
1000static void mleak_free(mcache_obj_t *);
1001static void mleak_sort_traces(void);
1002static void mleak_update_stats(void);
1003
1004static mcl_slab_t *slab_get(void *);
1005static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
1006 void *, void *, unsigned int, int, int);
1007static void slab_insert(mcl_slab_t *, mbuf_class_t);
1008static void slab_remove(mcl_slab_t *, mbuf_class_t);
1009static boolean_t slab_inrange(mcl_slab_t *, void *);
1010static void slab_nextptr_panic(mcl_slab_t *, void *);
1011static void slab_detach(mcl_slab_t *);
1012static boolean_t slab_is_detached(mcl_slab_t *);
1013#else /* !CONFIG_MBUF_MCACHE */
1014static void mbuf_watchdog_drain_composite(thread_call_param_t, thread_call_param_t);
1015static struct mbuf *mz_alloc(zalloc_flags_t);
1016static void mz_free(struct mbuf *);
1017static struct ext_ref *mz_ref_alloc(zalloc_flags_t);
1018static void mz_ref_free(struct ext_ref *);
1019static void *mz_cl_alloc(zone_id_t, zalloc_flags_t);
1020static void mz_cl_free(zone_id_t, void *);
1021static struct mbuf *mz_composite_alloc(mbuf_class_t, zalloc_flags_t);
1022static zstack_t mz_composite_alloc_n(mbuf_class_t, unsigned int, zalloc_flags_t);
1023static void mz_composite_free(mbuf_class_t, struct mbuf *);
1024static void mz_composite_free_n(mbuf_class_t, zstack_t);
1025static void *mz_composite_build(zone_id_t, zalloc_flags_t);
1026static void *mz_composite_mark_valid(zone_id_t, void *);
1027static void *mz_composite_mark_invalid(zone_id_t, void *);
1028static void mz_composite_destroy(zone_id_t, void *);
1029
1030ZONE_DEFINE_ID(ZONE_ID_MBUF_REF, "mbuf.ref", struct ext_ref,
1031 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1032ZONE_DEFINE_ID(ZONE_ID_MBUF, "mbuf", struct mbuf,
1033 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE);
1034ZONE_DEFINE_ID(ZONE_ID_CLUSTER_2K, "mbuf.cluster.2k", union mcluster,
1035 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1036ZONE_DEFINE_ID(ZONE_ID_CLUSTER_4K, "mbuf.cluster.4k", union mbigcluster,
1037 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1038ZONE_DEFINE_ID(ZONE_ID_CLUSTER_16K, "mbuf.cluster.16k", union m16kcluster,
1039 ZC_CACHING | ZC_NOPGZ | ZC_KASAN_NOQUARANTINE | ZC_DATA);
1040static_assert(sizeof(union mcluster) == MCLBYTES);
1041static_assert(sizeof(union mbigcluster) == MBIGCLBYTES);
1042static_assert(sizeof(union m16kcluster) == M16KCLBYTES);
1043
1044static const struct zone_cache_ops mz_composite_ops = {
1045 .zc_op_alloc = mz_composite_build,
1046 .zc_op_mark_valid = mz_composite_mark_valid,
1047 .zc_op_mark_invalid = mz_composite_mark_invalid,
1048 .zc_op_free = mz_composite_destroy,
1049};
1050ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_2K, "mbuf.composite.2k", struct mbuf,
1051 sizeof(struct mbuf) + sizeof(struct ext_ref) + MCLBYTES,
1052 &mz_composite_ops);
1053ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_4K, "mbuf.composite.4k", struct mbuf,
1054 sizeof(struct mbuf) + sizeof(struct ext_ref) + MBIGCLBYTES,
1055 &mz_composite_ops);
1056ZCACHE_DEFINE(ZONE_ID_MBUF_CLUSTER_16K, "mbuf.composite.16k", struct mbuf,
1057 sizeof(struct mbuf) + sizeof(struct ext_ref) + M16KCLBYTES,
1058 &mz_composite_ops);
1059static_assert(ZONE_ID_MBUF + MC_MBUF == ZONE_ID_MBUF);
1060static_assert(ZONE_ID_MBUF + MC_CL == ZONE_ID_CLUSTER_2K);
1061static_assert(ZONE_ID_MBUF + MC_BIGCL == ZONE_ID_CLUSTER_4K);
1062static_assert(ZONE_ID_MBUF + MC_16KCL == ZONE_ID_CLUSTER_16K);
1063static_assert(ZONE_ID_MBUF + MC_MBUF_CL == ZONE_ID_MBUF_CLUSTER_2K);
1064static_assert(ZONE_ID_MBUF + MC_MBUF_BIGCL == ZONE_ID_MBUF_CLUSTER_4K);
1065static_assert(ZONE_ID_MBUF + MC_MBUF_16KCL == ZONE_ID_MBUF_CLUSTER_16K);
1066
/* Converts an mbuf class to a zalloc zone ID. */
1068__attribute__((always_inline))
1069static inline zone_id_t
1070m_class_to_zid(mbuf_class_t class)
1071{
1072 return ZONE_ID_MBUF + class - MC_MBUF;
1073}
1074
1075__attribute__((always_inline))
1076static inline mbuf_class_t
1077m_class_from_zid(zone_id_t zid)
1078{
1079 return MC_MBUF + zid - ZONE_ID_MBUF;
1080}
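
/*
 * Illustrative sketch (kept under "#if 0", not compiled): the class <-> zone
 * ID mapping is a plain offset, so it round-trips, and the static_asserts
 * above pin down each pairing, e.g. MC_BIGCL <-> ZONE_ID_CLUSTER_4K.  The
 * function name is hypothetical.
 */
#if 0
static void
example_class_zid_roundtrip(void)
{
	zone_id_t zid = m_class_to_zid(MC_BIGCL);

	assert(zid == ZONE_ID_CLUSTER_4K);
	assert(m_class_from_zid(zid) == MC_BIGCL);
}
#endif /* 0 */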
1081
1082static thread_call_t mbuf_defunct_tcall;
1083static thread_call_t mbuf_drain_tcall;
1084#endif /* CONFIG_MBUF_MCACHE */
1085
1086static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
1087static struct mbuf *m_split0(struct mbuf *, int, int, int);
1088#if CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT)
1089#define mbwdog_logger(fmt, ...) _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
1090static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
1091static char *mbwdog_logging;
1092const unsigned mbwdog_logging_size = 4096;
1093static size_t mbwdog_logging_used;
1094#else
1095#define mbwdog_logger(fmt, ...) do { } while (0)
#endif /* CONFIG_MBUF_MCACHE && (DEBUG || DEVELOPMENT) */
1097#if CONFIG_MBUF_MCACHE
1098static void mbuf_drain_locked(boolean_t);
1099#endif /* CONFIG_MBUF_MCACHE */
1100
1101/* flags for m_copyback0 */
1102#define M_COPYBACK0_COPYBACK 0x0001 /* copyback from cp */
1103#define M_COPYBACK0_PRESERVE 0x0002 /* preserve original data */
1104#define M_COPYBACK0_COW 0x0004 /* do copy-on-write */
1105#define M_COPYBACK0_EXTEND 0x0008 /* extend chain */
1106
1107/*
1108 * This flag is set for all mbufs that come out of and into the composite
1109 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL. mbufs that
1110 * are marked with such a flag have clusters attached to them, and will be
1111 * treated differently when they are freed; instead of being placed back
1112 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
1113 * are placed back into the appropriate composite cache's freelist, and the
1114 * actual freeing is deferred until the composite objects are purged. At
1115 * such a time, this flag will be cleared from the mbufs and the objects
1116 * will be freed into their own separate freelists.
1117 */
1118#define EXTF_COMPOSITE 0x1
1119
1120/*
1121 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf. Once set, this flag is never
1123 * cleared.
1124 */
1125#define EXTF_READONLY 0x2
1126/*
1127 * This flag indicates that the external cluster is paired with the mbuf.
 * Pairing implies an external free routine is defined, which will be invoked
1129 * when the reference count drops to the minimum at m_free time. This
1130 * flag is never cleared.
1131 */
1132#define EXTF_PAIRED 0x4
1133
1134#define EXTF_MASK \
1135 (EXTF_COMPOSITE | EXTF_READONLY | EXTF_PAIRED)
1136
1137#define MEXT_MINREF(m) ((m_get_rfa(m))->minref)
1138#define MEXT_REF(m) ((m_get_rfa(m))->refcnt)
1139#define MEXT_PREF(m) ((m_get_rfa(m))->prefcnt)
1140#define MEXT_FLAGS(m) ((m_get_rfa(m))->flags)
1141#define MEXT_PRIV(m) ((m_get_rfa(m))->priv)
1142#define MEXT_PMBUF(m) ((m_get_rfa(m))->paired)
1143#define MEXT_TOKEN(m) ((m_get_rfa(m))->ext_token)
1144#define MBUF_IS_COMPOSITE(m) \
1145 (MEXT_REF(m) == MEXT_MINREF(m) && \
1146 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
1147/*
1148 * This macro can be used to test if the mbuf is paired to an external
1149 * cluster. The test for MEXT_PMBUF being equal to the mbuf in subject
1150 * is important, as EXTF_PAIRED alone is insufficient since it is immutable,
1151 * and thus survives calls to m_free_paired.
1152 */
1153#define MBUF_IS_PAIRED(m) \
1154 (((m)->m_flags & M_EXT) && \
1155 (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_PAIRED && \
1156 MEXT_PMBUF(m) == (m))
1157
1158/*
1159 * Macros used to verify the integrity of the mbuf.
1160 */
1161#if CONFIG_MBUF_MCACHE
1162#define _MCHECK(m) { \
1163 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
1164 if (mclaudit == NULL) \
1165 panic("MCHECK: m_type=%d m=%p", \
1166 (u_int16_t)(m)->m_type, m); \
1167 else \
1168 mcl_audit_mcheck_panic(m); \
1169 } \
1170}
1171#else
1172#define _MCHECK(m) \
1173 if ((m)->m_type != MT_FREE && !MBUF_IS_PAIRED(m)) { \
1174 panic("MCHECK: m_type=%d m=%p", \
1175 (u_int16_t)(m)->m_type, m); \
1176 }
1177#endif /* CONFIG_MBUF_MCACHE */
1178
1179/*
1180 * Macro version of mtod.
1181 */
1182#define MTOD(m, t) ((t)((m)->m_data))
1183
1184#if CONFIG_MBUF_MCACHE
1185#define MBUF_IN_MAP(addr) \
1186 ((unsigned char *)(addr) >= mbutl && \
1187 (unsigned char *)(addr) < embutl)
1188
1189#define MRANGE(addr) { \
1190 if (!MBUF_IN_MAP(addr)) \
1191 panic("MRANGE: address out of range 0x%p", addr); \
1192}
1193
1194/*
1195 * Macros to obtain page index given a base cluster address
1196 */
1197#define MTOPG(x) (((unsigned char *)x - mbutl) >> PAGE_SHIFT)
1198#define PGTOM(x) (mbutl + (x << PAGE_SHIFT))
1199
1200/*
1201 * Macro to find the mbuf index relative to a base.
1202 */
1203#define MBPAGEIDX(c, m) \
1204 (((unsigned char *)(m) - (unsigned char *)(c)) >> _MSIZESHIFT)
1205
1206/*
1207 * Same thing for 2KB cluster index.
1208 */
1209#define CLPAGEIDX(c, m) \
1210 (((unsigned char *)(m) - (unsigned char *)(c)) >> MCLSHIFT)
1211
1212/*
1213 * Macro to find 4KB cluster index relative to a base
1214 */
1215#define BCLPAGEIDX(c, m) \
1216 (((unsigned char *)(m) - (unsigned char *)(c)) >> MBIGCLSHIFT)
1217#endif /* CONFIG_MBUF_MCACHE */
1218
1219/*
1220 * Macros used during mbuf and cluster initialization.
1221 */
1222#define MBUF_INIT_PKTHDR(m) { \
1223 (m)->m_pkthdr.rcvif = NULL; \
1224 (m)->m_pkthdr.pkt_hdr = NULL; \
1225 (m)->m_pkthdr.len = 0; \
1226 (m)->m_pkthdr.csum_flags = 0; \
1227 (m)->m_pkthdr.csum_data = 0; \
1228 (m)->m_pkthdr.vlan_tag = 0; \
1229 (m)->m_pkthdr.comp_gencnt = 0; \
1230 (m)->m_pkthdr.pkt_crumbs = 0; \
1231 m_classifier_init(m, 0); \
1232 m_tag_init(m, 1); \
1233 m_scratch_init(m); \
1234 m_redzone_init(m); \
1235}
1236
1237#define MBUF_INIT(m, pkthdr, type) { \
1238 _MCHECK(m); \
1239 (m)->m_next = (m)->m_nextpkt = NULL; \
1240 (m)->m_len = 0; \
1241 (m)->m_type = type; \
1242 if ((pkthdr) == 0) { \
1243 (m)->m_data = (uintptr_t)(m)->m_dat; \
1244 (m)->m_flags = 0; \
1245 } else { \
1246 (m)->m_data = (uintptr_t)(m)->m_pktdat; \
1247 (m)->m_flags = M_PKTHDR; \
1248 MBUF_INIT_PKTHDR(m); \
1249 } \
1250}
1251
1252#define MEXT_INIT mext_init
1253
1254#define MBUF_CL_INIT(m, buf, rfa, ref, flag) \
1255 MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, 0, \
1256 ref, 0, flag, 0, NULL)
1257
1258#define MBUF_BIGCL_INIT(m, buf, rfa, ref, flag) \
1259 MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, 0, \
1260 ref, 0, flag, 0, NULL)
1261
1262#define MBUF_16KCL_INIT(m, buf, rfa, ref, flag) \
1263 MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, 0, \
1264 ref, 0, flag, 0, NULL)
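
/*
 * Hypothetical usage sketch (kept under "#if 0", not compiled) showing how
 * the initialization macros above fit together: a raw mbuf, whose m_type is
 * expected to be MT_FREE (which _MCHECK() in MBUF_INIT() checks), is set
 * up as a packet-header mbuf and then has a 2KB cluster attached through its
 * ext_ref.  The function and parameter names are illustrative only.
 */
#if 0
static void
example_init_mbuf_with_cluster(struct mbuf *m, void *cl, struct ext_ref *rfa)
{
	/* lay out the mbuf as a packet-header mbuf of type MT_DATA */
	MBUF_INIT(m, 1, MT_DATA);
	/* attach the 2KB cluster with an initial reference count of 1 */
	MBUF_CL_INIT(m, cl, rfa, 1, 0);
}
#endif /* 0 */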
1265
1266/*
1267 * Macro to convert BSD malloc sleep flag to mcache's
1268 */
1269#define MSLEEPF(f) ((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
1270
1271/*
1272 * The structure that holds all mbuf class statistics exportable via sysctl.
1273 * Similar to mbstat structure, the mb_stat structure is protected by the
1274 * global mbuf lock. It contains additional information about the classes
1275 * that allows for a more accurate view of the state of the allocator.
1276 */
1277struct mb_stat *mb_stat;
1278struct omb_stat *omb_stat; /* For backwards compatibility */
1279
1280#define MB_STAT_SIZE(n) \
1281 __builtin_offsetof(mb_stat_t, mbs_class[n])
1282#define OMB_STAT_SIZE(n) \
1283 __builtin_offsetof(struct omb_stat, mbs_class[n])
1284
1285/*
1286 * The legacy structure holding all of the mbuf allocation statistics.
1287 * The actual statistics used by the kernel are stored in the mbuf_table
1288 * instead, and are updated atomically while the global mbuf lock is held.
1289 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
1290 * Unlike before, the kernel no longer relies on the contents of mbstat for
1291 * its operations (e.g. cluster expansion) because the structure is exposed
1292 * to outside and could possibly be modified, therefore making it unsafe.
1293 * With the exception of the mbstat.m_mtypes array (see below), all of the
1294 * statistics are updated as they change.
1295 */
1296struct mbstat mbstat;
1297
1298#define MBSTAT_MTYPES_MAX \
1299 (sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
1300
1301/*
1302 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
1303 * atomically and stored in a per-CPU structure which is lock-free; this is
1304 * done in order to avoid writing to the global mbstat data structure which
1305 * would cause false sharing. During sysctl request for kern.ipc.mbstat,
1306 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
1307 * array and returned to the application. Any updates for types greater or
1308 * equal than MT_MAX would be done atomically to the mbstat; this slows down
1309 * performance but is okay since the kernel uses only up to MT_MAX-1 while
1310 * anything beyond that (up to type 255) is considered a corner case.
1311 */
1312typedef struct {
1313 unsigned int cpu_mtypes[MT_MAX];
1314} mbuf_mtypes_t;
1315
1316static mbuf_mtypes_t PERCPU_DATA(mbuf_mtypes);
1317
1318#define mtype_stat_add(type, n) { \
1319 if ((unsigned)(type) < MT_MAX) { \
1320 mbuf_mtypes_t *mbs = PERCPU_GET(mbuf_mtypes); \
1321 os_atomic_add(&mbs->cpu_mtypes[type], n, relaxed); \
1322 } else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) { \
1323 os_atomic_add((int16_t *)&mbstat.m_mtypes[type], n, relaxed); \
1324 } \
1325}
1326
1327#define mtype_stat_sub(t, n) mtype_stat_add(t, -(n))
1328#define mtype_stat_inc(t) mtype_stat_add(t, 1)
1329#define mtype_stat_dec(t) mtype_stat_sub(t, 1)
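
/*
 * Illustrative sketch (kept under "#if 0", not compiled): a type change on an
 * in-use mbuf is accounted for by decrementing the old type's counter and
 * incrementing the new one's; the per-CPU counters are only folded back into
 * mbstat.m_mtypes when a sysctl reader asks for them (see mbuf_mtypes_sync()
 * below).  The helper name is hypothetical.
 */
#if 0
static void
example_account_type_change(struct mbuf *m, int new_type)
{
	mtype_stat_dec(m->m_type);
	mtype_stat_inc(new_type);
	m->m_type = new_type;
}
#endif /* 0 */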
1330
1331static inline void
1332mext_init(struct mbuf *m, void *__sized_by(size)buf, u_int size,
1333 m_ext_free_func_t free, caddr_t free_arg, struct ext_ref *rfa,
1334 u_int16_t min, u_int16_t ref, u_int16_t pref, u_int16_t flag,
1335 u_int32_t priv, struct mbuf *pm)
1336{
1337 m->m_ext.ext_buf = buf;
1338 m->m_ext.ext_size = size;
1339 m->m_data = (uintptr_t)m->m_ext.ext_buf;
1340 m->m_len = 0;
1341 m->m_flags |= M_EXT;
1342 m_set_ext(m, rfa, free, free_arg);
1343 MEXT_MINREF(m) = min;
1344 MEXT_REF(m) = ref;
1345 MEXT_PREF(m) = pref;
1346 MEXT_FLAGS(m) = flag;
1347 MEXT_PRIV(m) = priv;
1348 MEXT_PMBUF(m) = pm;
1349}
1350
1351static void
1352mbuf_mtypes_sync(boolean_t locked)
1353{
1354 mbuf_mtypes_t mtc;
1355
1356 if (locked) {
1357 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1358 }
1359
1360 mtc = *PERCPU_GET_MASTER(mbuf_mtypes);
1361 percpu_foreach_secondary(mtype, mbuf_mtypes) {
1362 for (int n = 0; n < MT_MAX; n++) {
1363 mtc.cpu_mtypes[n] += mtype->cpu_mtypes[n];
1364 }
1365 }
1366
1367 if (!locked) {
		lck_mtx_lock(mbuf_mlock);
1369 }
1370 for (int n = 0; n < MT_MAX; n++) {
1371 mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
1372 }
1373 if (!locked) {
		lck_mtx_unlock(mbuf_mlock);
1375 }
1376}
1377
1378static int
1379mbstat_sysctl SYSCTL_HANDLER_ARGS
1380{
1381#pragma unused(oidp, arg1, arg2)
1382
1383#if CONFIG_MBUF_MCACHE
1384 mbuf_mtypes_sync(FALSE);
1385#else
	lck_mtx_lock(mbuf_mlock);
1387 mbuf_stat_sync();
1388 mbuf_mtypes_sync(TRUE);
	lck_mtx_unlock(mbuf_mlock);
1390#endif
1391
1392 return SYSCTL_OUT(req, &mbstat, sizeof(mbstat));
1393}
1394
1395static void
1396mbuf_stat_sync(void)
1397{
1398 mb_class_stat_t *sp;
1399#if CONFIG_MBUF_MCACHE
1400 mcache_cpu_t *ccp;
1401 mcache_t *cp;
1402 int k, m, bktsize;
1403#else
1404 int k;
1405 uint64_t drops = 0;
1406#endif /* CONFIG_MBUF_MCACHE */
1407
1408
1409 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1410
1411#if CONFIG_MBUF_MCACHE
1412 for (k = 0; k < NELEM(mbuf_table); k++) {
1413 cp = m_cache(k);
1414 ccp = &cp->mc_cpu[0];
1415 bktsize = ccp->cc_bktsize;
1416 sp = mbuf_table[k].mtbl_stats;
1417
1418 if (cp->mc_flags & MCF_NOCPUCACHE) {
1419 sp->mbcl_mc_state = MCS_DISABLED;
1420 } else if (cp->mc_purge_cnt > 0) {
1421 sp->mbcl_mc_state = MCS_PURGING;
1422 } else if (bktsize == 0) {
1423 sp->mbcl_mc_state = MCS_OFFLINE;
1424 } else {
1425 sp->mbcl_mc_state = MCS_ONLINE;
1426 }
1427
1428 sp->mbcl_mc_cached = 0;
1429 for (m = 0; m < ncpu; m++) {
1430 ccp = &cp->mc_cpu[m];
1431 if (ccp->cc_objs > 0) {
1432 sp->mbcl_mc_cached += ccp->cc_objs;
1433 }
1434 if (ccp->cc_pobjs > 0) {
1435 sp->mbcl_mc_cached += ccp->cc_pobjs;
1436 }
1437 }
1438 sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
1439 sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
1440 sp->mbcl_infree;
1441
1442 sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
1443 sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
1444 sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
1445
1446 /* Calculate total count specific to each class */
1447 sp->mbcl_ctotal = sp->mbcl_total;
1448 switch (m_class(k)) {
1449 case MC_MBUF:
1450 /* Deduct mbufs used in composite caches */
1451 sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL) + m_total(MC_MBUF_16KCL));
1453 break;
1454
1455 case MC_CL:
1456 /* Deduct clusters used in composite cache */
1457 sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
1458 break;
1459
1460 case MC_BIGCL:
1461 /* Deduct clusters used in composite cache */
1462 sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
1463 break;
1464
1465 case MC_16KCL:
1466 /* Deduct clusters used in composite cache */
1467 sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
1468 break;
1469
1470 default:
1471 break;
1472 }
1473 }
1474#else
1475 for (k = 0; k < NELEM(mbuf_table); k++) {
1476 const zone_id_t zid = m_class_to_zid(m_class(k));
1477 const zone_t zone = zone_by_id(zid);
1478 struct zone_basic_stats stats = {};
1479
1480 sp = m_stats(k);
		zone_get_stats(zone, &stats);
1482 drops += stats.zbs_alloc_fail;
1483 sp->mbcl_total = stats.zbs_avail;
1484 sp->mbcl_active = stats.zbs_alloc;
1485 /*
1486 * infree is what mcache considers the freelist (uncached)
1487 * free_cnt contains all the cached/uncached elements
1488 * in a zone.
1489 */
1490 sp->mbcl_infree = stats.zbs_free - stats.zbs_cached;
1491 sp->mbcl_fail_cnt = stats.zbs_alloc_fail;
1492 sp->mbcl_ctotal = sp->mbcl_total;
1493
1494 /* These stats are not available in zalloc. */
1495 sp->mbcl_alloc_cnt = 0;
1496 sp->mbcl_free_cnt = 0;
1497 sp->mbcl_notified = 0;
1498 sp->mbcl_purge_cnt = 0;
1499 sp->mbcl_slab_cnt = 0;
1500 sp->mbcl_release_cnt = 0;
1501
1502 /* zalloc caches are always on. */
1503 sp->mbcl_mc_state = MCS_ONLINE;
1504 sp->mbcl_mc_cached = stats.zbs_cached;
1505 /* These stats are not collected by zalloc. */
1506 sp->mbcl_mc_waiter_cnt = 0;
1507 sp->mbcl_mc_wretry_cnt = 0;
1508 sp->mbcl_mc_nwretry_cnt = 0;
1509 }
1510 /* Deduct clusters used in composite cache */
1511 m_ctotal(MC_MBUF) -= (m_total(MC_MBUF_CL) +
	    m_total(MC_MBUF_BIGCL) +
1513 m_total(MC_MBUF_16KCL));
1514 m_ctotal(MC_CL) -= m_total(MC_MBUF_CL);
1515 m_ctotal(MC_BIGCL) -= m_total(MC_MBUF_BIGCL);
1516 m_ctotal(MC_16KCL) -= m_total(MC_MBUF_16KCL);
1517
1518 /* Update mbstat. */
1519 mbstat.m_mbufs = m_total(MC_MBUF);
1520 mbstat.m_clusters = m_total(MC_CL);
1521 mbstat.m_clfree = m_infree(MC_CL) + m_infree(MC_MBUF_CL);
1522 mbstat.m_drops = drops;
1523 mbstat.m_bigclusters = m_total(MC_BIGCL);
1524 mbstat.m_bigclfree = m_infree(MC_BIGCL) + m_infree(MC_MBUF_BIGCL);
1525#endif /* CONFIG_MBUF_MCACHE */
1526}
1527
1528static int
1529mb_stat_sysctl SYSCTL_HANDLER_ARGS
1530{
1531#pragma unused(oidp, arg1, arg2)
1532 void *statp;
1533 int k, statsz, proc64 = proc_is64bit(req->p);
1534
1535 lck_mtx_lock(mbuf_mlock);
1536 mbuf_stat_sync();
1537
1538 if (!proc64) {
1539 struct omb_class_stat *oc;
1540 struct mb_class_stat *c;
1541
1542 omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1543 oc = &omb_stat->mbs_class[0];
1544 c = &mb_stat->mbs_class[0];
1545 for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1546 (void) snprintf(oc->mbcl_cname, sizeof(oc->mbcl_cname),
1547 "%s", c->mbcl_cname);
1548 oc->mbcl_size = c->mbcl_size;
1549 oc->mbcl_total = c->mbcl_total;
1550 oc->mbcl_active = c->mbcl_active;
1551 oc->mbcl_infree = c->mbcl_infree;
1552 oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1553 oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1554 oc->mbcl_free_cnt = c->mbcl_free_cnt;
1555 oc->mbcl_notified = c->mbcl_notified;
1556 oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1557 oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1558 oc->mbcl_ctotal = c->mbcl_ctotal;
1559 oc->mbcl_release_cnt = c->mbcl_release_cnt;
1560 oc->mbcl_mc_state = c->mbcl_mc_state;
1561 oc->mbcl_mc_cached = c->mbcl_mc_cached;
1562 oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1563 oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1564 oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1565 }
1566 statp = omb_stat;
1567 statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1568 } else {
1569 statp = mb_stat;
1570 statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1571 }
1572
1573 lck_mtx_unlock(mbuf_mlock);
1574
1575 return SYSCTL_OUT(req, statp, statsz);
1576}
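/*
 * For illustration only: a 64-bit userland consumer of this handler would
 * look roughly like the sketch below.  The "kern.ipc.mb_stat" OID name is
 * an assumption of this example (the registration is not shown here); the
 * handler itself only chooses between the omb_stat and mb_stat layouts
 * based on the caller's wordsize.
 *
 *	size_t len = 0;
 *	if (sysctlbyname("kern.ipc.mb_stat", NULL, &len, NULL, 0) == 0) {
 *		mb_stat_t *stat = malloc(len);
 *		if (stat != NULL &&
 *		    sysctlbyname("kern.ipc.mb_stat", stat, &len, NULL, 0) == 0) {
 *			// stat->mbs_class[0..mbs_cnt-1] holds per-class stats
 *		}
 *		free(stat);
 *	}
 */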
1577
1578#if !CONFIG_MBUF_MCACHE
1579/*
1580 * The following functions are wrappers around mbuf
1581 * allocation for zalloc. They all have the prefix "mz"
1582 * which was chosen to avoid conflicts with the mbuf KPIs.
1583 *
1584 * Z_NOPAGEWAIT is used in place of Z_NOWAIT because
1585 * Z_NOPAGEWAIT maps closer to MCR_TRYHARD. Z_NOWAIT will
1586 * fail immediately if it has to take a mutex and that
1587 * may cause packets to be dropped more frequently.
1588 * In general, the mbuf subsystem can sustain grabbing a mutex
1589 * during "non-blocking" allocation and that's the reason
1590 * why Z_NOPAGEWAIT was chosen.
1591 *
1592 * mbufs are elided (all pointers removed) before they are
1593 * returned to the cache.  The exception is composite mbufs, which
1594 * are re-initialized on allocation.
1595 */
1596__attribute__((always_inline))
1597static inline void
1598m_elide(struct mbuf *m)
1599{
1600 m->m_next = m->m_nextpkt = NULL;
1601 m->m_data = 0;
1602 memset(&m->m_ext, 0, sizeof(m->m_ext));
1603 m->m_pkthdr.rcvif = NULL;
1604 m->m_pkthdr.pkt_hdr = NULL;
1605 m->m_flags |= M_PKTHDR;
1606 m_tag_init(m, 1);
1607 m->m_pkthdr.pkt_flags = 0;
1608 m_scratch_init(m);
1609 m->m_pkthdr.redzone = 0;
1610 m->m_flags &= ~M_PKTHDR;
1611}
1612
1613__attribute__((always_inline))
1614static inline struct mbuf *
1615mz_alloc(zalloc_flags_t flags)
1616{
1617 if (flags & Z_NOWAIT) {
1618 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1619 } else if (!(flags & Z_NOPAGEWAIT)) {
1620 flags |= Z_NOFAIL;
1621 }
1622 return zalloc_id(ZONE_ID_MBUF, flags | Z_NOZZC);
1623}
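/*
 * Minimal usage sketch for the wrapper above (illustrative only, not an
 * actual call site in this file): a non-blocking caller passes Z_NOWAIT,
 * which mz_alloc() translates into Z_NOPAGEWAIT, while a blocking caller
 * passes Z_WAITOK and implicitly gets Z_NOFAIL.
 *
 *	struct mbuf *m_noblock = mz_alloc(Z_NOWAIT);   // may return NULL
 *	struct mbuf *m_block   = mz_alloc(Z_WAITOK);   // does not fail
 *	if (m_noblock != NULL) {
 *		mz_free(m_noblock);
 *	}
 *	mz_free(m_block);
 */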
1624
1625__attribute__((always_inline))
1626static inline zstack_t
1627mz_alloc_n(uint32_t count, zalloc_flags_t flags)
1628{
1629 if (flags & Z_NOWAIT) {
1630 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1631 } else if (!(flags & Z_NOPAGEWAIT)) {
1632 flags |= Z_NOFAIL;
1633 }
1634 return zalloc_n(ZONE_ID_MBUF, count, flags | Z_NOZZC);
1635}
1636
1637__attribute__((always_inline))
1638static inline void
1639mz_free(struct mbuf *m)
1640{
1641#if KASAN
1642 zone_require(zone_by_id(ZONE_ID_MBUF), m);
1643#endif
1644 m_elide(m);
1645 zfree_nozero(ZONE_ID_MBUF, m);
1646}
1647
1648__attribute__((always_inline))
1649static inline void
1650mz_free_n(zstack_t list)
1651{
1652 /* Callers of this function have already elided the mbuf. */
1653 zfree_nozero_n(ZONE_ID_MBUF, list);
1654}
1655
1656__attribute__((always_inline))
1657static inline struct ext_ref *
1658mz_ref_alloc(zalloc_flags_t flags)
1659{
1660 if (flags & Z_NOWAIT) {
1661 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1662 }
1663 return zalloc_id(ZONE_ID_MBUF_REF, flags | Z_NOZZC);
1664}
1665
1666__attribute__((always_inline))
1667static inline void
1668mz_ref_free(struct ext_ref *rfa)
1669{
1670 VERIFY(rfa->minref == rfa->refcnt);
1671#if KASAN
1672 zone_require(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1673#endif
1674 zfree_nozero(ZONE_ID_MBUF_REF, rfa);
1675}
1676
1677__attribute__((always_inline))
1678static inline void *
1679mz_cl_alloc(zone_id_t zid, zalloc_flags_t flags)
1680{
1681 if (flags & Z_NOWAIT) {
1682 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1683 } else if (!(flags & Z_NOPAGEWAIT)) {
1684 flags |= Z_NOFAIL;
1685 }
1686 return (zalloc_id)(zid, flags | Z_NOZZC);
1687}
1688
1689__attribute__((always_inline))
1690static inline void
1691mz_cl_free(zone_id_t zid, void *cl)
1692{
1693#if KASAN
1694 zone_require(zone_by_id(zid), cl);
1695#endif
1696 zfree_nozero(zid, cl);
1697}
1698
1699__attribute__((always_inline))
1700static inline zstack_t
1701mz_composite_alloc_n(mbuf_class_t class, unsigned int n, zalloc_flags_t flags)
1702{
1703 if (flags & Z_NOWAIT) {
1704 flags ^= Z_NOWAIT | Z_NOPAGEWAIT;
1705 }
1706 return (zcache_alloc_n)(m_class_to_zid(class), n, flags,
1707 &mz_composite_ops);
1708}
1709
1710__attribute__((always_inline))
1711static inline struct mbuf *
1712mz_composite_alloc(mbuf_class_t class, zalloc_flags_t flags)
1713{
1714 zstack_t list = {};
1715 list = mz_composite_alloc_n(class, 1, flags);
1716 if (!zstack_empty(list)) {
1717 return zstack_pop(&list);
1718 } else {
1719 return NULL;
1720 }
1721}
1722
1723__attribute__((always_inline))
1724static inline void
1725mz_composite_free_n(mbuf_class_t class, zstack_t list)
1726{
1727 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
1728}
1729
1730__attribute__((always_inline))
1731static inline void
1732mz_composite_free(mbuf_class_t class, struct mbuf *m)
1733{
1734 zstack_t list = {};
1735 zstack_push(&list, m);
1736 (zcache_free_n)(m_class_to_zid(class), list, &mz_composite_ops);
1737}
1738
1739/* Converts composite zone ID to the cluster zone ID. */
1740__attribute__((always_inline))
1741static inline zone_id_t
1742mz_cl_zid(zone_id_t zid)
1743{
1744 return ZONE_ID_CLUSTER_2K + zid - ZONE_ID_MBUF_CLUSTER_2K;
1745}
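/*
 * Worked example for mz_cl_zid(), assuming the composite and plain cluster
 * zone IDs are laid out contiguously and in the same order (which is what
 * the arithmetic above relies on); the 4K name below is inferred by analogy
 * and only meant as an illustration:
 *
 *	mz_cl_zid(ZONE_ID_MBUF_CLUSTER_2K) == ZONE_ID_CLUSTER_2K
 *	mz_cl_zid(ZONE_ID_MBUF_CLUSTER_4K) == ZONE_ID_CLUSTER_4K
 *
 * and likewise for the 16K variant.
 */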
1746
1747static void *
1748mz_composite_build(zone_id_t zid, zalloc_flags_t flags)
1749{
1750 const zone_id_t cl_zid = mz_cl_zid(zid);
1751 struct mbuf *m = NULL;
1752 struct ext_ref *rfa = NULL;
1753 void *cl = NULL;
1754
1755 cl = mz_cl_alloc(cl_zid, flags);
1756 if (__improbable(cl == NULL)) {
1757 goto out;
1758 }
1759 rfa = mz_ref_alloc(flags);
1760 if (__improbable(rfa == NULL)) {
1761 goto out_free_cl;
1762 }
1763 m = mz_alloc(flags);
1764 if (__improbable(m == NULL)) {
1765 goto out_free_rfa;
1766 }
1767 MBUF_INIT(m, 0, MT_FREE);
1768 if (zid == ZONE_ID_MBUF_CLUSTER_2K) {
1769 MBUF_CL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
1770 } else if (zid == ZONE_ID_MBUF_CLUSTER_4K) {
1771 MBUF_BIGCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
1772 } else {
1773 MBUF_16KCL_INIT(m, cl, rfa, 0, EXTF_COMPOSITE);
1774 }
1775 VERIFY(m->m_flags == M_EXT);
1776 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
1777
1778 return m;
1779out_free_rfa:
1780 mz_ref_free(rfa);
1781out_free_cl:
1782 mz_cl_free(cl_zid, cl);
1783out:
1784 return NULL;
1785}
1786
1787static void *
1788mz_composite_mark_valid(zone_id_t zid, void *p)
1789{
1790 struct mbuf *m = p;
1791
1792 m = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF), m);
1793#if KASAN
1794 struct ext_ref *rfa = m_get_rfa(m);
1795 const zone_id_t cl_zid = mz_cl_zid(zid);
1796 void *cl = m->m_ext.ext_buf;
1797
1798 cl = zcache_mark_valid(zone_by_id(cl_zid), cl);
1799 rfa = zcache_mark_valid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1800 m->m_data = (uintptr_t)cl;
1801 m->m_ext.ext_buf = cl;
1802 m_set_rfa(m, rfa);
1803#else
1804#pragma unused(zid)
1805#endif
1806 VERIFY(MBUF_IS_COMPOSITE(m));
1807
1808 return m;
1809}
1810
1811static void *
1812mz_composite_mark_invalid(zone_id_t zid, void *p)
1813{
1814 struct mbuf *m = p;
1815
1816 VERIFY(MBUF_IS_COMPOSITE(m));
1817 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
1818#if KASAN
1819 struct ext_ref *rfa = m_get_rfa(m);
1820 const zone_id_t cl_zid = mz_cl_zid(zid);
1821 void *cl = m->m_ext.ext_buf;
1822
1823 cl = zcache_mark_invalid(zone_by_id(cl_zid), cl);
1824 rfa = zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF_REF), rfa);
1825 m->m_data = (uintptr_t)cl;
1826 m->m_ext.ext_buf = cl;
1827 m_set_rfa(m, rfa);
1828#else
1829#pragma unused(zid)
1830#endif
1831
1832 return zcache_mark_invalid(zone_by_id(ZONE_ID_MBUF), m);
1833}
1834
1835static void
1836mz_composite_destroy(zone_id_t zid, void *p)
1837{
1838 const zone_id_t cl_zid = mz_cl_zid(zid);
1839 struct ext_ref *rfa = NULL;
1840 struct mbuf *m = p;
1841
1842 VERIFY(MBUF_IS_COMPOSITE(m));
1843
1844 MEXT_MINREF(m) = 0;
1845 MEXT_REF(m) = 0;
1846 MEXT_PREF(m) = 0;
1847 MEXT_FLAGS(m) = 0;
1848 MEXT_PRIV(m) = 0;
1849 MEXT_PMBUF(m) = NULL;
1850 MEXT_TOKEN(m) = 0;
1851
1852 rfa = m_get_rfa(m);
1853 m_set_ext(m, NULL, NULL, NULL);
1854
1855 m->m_type = MT_FREE;
1856 m->m_flags = m->m_len = 0;
1857 m->m_next = m->m_nextpkt = NULL;
1858
1859 mz_cl_free(cl_zid, m->m_ext.ext_buf);
1860 m->m_ext.ext_buf = NULL;
1861 mz_ref_free(rfa);
1862 mz_free(m);
1863}
1864#endif /* !CONFIG_MBUF_MCACHE */
1865
1866#if CONFIG_MBUF_MCACHE
1867static int
1868mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1869{
1870#pragma unused(oidp, arg1, arg2)
1871 int i;
1872
1873 /* Ensure leak tracing turned on */
1874 if (!mclfindleak || !mclexpleak) {
1875 return ENXIO;
1876 }
1877
1878 lck_mtx_lock(mleak_lock);
1879 mleak_update_stats();
1880 i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1881 lck_mtx_unlock(mleak_lock);
1882
1883 return i;
1884}
1885
1886static int
1887mleak_table_sysctl SYSCTL_HANDLER_ARGS
1888{
1889#pragma unused(oidp, arg1, arg2)
1890 int i = 0;
1891
1892 /* Ensure leak tracing turned on */
1893 if (!mclfindleak || !mclexpleak) {
1894 return ENXIO;
1895 }
1896
1897 lck_mtx_lock(mleak_lock);
1898 i = SYSCTL_OUT(req, &mleak_table, sizeof(mleak_table));
1899 lck_mtx_unlock(mleak_lock);
1900
1901 return i;
1902}
1903#endif /* CONFIG_MBUF_MCACHE */
1904
1905static inline void
1906m_incref(struct mbuf *m)
1907{
1908 uint16_t new = os_atomic_inc(&MEXT_REF(m), relaxed);
1909
1910 VERIFY(new != 0);
1911 /*
1912 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1913 * we don't clear the flag when the refcount goes back to the
1914 * minimum, to simplify code calling m_mclhasreference().
1915 */
1916 if (new > (MEXT_MINREF(m) + 1) && !(MEXT_FLAGS(m) & EXTF_READONLY)) {
1917 os_atomic_or(&MEXT_FLAGS(m), EXTF_READONLY, relaxed);
1918 }
1919}
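/*
 * Example of the sticky EXTF_READONLY behavior described above, for an
 * ordinary cluster whose minimum refcount is 0: the first m_incref() that
 * shares it raises MEXT_REF from 1 to 2, which exceeds MEXT_MINREF + 1, so
 * EXTF_READONLY is set.  A later m_decref() back to 1 leaves the flag set,
 * so code that consults m_mclhasreference() keeps treating the cluster as
 * potentially shared.
 */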
1920
1921static inline uint16_t
1922m_decref(struct mbuf *m)
1923{
1924 VERIFY(MEXT_REF(m) != 0);
1925
1926 return os_atomic_dec(&MEXT_REF(m), acq_rel);
1927}
1928
1929static void
1930mbuf_table_init(void)
1931{
1932 unsigned int b, c, s;
1933 int m, config_mbuf_jumbo = 0;
1934
1935 omb_stat = zalloc_permanent(OMB_STAT_SIZE(NELEM(mbuf_table)),
1936 ZALIGN(struct omb_stat));
1937
1938 mb_stat = zalloc_permanent(MB_STAT_SIZE(NELEM(mbuf_table)),
1939 ZALIGN(mb_stat_t));
1940
1941 mb_stat->mbs_cnt = NELEM(mbuf_table);
1942 for (m = 0; m < NELEM(mbuf_table); m++) {
1943 mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1944 }
1945
1946#if CONFIG_MBUF_JUMBO
1947 config_mbuf_jumbo = 1;
1948#endif /* CONFIG_MBUF_JUMBO */
1949
1950 if (config_mbuf_jumbo == 1 || PAGE_SIZE == M16KCLBYTES) {
1951 /*
1952 * Set aside 1/3 of the mbuf cluster map for jumbo
1953 * clusters; we do this only on platforms where jumbo
1954 * cluster pool is enabled.
1955 */
1956 njcl = nmbclusters / 3;
1957 njclbytes = M16KCLBYTES;
1958 }
1959
1960 /*
1961 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1962 * a multiple of 4KB clusters.
1963 */
1964 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1965 if (njcl > 0) {
1966 /*
1967 * Each jumbo cluster takes 8 2KB clusters, so make
1968 * sure that the pool size is evenly divisible by 8;
1969 * njcl is in 2KB unit, hence treated as such.
1970 */
1971 njcl = P2ROUNDDOWN(nmbclusters - nclusters, NCLPJCL);
1972
1973 /* Update nclusters with rounded down value of njcl */
1974 nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPG);
1975 }
1976
1977 /*
1978 * njcl is valid only on platforms with 16KB jumbo clusters or
1979 * with 16KB pages, where it is configured to 1/3 of the pool
1980 * size. On these platforms, the remaining is used for 2KB
1981 * and 4KB clusters. On platforms without 16KB jumbo clusters,
1982 * the entire pool is used for both 2KB and 4KB clusters. A 4KB
1983 * cluster can either be split into 16 mbufs, or into 2 2KB
1984 * clusters.
1985 *
1986 * +---+---+------------ ... -----------+------- ... -------+
1987 * | c | b | s | njcl |
1988 * +---+---+------------ ... -----------+------- ... -------+
1989 *
1990 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1991 * clusters (1/64th each.)
1992 */
1993 c = P2ROUNDDOWN((nclusters >> 6), NCLPG); /* in 2KB unit */
1994 b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), NBCLPG); /* in 4KB unit */
1995 s = nclusters - (c + (b << NCLPBGSHIFT)); /* in 2KB unit */
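	/*
	 * Worked example of the split above (illustrative numbers only),
	 * assuming a 4KB page size (NCLPG == 2, NBCLPG == 1) with
	 * nmbclusters == 32768 (a 64MB pool) and no jumbo pool (njcl == 0):
	 *
	 *	nclusters = 32768
	 *	c = P2ROUNDDOWN(32768 >> 6, 2) = 512    (1/64th, 2KB units)
	 *	b = P2ROUNDDOWN(32768 >> 7, 1) = 256    (1/64th, 4KB units)
	 *	s = 32768 - (512 + (256 << 1)) = 31744  (31/32, 2KB units)
	 */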
1996
1997 /*
1998 * 1/64th (c) is reserved for 2KB clusters.
1999 */
2000 m_minlimit(MC_CL) = c;
2001 m_maxlimit(MC_CL) = s + c; /* in 2KB unit */
2002 m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
2003 snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
2004
2005 /*
2006 * Another 1/64th (b) of the map is reserved for 4KB clusters.
2007 * It cannot be turned into 2KB clusters or mbufs.
2008 */
2009 m_minlimit(MC_BIGCL) = b;
2010 m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b; /* in 4KB unit */
2011 m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
2012 snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
2013
2014 /*
2015 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
2016 */
2017 m_minlimit(MC_MBUF) = 0;
2018 m_maxlimit(MC_MBUF) = s * NMBPCL; /* in mbuf unit */
2019 m_maxsize(MC_MBUF) = m_size(MC_MBUF) = _MSIZE;
2020 snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
2021
2022 /*
2023 * Set limits for the composite classes.
2024 */
2025 m_minlimit(MC_MBUF_CL) = 0;
2026 m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
2027 m_maxsize(MC_MBUF_CL) = MCLBYTES;
2028 m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
2029 snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
2030
2031 m_minlimit(MC_MBUF_BIGCL) = 0;
2032 m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
2033 m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
2034 m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
2035 snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
2036
2037 /*
2038 * And for jumbo classes.
2039 */
2040 m_minlimit(MC_16KCL) = 0;
2041 m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT); /* in 16KB unit */
2042 m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
2043 snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
2044
2045 m_minlimit(MC_MBUF_16KCL) = 0;
2046 m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
2047 m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
2048 m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
2049 snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
2050
2051 /*
2052 * Initialize the legacy mbstat structure.
2053 */
2054 bzero(&mbstat, sizeof(mbstat));
2055 mbstat.m_msize = m_maxsize(MC_MBUF);
2056 mbstat.m_mclbytes = m_maxsize(MC_CL);
2057 mbstat.m_minclsize = MINCLSIZE;
2058 mbstat.m_mlen = MLEN;
2059 mbstat.m_mhlen = MHLEN;
2060 mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
2061}
2062
2063static int
2064mbuf_get_class(struct mbuf *m)
2065{
2066 if (m->m_flags & M_EXT) {
2067 uint32_t composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
2068 m_ext_free_func_t m_free_func = m_get_ext_free(m);
2069
2070 if (m_free_func == NULL) {
2071 if (composite) {
2072 return MC_MBUF_CL;
2073 } else {
2074 return MC_CL;
2075 }
2076 } else if (m_free_func == m_bigfree) {
2077 if (composite) {
2078 return MC_MBUF_BIGCL;
2079 } else {
2080 return MC_BIGCL;
2081 }
2082 } else if (m_free_func == m_16kfree) {
2083 if (composite) {
2084 return MC_MBUF_16KCL;
2085 } else {
2086 return MC_16KCL;
2087 }
2088 }
2089 }
2090
2091 return MC_MBUF;
2092}
2093
2094bool
2095mbuf_class_under_pressure(struct mbuf *m)
2096{
2097 int mclass = mbuf_get_class(m);
2098
2099#if CONFIG_MBUF_MCACHE
2100 if (m_total(mclass) - m_infree(mclass) >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
2101 /*
2102 * The above computation does not include the per-CPU cached objects.
2103 * As a fast-path check this is good-enough. But now we do
2104 * the "slower" count of the cached objects to know exactly the
2105 * number of active mbufs in use.
2106 *
2107 * We do not take the mbuf_lock here to avoid lock-contention. Numbers
2108 * might be slightly off but we don't try to be 100% accurate.
2109 * At worst, we drop a packet that we shouldn't have dropped or
2110 * we might go slightly above our memory-pressure threshold.
2111 */
2112 mcache_t *cp = m_cache(mclass);
2113 mcache_cpu_t *ccp = &cp->mc_cpu[0];
2114
2115 int bktsize = os_access_once(ccp->cc_bktsize);
2116 uint32_t bl_total = os_access_once(cp->mc_full.bl_total);
2117 uint32_t cached = 0;
2118 int i;
2119
2120 for (i = 0; i < ncpu; i++) {
2121 ccp = &cp->mc_cpu[i];
2122
2123 int cc_objs = os_access_once(ccp->cc_objs);
2124 if (cc_objs > 0) {
2125 cached += cc_objs;
2126 }
2127
2128 int cc_pobjs = os_access_once(ccp->cc_pobjs);
2129 if (cc_pobjs > 0) {
2130 cached += cc_pobjs;
2131 }
2132 }
2133 cached += (bl_total * bktsize);
2134 if (m_total(mclass) - m_infree(mclass) - cached >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
2135 os_log(OS_LOG_DEFAULT,
2136 "%s memory-pressure on mbuf due to class %u, total %u free %u cached %u max %u",
2137 __func__, mclass, m_total(mclass), m_infree(mclass), cached, m_maxlimit(mclass));
2138 return true;
2139 }
2140 }
2141#else
2142 /*
2143 * Grab the statistics from zalloc.
2144 * We can't call mbuf_stat_sync() since that requires a lock.
2145 */
2146 const zone_id_t zid = m_class_to_zid(m_class(mclass));
2147 const zone_t zone = zone_by_id(zid);
2148 struct zone_basic_stats stats = {};
2149
2150 zone_get_stats(zone, &stats);
2151 if (stats.zbs_avail - stats.zbs_free >= (m_maxlimit(mclass) * mb_memory_pressure_percentage) / 100) {
2152 os_log(OS_LOG_DEFAULT,
2153 "%s memory-pressure on mbuf due to class %u, total %llu free %llu max %u",
2154 __func__, mclass, stats.zbs_avail, stats.zbs_free, m_maxlimit(mclass));
2155 return true;
2156 }
2157#endif /* CONFIG_MBUF_MCACHE */
2158
2159 return false;
2160}
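/*
 * Illustration of the threshold check above, with made-up numbers: if
 * m_maxlimit(mclass) is 65536 and mb_memory_pressure_percentage is 80,
 * the class is reported as under pressure once roughly
 * 65536 * 80 / 100 = 52428 objects are in use (total minus free, and
 * minus the per-CPU cached objects in the mcache case).
 */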
2161
2162#if defined(__LP64__)
2163typedef struct ncl_tbl {
2164 uint64_t nt_maxmem; /* memory (sane) size */
2165 uint32_t nt_mbpool; /* mbuf pool size */
2166} ncl_tbl_t;
2167
2168static const ncl_tbl_t ncl_table[] = {
2169 { (1ULL << GBSHIFT) /* 1 GB */, (64 << MBSHIFT) /* 64 MB */ },
2170 { (1ULL << (GBSHIFT + 2)) /* 4 GB */, (96 << MBSHIFT) /* 96 MB */ },
2171 { (1ULL << (GBSHIFT + 3)) /* 8 GB */, (128 << MBSHIFT) /* 128 MB */ },
2172 { (1ULL << (GBSHIFT + 4)) /* 16 GB */, (256 << MBSHIFT) /* 256 MB */ },
2173 { (1ULL << (GBSHIFT + 5)) /* 32 GB */, (512 << MBSHIFT) /* 512 MB */ },
2174 { 0, 0 }
2175};
2176#endif /* __LP64__ */
2177
2178__private_extern__ unsigned int
2179mbuf_default_ncl(uint64_t mem)
2180{
2181#if !defined(__LP64__)
2182 unsigned int n;
2183 /*
2184 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
2185 */
2186 if ((n = ((mem / 16) / MCLBYTES)) > 32768) {
2187 n = 32768;
2188 }
2189#else
2190 unsigned int n, i;
2191 /*
2192 * 64-bit kernel (mbuf pool size based on table).
2193 */
2194 n = ncl_table[0].nt_mbpool;
2195 for (i = 0; ncl_table[i].nt_mbpool != 0; i++) {
2196 if (mem < ncl_table[i].nt_maxmem) {
2197 break;
2198 }
2199 n = ncl_table[i].nt_mbpool;
2200 }
2201 n >>= MCLSHIFT;
2202#endif /* !__LP64__ */
2203 return n;
2204}
2205
2206__private_extern__ void
2207mbinit(void)
2208{
2209 unsigned int m;
2210#if CONFIG_MBUF_MCACHE
2211 unsigned int initmcl = 0;
2212 thread_t thread = THREAD_NULL;
2213#endif /* CONFIG_MBUF_MCACHE */
2214
2215#if CONFIG_MBUF_MCACHE
2216 microuptime(&mb_start);
2217#endif /* CONFIG_MBUF_MCACHE */
2218
2219 /*
2220 * These MBUF_ values must be equal to their private counterparts.
2221 */
2222 _CASSERT(MBUF_EXT == M_EXT);
2223 _CASSERT(MBUF_PKTHDR == M_PKTHDR);
2224 _CASSERT(MBUF_EOR == M_EOR);
2225 _CASSERT(MBUF_LOOP == M_LOOP);
2226 _CASSERT(MBUF_BCAST == M_BCAST);
2227 _CASSERT(MBUF_MCAST == M_MCAST);
2228 _CASSERT(MBUF_FRAG == M_FRAG);
2229 _CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
2230 _CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
2231 _CASSERT(MBUF_PROMISC == M_PROMISC);
2232 _CASSERT(MBUF_HASFCS == M_HASFCS);
2233
2234 _CASSERT(MBUF_TYPE_FREE == MT_FREE);
2235 _CASSERT(MBUF_TYPE_DATA == MT_DATA);
2236 _CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
2237 _CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
2238 _CASSERT(MBUF_TYPE_PCB == MT_PCB);
2239 _CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
2240 _CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
2241 _CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
2242 _CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
2243 _CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
2244 _CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
2245 _CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
2246 _CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
2247 _CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
2248 _CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
2249
2250 _CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
2251 _CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
2252 _CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_PARTIAL);
2253 _CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
2254 _CASSERT(MBUF_CSUM_REQ_ZERO_INVERT == CSUM_ZERO_INVERT);
2255 _CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
2256 _CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
2257 _CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
2258 _CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
2259 _CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
2260 _CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
2261 _CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
2262 _CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
2263 _CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
2264
2265 _CASSERT(MBUF_WAITOK == M_WAIT);
2266 _CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
2267 _CASSERT(MBUF_COPYALL == M_COPYALL);
2268
2269 _CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
2270 _CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
2271 _CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
2272 _CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
2273 _CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
2274 _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
2275 _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
2276 _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
2277 _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
2278 _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
2279 _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
2280
2281 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
2282 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
2283 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
2284 _CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
2285
2286 /* Module specific scratch space (32-bit alignment requirement) */
2287 _CASSERT(!(offsetof(struct mbuf, m_pkthdr.pkt_mpriv) %
2288 sizeof(uint32_t)));
2289
2290 /* pktdata needs to start at 128-bit offset! */
2291 _CASSERT((offsetof(struct mbuf, m_pktdat) % 16) == 0);
2292
2293 /* Initialize random red zone cookie value */
2294 _CASSERT(sizeof(mb_redzone_cookie) ==
2295 sizeof(((struct pkthdr *)0)->redzone));
2296 read_random(&mb_redzone_cookie, sizeof(mb_redzone_cookie));
2297 read_random(&mb_obscure_extref, sizeof(mb_obscure_extref));
2298 read_random(&mb_obscure_extfree, sizeof(mb_obscure_extfree));
2299 mb_obscure_extref |= 0x3;
2300 mb_obscure_extref = 0;
2301 mb_obscure_extfree |= 0x3;
2302
2303#if CONFIG_MBUF_MCACHE
2304 /* Make sure we don't save more than we should */
2305 _CASSERT(MCA_SAVED_MBUF_SIZE <= sizeof(struct mbuf));
2306#endif /* CONFIG_MBUF_MCACHE */
2307
2308 if (nmbclusters == 0) {
2309 nmbclusters = NMBCLUSTERS;
2310 }
2311
2312 /* This should be a sane (at least even) value by now */
2313 VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
2314
2315 /* Setup the mbuf table */
2316 mbuf_table_init();
2317
2318 _CASSERT(sizeof(struct mbuf) == _MSIZE);
2319
2320#if CONFIG_MBUF_MCACHE
2321 /*
2322 * Allocate cluster slabs table:
2323 *
2324 * maxslabgrp = (N * 2048) / (1024 * 1024)
2325 *
2326 * Where N is nmbclusters rounded up to the nearest 512. This yields
2327 * mcl_slab_g_t units, each one representing a MB of memory.
2328 */
2329 maxslabgrp =
2330 (P2ROUNDUP(nmbclusters, (MBSIZE >> MCLSHIFT)) << MCLSHIFT) >> MBSHIFT;
2331 slabstbl = zalloc_permanent(maxslabgrp * sizeof(mcl_slabg_t *),
2332 ZALIGN(mcl_slabg_t));
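	/*
	 * Worked example of the formula above (illustrative only): with
	 * nmbclusters = 32768, N is already a multiple of 512, so
	 * maxslabgrp = (32768 * 2048) / (1024 * 1024) = 64, i.e. 64
	 * mcl_slabg_t units covering 64MB of cluster memory.  With 4KB
	 * pages, the audit case below would then size maxclaudit at
	 * (64 * 1024 * 1024) / 4096 = 16384 entries.
	 */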
2333
2334 /*
2335 * Allocate audit structures, if needed:
2336 *
2337 * maxclaudit = (maxslabgrp * 1024 * 1024) / PAGE_SIZE
2338 *
2339 * This yields mcl_audit_t units, each one representing a page.
2340 */
2341 PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof(mbuf_debug));
2342 mbuf_debug |= mcache_getflags();
2343 if (mbuf_debug & MCF_DEBUG) {
2344 int l;
2345 mcl_audit_t *mclad;
2346 maxclaudit = ((maxslabgrp << MBSHIFT) >> PAGE_SHIFT);
2347 mclaudit = zalloc_permanent(maxclaudit * sizeof(*mclaudit),
2348 ZALIGN(mcl_audit_t));
2349 for (l = 0, mclad = mclaudit; l < maxclaudit; l++) {
2350 mclad[l].cl_audit = zalloc_permanent(NMBPG * sizeof(mcache_audit_t *),
2351 ZALIGN_PTR);
2352 }
2353
2354 mcl_audit_con_cache = mcache_create("mcl_audit_contents",
2355 AUDIT_CONTENTS_SIZE, sizeof(u_int64_t), 0, MCR_SLEEP);
2356 VERIFY(mcl_audit_con_cache != NULL);
2357 }
2358 mclverify = (mbuf_debug & MCF_VERIFY);
2359 mcltrace = (mbuf_debug & MCF_TRACE);
2360 mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
2361 mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
2362
2363 /* Enable mbuf leak logging, with a lock to protect the tables */
2364
2365 mleak_activate();
2366
2367 /*
2368 * Allocate structure for per-CPU statistics that's aligned
2369 * on the CPU cache boundary; this code assumes that we never
2370 * uninitialize this framework, since the original address
2371 * before alignment is not saved.
2372 */
2373 ncpu = ml_wait_max_cpus();
2374
2375 /* Calculate the number of pages assigned to the cluster pool */
2376 mcl_pages = (nmbclusters << MCLSHIFT) / PAGE_SIZE;
2377 mcl_paddr = zalloc_permanent(mcl_pages * sizeof(ppnum_t),
2378 ZALIGN(ppnum_t));
2379
2380 /* Register with the I/O Bus mapper */
2381 mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
2382
2383 embutl = (mbutl + (nmbclusters * MCLBYTES));
2384 VERIFY(((embutl - mbutl) % MBIGCLBYTES) == 0);
2385
2386 /* Prime up the freelist */
2387 PE_parse_boot_argn("initmcl", &initmcl, sizeof(initmcl));
2388 if (initmcl != 0) {
2389 initmcl >>= NCLPBGSHIFT; /* become a 4K unit */
2390 if (initmcl > m_maxlimit(MC_BIGCL)) {
2391 initmcl = m_maxlimit(MC_BIGCL);
2392 }
2393 }
2394 if (initmcl < m_minlimit(MC_BIGCL)) {
2395 initmcl = m_minlimit(MC_BIGCL);
2396 }
2397
2398 lck_mtx_lock(mbuf_mlock);
2399
2400 /*
2401 * For classes with non-zero minimum limits, populate their freelists
2402 * so that m_total(class) is at least m_minlimit(class).
2403 */
2404 VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
2405 freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
2406 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2407 freelist_init(m_class(MC_CL));
2408#else
2409 /*
2410 * We have yet to create the non-composite zones
2411 * and thus we haven't asked zalloc to allocate
2412 * anything yet, which means that at this point
2413 * m_total() is zero. Once we create the zones and
2414 * raise the reserve, m_total() will be calculated,
2415 * but until then just assume that we will have
2416 * at least the minimum limit allocated.
2417 */
2418 m_total(MC_BIGCL) = m_minlimit(MC_BIGCL);
2419 m_total(MC_CL) = m_minlimit(MC_CL);
2420#endif /* CONFIG_MBUF_MCACHE */
2421
2422 for (m = 0; m < NELEM(mbuf_table); m++) {
2423 /* Make sure we didn't miss any */
2424 VERIFY(m_minlimit(m_class(m)) == 0 ||
2425 m_total(m_class(m)) >= m_minlimit(m_class(m)));
2426 }
2427
2428#if CONFIG_MBUF_MCACHE
2429 lck_mtx_unlock(mbuf_mlock);
2430
2431 (void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
2432 NULL, &thread);
2433 thread_deallocate(thread);
2434
2435 ref_cache = mcache_create("mext_ref", sizeof(struct ext_ref),
2436 0, 0, MCR_SLEEP);
2437#endif /* CONFIG_MBUF_MCACHE */
2438
2439 /* Create the cache for each class */
2440 for (m = 0; m < NELEM(mbuf_table); m++) {
2441#if CONFIG_MBUF_MCACHE
2442 void *allocfunc, *freefunc, *auditfunc, *logfunc;
2443 u_int32_t flags;
2444
2445 flags = mbuf_debug;
2446 if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
2447 m_class(m) == MC_MBUF_16KCL) {
2448 allocfunc = mbuf_cslab_alloc;
2449 freefunc = mbuf_cslab_free;
2450 auditfunc = mbuf_cslab_audit;
2451 logfunc = mleak_logger;
2452 } else {
2453 allocfunc = mbuf_slab_alloc;
2454 freefunc = mbuf_slab_free;
2455 auditfunc = mbuf_slab_audit;
2456 logfunc = mleak_logger;
2457 }
2458
2459 /*
2460 * Disable per-CPU caches for jumbo classes if there
2461 * is no jumbo cluster pool available in the system.
2462 * The cache itself is still created (but will never
2463 * be populated) since it simplifies the code.
2464 */
2465 if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
2466 njcl == 0) {
2467 flags |= MCF_NOCPUCACHE;
2468 }
2469
2470 if (!mclfindleak) {
2471 flags |= MCF_NOLEAKLOG;
2472 }
2473
2474 m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
2475 allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
2476 (void *)(uintptr_t)m, flags, MCR_SLEEP);
2477#else
2478 if (!MBUF_CLASS_COMPOSITE(m)) {
2479 zone_t zone = zone_by_id(m_class_to_zid(m));
2480
2481 zone_set_exhaustible(zone, m_maxlimit(m), false);
2482 zone_raise_reserve(zone, m_minlimit(m));
2483 /*
2484 * Pretend that we have allocated m_total() items
2485 * at this point. zalloc will eventually do that
2486 * but it's an async operation.
2487 */
2488 m_total(m) = m_minlimit(m);
2489 }
2490#endif /* CONFIG_MBUF_MCACHE */
2491 }
2492
2493 /*
2494 * Set the max limit on sb_max to be 1/16 th of the size of
2495 * memory allocated for mbuf clusters.
2496 */
2497 high_sb_max = (nmbclusters << (MCLSHIFT - 4));
2498 if (high_sb_max < sb_max) {
2499 /* sb_max is too large for this configuration, scale it down */
2500 if (high_sb_max > (1 << MBSHIFT)) {
2501 /* We have at least 16 M of mbuf pool */
2502 sb_max = high_sb_max;
2503 } else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
2504 /*
2505 * If we have more than 1M of mbufpool, cap the size of
2506 * max sock buf at 1M
2507 */
2508 sb_max = high_sb_max = (1 << MBSHIFT);
2509 } else {
2510 sb_max = high_sb_max;
2511 }
2512 }
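	/*
	 * Worked example of the sb_max cap above (illustrative only): with
	 * nmbclusters = 32768 the cluster pool is 64MB, so
	 * high_sb_max = 32768 << (11 - 4) bytes = 4MB; since that is above
	 * 1MB, an sb_max larger than 4MB would be scaled down to 4MB here.
	 */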
2513
2514#if CONFIG_MBUF_MCACHE
2515 /* allocate space for mbuf_dump_buf */
2516 mbuf_dump_buf = zalloc_permanent(MBUF_DUMP_BUF_SIZE, ZALIGN_NONE);
2517
2518 if (mbuf_debug & MCF_DEBUG) {
2519 printf("%s: MLEN %d, MHLEN %d\n", __func__,
2520 (int)_MLEN, (int)_MHLEN);
2521 }
2522#else
2523 mbuf_defunct_tcall =
2524 thread_call_allocate_with_options(mbuf_watchdog_defunct,
2525 NULL,
2526 THREAD_CALL_PRIORITY_KERNEL,
2527 THREAD_CALL_OPTIONS_ONCE);
2528 mbuf_drain_tcall =
2529 thread_call_allocate_with_options(mbuf_watchdog_drain_composite,
2530 NULL,
2531 THREAD_CALL_PRIORITY_KERNEL,
2532 THREAD_CALL_OPTIONS_ONCE);
2533#endif /* CONFIG_MBUF_MCACHE */
2534 printf("%s: done [%d MB total pool size, (%d/%d) split]\n", __func__,
2535 (nmbclusters << MCLSHIFT) >> MBSHIFT,
2536 (nclusters << MCLSHIFT) >> MBSHIFT,
2537 (njcl << MCLSHIFT) >> MBSHIFT);
2538
2539 PE_parse_boot_argn("mb_tag_mbuf", &mb_tag_mbuf, sizeof(mb_tag_mbuf));
2540}
2541
2542#if CONFIG_MBUF_MCACHE
2543/*
2544 * Obtain a slab of object(s) from the class's freelist.
2545 */
2546static mcache_obj_t *
2547slab_alloc(mbuf_class_t class, int wait)
2548{
2549 mcl_slab_t *sp;
2550 mcache_obj_t *buf;
2551
2552 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2553
2554 /* This should always be NULL for us */
2555 VERIFY(m_cobjlist(class) == NULL);
2556
2557 /*
2558 * Treat composite objects as having a longer lifespan by using
2559 * a slab from the reverse direction, in the hope that this could
2560 * reduce the probability of fragmentation for slabs that hold
2561 * more than one buffer chunk (e.g. mbuf slabs).  For other
2562 * slabs, this probably doesn't make much of a difference.
2563 */
2564 if ((class == MC_MBUF || class == MC_CL || class == MC_BIGCL)
2565 && (wait & MCR_COMP)) {
2566 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
2567 } else {
2568 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
2569 }
2570
2571 if (sp == NULL) {
2572 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
2573 /* The slab list for this class is empty */
2574 return NULL;
2575 }
2576
2577 VERIFY(m_infree(class) > 0);
2578 VERIFY(!slab_is_detached(sp));
2579 VERIFY(sp->sl_class == class &&
2580 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2581 buf = sp->sl_head;
2582 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
2583 sp->sl_head = buf->obj_next;
2584 /* Increment slab reference */
2585 sp->sl_refcnt++;
2586
2587 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == sp->sl_chunks);
2588
2589 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
2590 slab_nextptr_panic(sp, sp->sl_head);
2591 /* In case sl_head is in the map but not in the slab */
2592 VERIFY(slab_inrange(sp, sp->sl_head));
2593 /* NOTREACHED */
2594 }
2595
2596 if (mclaudit != NULL) {
2597 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2598 mca->mca_uflags = 0;
2599 /* Save contents on mbuf objects only */
2600 if (class == MC_MBUF) {
2601 mca->mca_uflags |= MB_SCVALID;
2602 }
2603 }
2604
2605 if (class == MC_CL) {
2606 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2607 /*
2608 * A 2K cluster slab can have at most NCLPG references.
2609 */
2610 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPG &&
2611 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2612 VERIFY(sp->sl_refcnt < NCLPG || sp->sl_head == NULL);
2613 } else if (class == MC_BIGCL) {
2614 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
2615 m_infree(MC_MBUF_BIGCL);
2616 /*
2617 * A 4K cluster slab can have NBCLPG references.
2618 */
2619 VERIFY(sp->sl_refcnt >= 1 && sp->sl_chunks == NBCLPG &&
2620 sp->sl_len == PAGE_SIZE &&
2621 (sp->sl_refcnt < NBCLPG || sp->sl_head == NULL));
2622 } else if (class == MC_16KCL) {
2623 mcl_slab_t *nsp;
2624 int k;
2625
2626 --m_infree(MC_16KCL);
2627 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
2628 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2629 /*
2630 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
2631 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
2632 * most 1 reference.
2633 */
2634 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2635 nsp = nsp->sl_next;
2636 /* Next slab must already be present */
2637 VERIFY(nsp != NULL);
2638 nsp->sl_refcnt++;
2639 VERIFY(!slab_is_detached(nsp));
2640 VERIFY(nsp->sl_class == MC_16KCL &&
2641 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
2642 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
2643 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2644 nsp->sl_head == NULL);
2645 }
2646 } else {
2647 VERIFY(class == MC_MBUF);
2648 --m_infree(MC_MBUF);
2649 /*
2650 * If auditing is turned on, this check is
2651 * deferred until later in mbuf_slab_audit().
2652 */
2653 if (mclaudit == NULL) {
2654 _MCHECK((struct mbuf *)buf);
2655 }
2656 /*
2657 * Since we have incremented the reference count above,
2658 * an mbuf slab (formerly a 4KB cluster slab that was cut
2659 * up into mbufs) must have a reference count between 1
2660 * and NMBPG at this point.
2661 */
2662 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPG &&
2663 sp->sl_chunks == NMBPG &&
2664 sp->sl_len == PAGE_SIZE);
2665 VERIFY(sp->sl_refcnt < NMBPG || sp->sl_head == NULL);
2666 }
2667
2668 /* If empty, remove this slab from the class's freelist */
2669 if (sp->sl_head == NULL) {
2670 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPG);
2671 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPG);
2672 VERIFY(class != MC_BIGCL || sp->sl_refcnt == NBCLPG);
2673 slab_remove(sp, class);
2674 }
2675
2676 return buf;
2677}
2678
2679/*
2680 * Place a slab of object(s) back into a class's slab list.
2681 */
2682static void
2683slab_free(mbuf_class_t class, mcache_obj_t *buf)
2684{
2685 mcl_slab_t *sp;
2686 boolean_t reinit_supercl = false;
2687 mbuf_class_t super_class;
2688
2689 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2690
2691 VERIFY(class != MC_16KCL || njcl > 0);
2692 VERIFY(buf->obj_next == NULL);
2693
2694 /*
2695 * Synchronizing with m_clalloc, as it reads m_total, while we here
2696 * are modifying m_total.
2697 */
2698 while (mb_clalloc_busy) {
2699 mb_clalloc_waiters++;
2700 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2701 (PZERO - 1), "m_clalloc", NULL);
2702 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2703 }
2704
2705 /* We are busy now; tell everyone else to go away */
2706 mb_clalloc_busy = TRUE;
2707
2708 sp = slab_get(buf);
2709 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
2710 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2711
2712 /* Decrement slab reference */
2713 sp->sl_refcnt--;
2714
2715 if (class == MC_CL) {
2716 VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
2717 /*
2718 * A slab that has been split for 2KB clusters can have
2719 * at most 1 outstanding reference at this point.
2720 */
2721 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPG - 1) &&
2722 sp->sl_chunks == NCLPG && sp->sl_len == PAGE_SIZE);
2723 VERIFY(sp->sl_refcnt < (NCLPG - 1) ||
2724 (slab_is_detached(sp) && sp->sl_head == NULL));
2725 } else if (class == MC_BIGCL) {
2726 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
2727
2728 /* A 4KB cluster slab can have NBCLPG references at most */
2729 VERIFY(sp->sl_refcnt >= 0 && sp->sl_chunks == NBCLPG);
2730 VERIFY(sp->sl_refcnt < (NBCLPG - 1) ||
2731 (slab_is_detached(sp) && sp->sl_head == NULL));
2732 } else if (class == MC_16KCL) {
2733 mcl_slab_t *nsp;
2734 int k;
2735 /*
2736 * A 16KB cluster takes NSLABSP16KB slabs, all of which
2737 * must now have a reference count of 0.
2738 */
2739 VERIFY(IS_P2ALIGNED(buf, PAGE_SIZE));
2740 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
2741 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
2742 VERIFY(slab_is_detached(sp));
2743 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
2744 nsp = nsp->sl_next;
2745 /* Next slab must already be present */
2746 VERIFY(nsp != NULL);
2747 nsp->sl_refcnt--;
2748 VERIFY(slab_is_detached(nsp));
2749 VERIFY(nsp->sl_class == MC_16KCL &&
2750 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
2751 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
2752 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
2753 nsp->sl_head == NULL);
2754 }
2755 } else {
2756 /*
2757 * A slab that has been split for mbufs has at most
2758 * NMBPG reference counts. Since we have decremented
2759 * one reference above, it must now be between 0 and
2760 * NMBPG-1.
2761 */
2762 VERIFY(class == MC_MBUF);
2763 VERIFY(sp->sl_refcnt >= 0 &&
2764 sp->sl_refcnt <= (NMBPG - 1) &&
2765 sp->sl_chunks == NMBPG &&
2766 sp->sl_len == PAGE_SIZE);
2767 VERIFY(sp->sl_refcnt < (NMBPG - 1) ||
2768 (slab_is_detached(sp) && sp->sl_head == NULL));
2769 }
2770
2771 /*
2772 * When auditing is enabled, ensure that the buffer still
2773 * contains the free pattern. Otherwise it got corrupted
2774 * while at the CPU cache layer.
2775 */
2776 if (mclaudit != NULL) {
2777 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
2778 if (mclverify) {
2779 mcache_audit_free_verify(mca, buf, 0,
2780 m_maxsize(class));
2781 }
2782 mca->mca_uflags &= ~MB_SCVALID;
2783 }
2784
2785 if (class == MC_CL) {
2786 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
2787 buf->obj_next = sp->sl_head;
2788 } else if (class == MC_BIGCL) {
2789 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
2790 m_infree(MC_MBUF_BIGCL);
2791 buf->obj_next = sp->sl_head;
2792 } else if (class == MC_16KCL) {
2793 ++m_infree(MC_16KCL);
2794 } else {
2795 ++m_infree(MC_MBUF);
2796 buf->obj_next = sp->sl_head;
2797 }
2798 sp->sl_head = buf;
2799
2800 /*
2801 * If a slab has been split to either one which holds 2KB clusters,
2802 * or one which holds mbufs, turn it back to one which holds a
2803 * 4 or 16 KB cluster depending on the page size.
2804 */
2805 if (m_maxsize(MC_BIGCL) == PAGE_SIZE) {
2806 super_class = MC_BIGCL;
2807 } else {
2808 VERIFY(PAGE_SIZE == m_maxsize(MC_16KCL));
2809 super_class = MC_16KCL;
2810 }
2811 if (class == MC_MBUF && sp->sl_refcnt == 0 &&
2812 m_total(class) >= (m_minlimit(class) + NMBPG) &&
2813 m_total(super_class) < m_maxlimit(super_class)) {
2814 int i = NMBPG;
2815
2816 m_total(MC_MBUF) -= NMBPG;
2817 mbstat.m_mbufs = m_total(MC_MBUF);
2818 m_infree(MC_MBUF) -= NMBPG;
2819 mtype_stat_add(MT_FREE, -((unsigned)NMBPG));
2820
2821 while (i--) {
2822 struct mbuf *m = sp->sl_head;
2823 VERIFY(m != NULL);
2824 sp->sl_head = m->m_next;
2825 m->m_next = NULL;
2826 }
2827 reinit_supercl = true;
2828 } else if (class == MC_CL && sp->sl_refcnt == 0 &&
2829 m_total(class) >= (m_minlimit(class) + NCLPG) &&
2830 m_total(super_class) < m_maxlimit(super_class)) {
2831 int i = NCLPG;
2832
2833 m_total(MC_CL) -= NCLPG;
2834 mbstat.m_clusters = m_total(MC_CL);
2835 m_infree(MC_CL) -= NCLPG;
2836
2837 while (i--) {
2838 union mcluster *c = sp->sl_head;
2839 VERIFY(c != NULL);
2840 sp->sl_head = c->mcl_next;
2841 c->mcl_next = NULL;
2842 }
2843 reinit_supercl = true;
2844 } else if (class == MC_BIGCL && super_class != MC_BIGCL &&
2845 sp->sl_refcnt == 0 &&
2846 m_total(class) >= (m_minlimit(class) + NBCLPG) &&
2847 m_total(super_class) < m_maxlimit(super_class)) {
2848 int i = NBCLPG;
2849
2850 VERIFY(super_class == MC_16KCL);
2851 m_total(MC_BIGCL) -= NBCLPG;
2852 mbstat.m_bigclusters = m_total(MC_BIGCL);
2853 m_infree(MC_BIGCL) -= NBCLPG;
2854
2855 while (i--) {
2856 union mbigcluster *bc = sp->sl_head;
2857 VERIFY(bc != NULL);
2858 sp->sl_head = bc->mbc_next;
2859 bc->mbc_next = NULL;
2860 }
2861 reinit_supercl = true;
2862 }
2863
2864 if (reinit_supercl) {
2865 VERIFY(sp->sl_head == NULL);
2866 VERIFY(m_total(class) >= m_minlimit(class));
2867 slab_remove(sp, class);
2868
2869 /* Reinitialize it as a cluster for the super class */
2870 m_total(super_class)++;
2871 m_infree(super_class)++;
2872 VERIFY(sp->sl_flags == (SLF_MAPPED | SLF_DETACHED) &&
2873 sp->sl_len == PAGE_SIZE && sp->sl_refcnt == 0);
2874
2875 slab_init(sp, super_class, SLF_MAPPED, sp->sl_base,
2876 sp->sl_base, PAGE_SIZE, 0, 1);
2877 if (mclverify) {
2878 mcache_set_pattern(MCACHE_FREE_PATTERN,
2879 (caddr_t)sp->sl_base, sp->sl_len);
2880 }
2881 ((mcache_obj_t *)(sp->sl_base))->obj_next = NULL;
2882
2883 if (super_class == MC_BIGCL) {
2884 mbstat.m_bigclusters = m_total(MC_BIGCL);
2885 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
2886 m_infree(MC_MBUF_BIGCL);
2887 }
2888
2889 VERIFY(slab_is_detached(sp));
2890 VERIFY(m_total(super_class) <= m_maxlimit(super_class));
2891
2892 /* And finally switch class */
2893 class = super_class;
2894 }
2895
2896 /* Reinsert the slab to the class's slab list */
2897 if (slab_is_detached(sp)) {
2898 slab_insert(sp, class);
2899 }
2900
2901 /* We're done; let others enter */
2902 mb_clalloc_busy = FALSE;
2903 if (mb_clalloc_waiters > 0) {
2904 mb_clalloc_waiters = 0;
2905 wakeup(mb_clalloc_waitchan);
2906 }
2907}
2908
2909/*
2910 * Common allocator for rudimentary objects called by the CPU cache layer
2911 * during an allocation request whenever there is no available element in the
2912 * bucket layer. It returns one or more elements from the appropriate global
2913 * freelist. If the freelist is empty, it will attempt to populate it and
2914 * retry the allocation.
2915 */
2916static unsigned int
2917mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
2918{
2919 mbuf_class_t class = (mbuf_class_t)arg;
2920 unsigned int need = num;
2921 mcache_obj_t **list = *plist;
2922
2923 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2924 ASSERT(need > 0);
2925
2926 lck_mtx_lock(mbuf_mlock);
2927
2928 for (;;) {
2929 if ((*list = slab_alloc(class, wait)) != NULL) {
2930 (*list)->obj_next = NULL;
2931 list = *plist = &(*list)->obj_next;
2932
2933 if (--need == 0) {
2934 /*
2935 * If the number of elements in the freelist has
2936 * dropped below the low watermark, asynchronously
2937 * populate the freelist now rather than doing
2938 * it later when we run out of elements.
2939 */
2940 if (!mbuf_cached_above(class, wait) &&
2941 m_infree(class) < (m_total(class) >> 5)) {
2942 (void) freelist_populate(class, 1,
2943 M_DONTWAIT);
2944 }
2945 break;
2946 }
2947 } else {
2948 VERIFY(m_infree(class) == 0 || class == MC_CL);
2949
2950 (void) freelist_populate(class, 1,
2951 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
2952
2953 if (m_infree(class) > 0) {
2954 continue;
2955 }
2956
2957 /* Check if there's anything at the cache layer */
2958 if (mbuf_cached_above(class, wait)) {
2959 break;
2960 }
2961
2962 /* watchdog checkpoint */
2963 mbuf_watchdog();
2964
2965 /* We have nothing and cannot block; give up */
2966 if (wait & MCR_NOSLEEP) {
2967 if (!(wait & MCR_TRYHARD)) {
2968 m_fail_cnt(class)++;
2969 mbstat.m_drops++;
2970 break;
2971 }
2972 }
2973
2974 /*
2975 * If the freelist is still empty and the caller is
2976 * willing to be blocked, sleep on the wait channel
2977 * until an element is available. Otherwise, if
2978 * MCR_TRYHARD is set, do our best to satisfy the
2979 * request without having to go to sleep.
2980 */
2981 if (mbuf_worker_ready &&
2982 mbuf_sleep(class, need, wait)) {
2983 break;
2984 }
2985
2986 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2987 }
2988 }
2989
2990 m_alloc_cnt(class) += num - need;
2991 lck_mtx_unlock(mbuf_mlock);
2992
2993 return num - need;
2994}
2995
2996/*
2997 * Common de-allocator for rudimentary objects called by the CPU cache
2998 * layer when one or more elements need to be returned to the appropriate
2999 * global freelist.
3000 */
3001static void
3002mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
3003{
3004 mbuf_class_t class = (mbuf_class_t)arg;
3005 mcache_obj_t *nlist;
3006 unsigned int num = 0;
3007 int w;
3008
3009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3010
3011 lck_mtx_lock(mbuf_mlock);
3012
3013 for (;;) {
3014 nlist = list->obj_next;
3015 list->obj_next = NULL;
3016 slab_free(class, list);
3017 ++num;
3018 if ((list = nlist) == NULL) {
3019 break;
3020 }
3021 }
3022 m_free_cnt(class) += num;
3023
3024 if ((w = mb_waiters) > 0) {
3025 mb_waiters = 0;
3026 }
3027 if (w) {
3028 mbwdog_logger("waking up all threads");
3029 }
3030 lck_mtx_unlock(mbuf_mlock);
3031
3032 if (w != 0) {
3033 wakeup(mb_waitchan);
3034 }
3035}
3036
3037/*
3038 * Common auditor for rudimentary objects called by the CPU cache layer
3039 * during an allocation or free request. For the former, this is called
3040 * after the objects are obtained from either the bucket or slab layer
3041 * and before they are returned to the caller. For the latter, this is
3042 * called immediately during free and before placing the objects into
3043 * the bucket or slab layer.
3044 */
3045static void
3046mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3047{
3048 mbuf_class_t class = (mbuf_class_t)arg;
3049 mcache_audit_t *mca;
3050
3051 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
3052
3053 while (list != NULL) {
3054 lck_mtx_lock(mbuf_mlock);
3055 mca = mcl_audit_buf2mca(class, list);
3056
3057 /* Do the sanity checks */
3058 if (class == MC_MBUF) {
3059 mcl_audit_mbuf(mca, list, FALSE, alloc);
3060 ASSERT(mca->mca_uflags & MB_SCVALID);
3061 } else {
3062 mcl_audit_cluster(mca, list, m_maxsize(class),
3063 alloc, TRUE);
3064 ASSERT(!(mca->mca_uflags & MB_SCVALID));
3065 }
3066 /* Record this transaction */
3067 if (mcltrace) {
3068 mcache_buffer_log(mca, list, m_cache(class), &mb_start);
3069 }
3070
3071 if (alloc) {
3072 mca->mca_uflags |= MB_INUSE;
3073 } else {
3074 mca->mca_uflags &= ~MB_INUSE;
3075 }
3076 /* Unpair the object (unconditionally) */
3077 mca->mca_uptr = NULL;
3078 lck_mtx_unlock(mbuf_mlock);
3079
3080 list = list->obj_next;
3081 }
3082}
3083
3084/*
3085 * Common notify routine for all caches. It is called by mcache when
3086 * one or more objects get freed. We use this indication to trigger
3087 * the wakeup of any sleeping threads so that they can retry their
3088 * allocation requests.
3089 */
3090static void
3091mbuf_slab_notify(void *arg, u_int32_t reason)
3092{
3093 mbuf_class_t class = (mbuf_class_t)arg;
3094 int w;
3095
3096 ASSERT(MBUF_CLASS_VALID(class));
3097
3098 if (reason != MCN_RETRYALLOC) {
3099 return;
3100 }
3101
3102 lck_mtx_lock(mbuf_mlock);
3103 if ((w = mb_waiters) > 0) {
3104 m_notified(class)++;
3105 mb_waiters = 0;
3106 }
3107 if (w) {
3108 mbwdog_logger("waking up all threads");
3109 }
3110 lck_mtx_unlock(mbuf_mlock);
3111
3112 if (w != 0) {
3113 wakeup(mb_waitchan);
3114 }
3115}
3116
3117/*
3118 * Obtain object(s) from the composite class's freelist.
3119 */
3120static unsigned int
3121cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
3122{
3123 unsigned int need = num;
3124 mcl_slab_t *sp, *clsp, *nsp;
3125 struct mbuf *m;
3126 mcache_obj_t **list = *plist;
3127 void *cl;
3128
3129 VERIFY(need > 0);
3130 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3131 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3132
3133 /* Get what we can from the freelist */
3134 while ((*list = m_cobjlist(class)) != NULL) {
3135 MRANGE(*list);
3136
3137 m = (struct mbuf *)*list;
3138 sp = slab_get(m);
3139 cl = m->m_ext.ext_buf;
3140 clsp = slab_get(cl);
3141 VERIFY(m->m_flags == M_EXT && cl != NULL);
3142 VERIFY(m_get_rfa(m) != NULL && MBUF_IS_COMPOSITE(m));
3143
3144 if (class == MC_MBUF_CL) {
3145 VERIFY(clsp->sl_refcnt >= 1 &&
3146 clsp->sl_refcnt <= NCLPG);
3147 } else {
3148 VERIFY(clsp->sl_refcnt >= 1 &&
3149 clsp->sl_refcnt <= NBCLPG);
3150 }
3151
3152 if (class == MC_MBUF_16KCL) {
3153 int k;
3154 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3155 nsp = nsp->sl_next;
3156 /* Next slab must already be present */
3157 VERIFY(nsp != NULL);
3158 VERIFY(nsp->sl_refcnt == 1);
3159 }
3160 }
3161
3162 if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
3163 !MBUF_IN_MAP(m_cobjlist(class))) {
3164 slab_nextptr_panic(sp, m_cobjlist(class));
3165 /* NOTREACHED */
3166 }
3167 (*list)->obj_next = NULL;
3168 list = *plist = &(*list)->obj_next;
3169
3170 if (--need == 0) {
3171 break;
3172 }
3173 }
3174 m_infree(class) -= (num - need);
3175
3176 return num - need;
3177}
3178
3179/*
3180 * Place object(s) back into a composite class's freelist.
3181 */
3182static unsigned int
3183cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
3184{
3185 mcache_obj_t *o, *tail;
3186 unsigned int num = 0;
3187 struct mbuf *m, *ms;
3188 mcache_audit_t *mca = NULL;
3189 mcache_obj_t *ref_list = NULL;
3190 mcl_slab_t *clsp, *nsp;
3191 void *cl;
3192 mbuf_class_t cl_class;
3193
3194 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3195 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3196 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3197
3198 if (class == MC_MBUF_CL) {
3199 cl_class = MC_CL;
3200 } else if (class == MC_MBUF_BIGCL) {
3201 cl_class = MC_BIGCL;
3202 } else {
3203 VERIFY(class == MC_MBUF_16KCL);
3204 cl_class = MC_16KCL;
3205 }
3206
3207 o = tail = list;
3208
3209 while ((m = ms = (struct mbuf *)o) != NULL) {
3210 mcache_obj_t *rfa, *nexto = o->obj_next;
3211
3212 /* Do the mbuf sanity checks */
3213 if (mclaudit != NULL) {
3214 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3215 if (mclverify) {
3216 mcache_audit_free_verify(mca, m, 0,
3217 m_maxsize(MC_MBUF));
3218 }
3219 ms = MCA_SAVED_MBUF_PTR(mca);
3220 }
3221
3222 /* Do the cluster sanity checks */
3223 cl = ms->m_ext.ext_buf;
3224 clsp = slab_get(cl);
3225 if (mclverify) {
3226 size_t size = m_maxsize(cl_class);
3227 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
3228 (mcache_obj_t *)cl), cl, 0, size);
3229 }
3230 VERIFY(ms->m_type == MT_FREE);
3231 VERIFY(ms->m_flags == M_EXT);
3232 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3233 if (cl_class == MC_CL) {
3234 VERIFY(clsp->sl_refcnt >= 1 &&
3235 clsp->sl_refcnt <= NCLPG);
3236 } else {
3237 VERIFY(clsp->sl_refcnt >= 1 &&
3238 clsp->sl_refcnt <= NBCLPG);
3239 }
3240 if (cl_class == MC_16KCL) {
3241 int k;
3242 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3243 nsp = nsp->sl_next;
3244 /* Next slab must already be present */
3245 VERIFY(nsp != NULL);
3246 VERIFY(nsp->sl_refcnt == 1);
3247 }
3248 }
3249
3250 /*
3251 * If we're asked to purge, restore the actual mbuf using
3252 * contents of the shadow structure (if auditing is enabled)
3253 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
3254 * about to free it and the attached cluster into their caches.
3255 */
3256 if (purged) {
3257 /* Restore constructed mbuf fields */
3258 if (mclaudit != NULL) {
3259 mcl_audit_restore_mbuf(m, mca, TRUE);
3260 }
3261
3262 MEXT_MINREF(m) = 0;
3263 MEXT_REF(m) = 0;
3264 MEXT_PREF(m) = 0;
3265 MEXT_FLAGS(m) = 0;
3266 MEXT_PRIV(m) = 0;
3267 MEXT_PMBUF(m) = NULL;
3268 MEXT_TOKEN(m) = 0;
3269
3270 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
3271 m_set_ext(m, NULL, NULL, NULL);
3272 rfa->obj_next = ref_list;
3273 ref_list = rfa;
3274
3275 m->m_type = MT_FREE;
3276 m->m_flags = m->m_len = 0;
3277 m->m_next = m->m_nextpkt = NULL;
3278
3279 /* Save mbuf fields and make auditing happy */
3280 if (mclaudit != NULL) {
3281 mcl_audit_mbuf(mca, o, FALSE, FALSE);
3282 }
3283
3284 VERIFY(m_total(class) > 0);
3285 m_total(class)--;
3286
3287 /* Free the mbuf */
3288 o->obj_next = NULL;
3289 slab_free(MC_MBUF, o);
3290
3291 /* And free the cluster */
3292 ((mcache_obj_t *)cl)->obj_next = NULL;
3293 if (class == MC_MBUF_CL) {
3294 slab_free(MC_CL, cl);
3295 } else if (class == MC_MBUF_BIGCL) {
3296 slab_free(MC_BIGCL, cl);
3297 } else {
3298 slab_free(MC_16KCL, cl);
3299 }
3300 }
3301
3302 ++num;
3303 tail = o;
3304 o = nexto;
3305 }
3306
3307 if (!purged) {
3308 tail->obj_next = m_cobjlist(class);
3309 m_cobjlist(class) = list;
3310 m_infree(class) += num;
3311 } else if (ref_list != NULL) {
3312 mcache_free_ext(ref_cache, ref_list);
3313 }
3314
3315 return num;
3316}
3317
3318/*
3319 * Common allocator for composite objects called by the CPU cache layer
3320 * during an allocation request whenever there is no available element in
3321 * the bucket layer. It returns one or more composite elements from the
3322 * appropriate global freelist. If the freelist is empty, it will attempt
3323 * to obtain the rudimentary objects from their caches and construct them
3324 * into composite mbuf + cluster objects.
3325 */
3326static unsigned int
3327mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
3328 int wait)
3329{
3330 mbuf_class_t class = (mbuf_class_t)arg;
3331 mbuf_class_t cl_class = 0;
3332 unsigned int num = 0, cnum = 0, want = needed;
3333 mcache_obj_t *ref_list = NULL;
3334 mcache_obj_t *mp_list = NULL;
3335 mcache_obj_t *clp_list = NULL;
3336 mcache_obj_t **list;
3337 struct ext_ref *rfa;
3338 struct mbuf *m;
3339 void *cl;
3340
3341 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3342 ASSERT(needed > 0);
3343
3344 VERIFY(class != MC_MBUF_16KCL || njcl > 0);
3345
3346 /* There should not be any slab for this class */
3347 VERIFY(m_slab_cnt(class) == 0 &&
3348 m_slablist(class).tqh_first == NULL &&
3349 m_slablist(class).tqh_last == NULL);
3350
3351 lck_mtx_lock(mbuf_mlock);
3352
3353 /* Try using the freelist first */
3354 num = cslab_alloc(class, plist, needed);
3355 list = *plist;
3356 if (num == needed) {
3357 m_alloc_cnt(class) += num;
3358 lck_mtx_unlock(mbuf_mlock);
3359 return needed;
3360 }
3361
3362 lck_mtx_unlock(mbuf_mlock);
3363
3364 /*
3365 * We could not satisfy the request using the freelist alone;
3366 * allocate from the appropriate rudimentary caches and use
3367 * whatever we can get to construct the composite objects.
3368 */
3369 needed -= num;
3370
3371 /*
3372 * Mark these allocation requests as coming from a composite cache.
3373 * Also, if the caller is willing to be blocked, mark the request
3374 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
3375 * slab layer waiting for the individual object when one or more
3376 * of the already-constructed composite objects are available.
3377 */
3378 wait |= MCR_COMP;
3379 if (!(wait & MCR_NOSLEEP)) {
3380 wait |= MCR_FAILOK;
3381 }
3382
3383 /* allocate mbufs */
3384 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
3385 if (needed == 0) {
3386 ASSERT(mp_list == NULL);
3387 goto fail;
3388 }
3389
3390 /* allocate clusters */
3391 if (class == MC_MBUF_CL) {
3392 cl_class = MC_CL;
3393 } else if (class == MC_MBUF_BIGCL) {
3394 cl_class = MC_BIGCL;
3395 } else {
3396 VERIFY(class == MC_MBUF_16KCL);
3397 cl_class = MC_16KCL;
3398 }
3399 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
3400 if (needed == 0) {
3401 ASSERT(clp_list == NULL);
3402 goto fail;
3403 }
3404
3405 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
3406 if (needed == 0) {
3407 ASSERT(ref_list == NULL);
3408 goto fail;
3409 }
3410
3411 /*
3412 * By this time "needed" is MIN(mbuf, cluster, ref); any
3413 * leftovers will be freed before we return to the caller.
3414 */
3415 for (cnum = 0; cnum < needed; cnum++) {
3416 struct mbuf *ms;
3417
3418 m = ms = (struct mbuf *)mp_list;
3419 mp_list = mp_list->obj_next;
3420
3421 cl = clp_list;
3422 clp_list = clp_list->obj_next;
3423 ((mcache_obj_t *)cl)->obj_next = NULL;
3424
3425 rfa = (struct ext_ref *)ref_list;
3426 ref_list = ref_list->obj_next;
3427 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
3428
3429 /*
3430 * If auditing is enabled, construct the shadow mbuf
3431 * in the audit structure instead of in the actual one.
3432 * mbuf_cslab_audit() will take care of restoring the
3433 * contents after the integrity check.
3434 */
3435 if (mclaudit != NULL) {
3436 mcache_audit_t *mca, *cl_mca;
3437
3438 lck_mtx_lock(mbuf_mlock);
3439 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3440 ms = MCA_SAVED_MBUF_PTR(mca);
3441 cl_mca = mcl_audit_buf2mca(cl_class,
3442 (mcache_obj_t *)cl);
3443
3444 /*
3445 * Pair them up. Note that this is done at the time
3446 * the mbuf+cluster objects are constructed. This
3447 * information should be treated as a "best effort"
3448 * debugging hint, since more than one mbuf can refer
3449 * to a cluster. In that case, the cluster might not
3450 * be freed along with the mbuf it was paired with.
3451 */
3452 mca->mca_uptr = cl_mca;
3453 cl_mca->mca_uptr = mca;
3454
3455 ASSERT(mca->mca_uflags & MB_SCVALID);
3456 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
3457 lck_mtx_unlock(mbuf_mlock);
3458
3459 /* Technically, they are in the freelist */
3460 if (mclverify) {
3461 size_t size;
3462
3463 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
3464 m_maxsize(MC_MBUF));
3465
3466 if (class == MC_MBUF_CL) {
3467 size = m_maxsize(MC_CL);
3468 } else if (class == MC_MBUF_BIGCL) {
3469 size = m_maxsize(MC_BIGCL);
3470 } else {
3471 size = m_maxsize(MC_16KCL);
3472 }
3473
3474 mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
3475 size);
3476 }
3477 }
3478
3479 MBUF_INIT(ms, 0, MT_FREE);
3480 if (class == MC_MBUF_16KCL) {
3481 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3482 } else if (class == MC_MBUF_BIGCL) {
3483 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3484 } else {
3485 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
3486 }
3487 VERIFY(ms->m_flags == M_EXT);
3488 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3489
3490 *list = (mcache_obj_t *)m;
3491 (*list)->obj_next = NULL;
3492 list = *plist = &(*list)->obj_next;
3493 }
3494
3495fail:
3496 /*
3497 * Free up what's left of the above.
3498 */
3499 if (mp_list != NULL) {
3500 mcache_free_ext(m_cache(MC_MBUF), mp_list);
3501 }
3502 if (clp_list != NULL) {
3503 mcache_free_ext(m_cache(cl_class), clp_list);
3504 }
3505 if (ref_list != NULL) {
3506 mcache_free_ext(ref_cache, ref_list);
3507 }
3508
3509 lck_mtx_lock(mbuf_mlock);
3510 if (num > 0 || cnum > 0) {
3511 m_total(class) += cnum;
3512 VERIFY(m_total(class) <= m_maxlimit(class));
3513 m_alloc_cnt(class) += num + cnum;
3514 }
3515 if ((num + cnum) < want) {
3516 m_fail_cnt(class) += (want - (num + cnum));
3517 }
3518 lck_mtx_unlock(mbuf_mlock);
3519
3520 return num + cnum;
3521}
3522
3523/*
3524 * Common de-allocator for composite objects called by the CPU cache
3525 * layer when one or more elements need to be returned to the appropriate
3526 * global freelist.
3527 */
3528static void
3529mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
3530{
3531 mbuf_class_t class = (mbuf_class_t)arg;
3532 unsigned int num;
3533 int w;
3534
3535 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3536
3537 lck_mtx_lock(mbuf_mlock);
3538
3539 num = cslab_free(class, list, purged);
3540 m_free_cnt(class) += num;
3541
3542 if ((w = mb_waiters) > 0) {
3543 mb_waiters = 0;
3544 }
3545 if (w) {
3546 mbwdog_logger("waking up all threads");
3547 }
3548
3549 lck_mtx_unlock(mbuf_mlock);
3550
3551 if (w != 0) {
3552 wakeup(mb_waitchan);
3553 }
3554}
3555
3556/*
3557 * Common auditor for composite objects called by the CPU cache layer
3558 * during an allocation or free request. For the former, this is called
3559 * after the objects are obtained from either the bucket or slab layer
3560 * and before they are returned to the caller. For the latter, this is
3561 * called immediately during free and before placing the objects into
3562 * the bucket or slab layer.
3563 */
3564static void
3565mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
3566{
3567 mbuf_class_t class = (mbuf_class_t)arg, cl_class;
3568 mcache_audit_t *mca;
3569 struct mbuf *m, *ms;
3570 mcl_slab_t *clsp, *nsp;
3571 size_t cl_size;
3572 void *cl;
3573
3574 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
3575 if (class == MC_MBUF_CL) {
3576 cl_class = MC_CL;
3577 } else if (class == MC_MBUF_BIGCL) {
3578 cl_class = MC_BIGCL;
3579 } else {
3580 cl_class = MC_16KCL;
3581 }
3582 cl_size = m_maxsize(cl_class);
3583
3584 while ((m = ms = (struct mbuf *)list) != NULL) {
3585 lck_mtx_lock(mbuf_mlock);
3586 /* Do the mbuf sanity checks and record its transaction */
3587 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
3588 mcl_audit_mbuf(mca, m, TRUE, alloc);
3589 if (mcltrace) {
3590 mcache_buffer_log(mca, m, m_cache(class), &mb_start);
3591 }
3592
3593 if (alloc) {
3594 mca->mca_uflags |= MB_COMP_INUSE;
3595 } else {
3596 mca->mca_uflags &= ~MB_COMP_INUSE;
3597 }
3598
3599 /*
3600 * Use the shadow mbuf in the audit structure if we are
3601 * freeing, since the contents of the actual mbuf have been
3602 * pattern-filled by the above call to mcl_audit_mbuf().
3603 */
3604 if (!alloc && mclverify) {
3605 ms = MCA_SAVED_MBUF_PTR(mca);
3606 }
3607
3608 /* Do the cluster sanity checks and record its transaction */
3609 cl = ms->m_ext.ext_buf;
3610 clsp = slab_get(cl);
3611 VERIFY(ms->m_flags == M_EXT && cl != NULL);
3612 VERIFY(m_get_rfa(ms) != NULL && MBUF_IS_COMPOSITE(ms));
3613 if (class == MC_MBUF_CL) {
3614 VERIFY(clsp->sl_refcnt >= 1 &&
3615 clsp->sl_refcnt <= NCLPG);
3616 } else {
3617 VERIFY(clsp->sl_refcnt >= 1 &&
3618 clsp->sl_refcnt <= NBCLPG);
3619 }
3620
3621 if (class == MC_MBUF_16KCL) {
3622 int k;
3623 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
3624 nsp = nsp->sl_next;
3625 /* Next slab must already be present */
3626 VERIFY(nsp != NULL);
3627 VERIFY(nsp->sl_refcnt == 1);
3628 }
3629 }
3630
3631
3632 mca = mcl_audit_buf2mca(cl_class, cl);
3633 mcl_audit_cluster(mca, cl, cl_size, alloc, FALSE);
3634 if (mcltrace) {
3635 mcache_buffer_log(mca, cl, m_cache(class), &mb_start);
3636 }
3637
3638 if (alloc) {
3639 mca->mca_uflags |= MB_COMP_INUSE;
3640 } else {
3641 mca->mca_uflags &= ~MB_COMP_INUSE;
3642 }
3643 lck_mtx_unlock(mbuf_mlock);
3644
3645 list = list->obj_next;
3646 }
3647}
3648
3649static void
3650m_vm_error_stats(uint32_t *cnt, uint64_t *ts, uint64_t *size,
3651 uint64_t alloc_size, kern_return_t error)
3652{
3653 *cnt = *cnt + 1;
3654 *ts = net_uptime();
3655 if (size) {
3656 *size = alloc_size;
3657 }
3658 switch (error) {
3659 case KERN_SUCCESS:
3660 break;
3661 case KERN_INVALID_ARGUMENT:
3662 mb_kmem_stats[0]++;
3663 break;
3664 case KERN_INVALID_ADDRESS:
3665 mb_kmem_stats[1]++;
3666 break;
3667 case KERN_RESOURCE_SHORTAGE:
3668 mb_kmem_stats[2]++;
3669 break;
3670 case KERN_NO_SPACE:
3671 mb_kmem_stats[3]++;
3672 break;
3673 case KERN_FAILURE:
3674 mb_kmem_stats[4]++;
3675 break;
3676 default:
3677 mb_kmem_stats[5]++;
3678 break;
3679 }
3680}
3681
3682static vm_offset_t
3683kmem_mb_alloc(vm_map_t mbmap, int size, int physContig, kern_return_t *err)
3684{
3685 vm_offset_t addr = 0;
3686 kern_return_t kr = KERN_SUCCESS;
3687
3688 if (!physContig) {
3689 kr = kmem_alloc(mbmap, &addr, size,
3690 KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3691 } else {
3692 kr = kmem_alloc_contig(mbmap, &addr, size, PAGE_MASK, 0xfffff,
3693 0, KMA_KOBJECT | KMA_LOMEM, VM_KERN_MEMORY_MBUF);
3694 }
3695
3696 if (kr != KERN_SUCCESS) {
3697 addr = 0;
3698 }
3699 if (err) {
3700 *err = kr;
3701 }
3702
3703 return addr;
3704}
3705
3706/*
3707 * Allocate some number of mbuf clusters and place on cluster freelist.
3708 */
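/*
 * For illustration only: a request for "num" clusters of "bufsize" bytes
 * is first converted by m_howmany() into a page count "i", and the
 * backing memory is then grabbed in one shot, roughly:
 *
 *     size = round_page(i * bufsize);
 *     page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
 *
 * The real flow below additionally handles the physically contiguous
 * 16KB case and the single-page retry on failure.
 */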
3709static int
3710m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
3711{
3712 int i, count = 0;
3713 vm_size_t size = 0;
3714 int numpages = 0, large_buffer;
3715 vm_offset_t page = 0;
3716 mcache_audit_t *mca_list = NULL;
3717 mcache_obj_t *con_list = NULL;
3718 mcl_slab_t *sp;
3719 mbuf_class_t class;
3720 kern_return_t error;
3721
3722 /* Set if a buffer allocation requires multiple pages */
3723 large_buffer = ((bufsize == m_maxsize(MC_16KCL)) &&
3724 PAGE_SIZE < M16KCLBYTES);
3725 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
3726 bufsize == m_maxsize(MC_16KCL));
3727
3728 VERIFY((bufsize == PAGE_SIZE) ||
3729 (bufsize > PAGE_SIZE && bufsize == m_maxsize(MC_16KCL)));
3730
3731 if (bufsize == m_size(MC_BIGCL)) {
3732 class = MC_BIGCL;
3733 } else {
3734 class = MC_16KCL;
3735 }
3736
3737 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3738
3739 /*
3740 * Multiple threads may attempt to populate the cluster map one
3741 * after another. Since we drop the lock below prior to acquiring
3742 * the physical page(s), our view of the cluster map may no longer
3743 * be accurate, and we could end up over-committing the pages beyond
3744 * the maximum allowed for each class. To prevent it, this entire
3745 * operation (including the page mapping) is serialized.
3746 */
3747 while (mb_clalloc_busy) {
3748 mb_clalloc_waiters++;
3749 (void) msleep(mb_clalloc_waitchan, mbuf_mlock,
3750 (PZERO - 1), "m_clalloc", NULL);
3751 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3752 }
3753
3754 /* We are busy now; tell everyone else to go away */
3755 mb_clalloc_busy = TRUE;
3756
3757 /*
3758 * Honor the caller's wish to block or not block. We have a way
3759 * to grow the pool asynchronously using the mbuf worker thread.
3760 */
3761 i = m_howmany(num, bufsize);
3762 if (i <= 0 || (wait & M_DONTWAIT)) {
3763 goto out;
3764 }
3765
3766 lck_mtx_unlock(mbuf_mlock);
3767
3768 size = round_page(i * bufsize);
3769 page = kmem_mb_alloc(mb_map, size, large_buffer, &error);
3770
3771 /*
3772 * If we asked for "n" 16KB physically contiguous chunks
3773 * and didn't get them, retry without the contiguity
3774 * restriction.
3775 */
3776 net_update_uptime();
3777 if (large_buffer && page == 0) {
3778 m_vm_error_stats(&mb_kmem_contig_failed,
3779 &mb_kmem_contig_failed_ts,
3780 &mb_kmem_contig_failed_size,
3781 size, error);
3782 page = kmem_mb_alloc(mb_map, size, 0, &error);
3783 }
3784
3785 if (page == 0) {
3786 m_vm_error_stats(&mb_kmem_failed,
3787 &mb_kmem_failed_ts,
3788 &mb_kmem_failed_size,
3789 size, error);
3790#if PAGE_SIZE == 4096
3791 if (bufsize == m_maxsize(MC_BIGCL)) {
3792#else
3793 if (bufsize >= m_maxsize(MC_BIGCL)) {
3794#endif
3795 /* Try for 1 page if failed */
3796 size = PAGE_SIZE;
3797 page = kmem_mb_alloc(mb_map, size, 0, &error);
3798 if (page == 0) {
3799 m_vm_error_stats(&mb_kmem_one_failed,
3800 &mb_kmem_one_failed_ts,
3801 NULL, size, error);
3802 }
3803 }
3804
3805 if (page == 0) {
3806 lck_mtx_lock(mbuf_mlock);
3807 goto out;
3808 }
3809 }
3810
3811 VERIFY(IS_P2ALIGNED(page, PAGE_SIZE));
3812 numpages = size / PAGE_SIZE;
3813
3814 /* If auditing is enabled, allocate the audit structures now */
3815 if (mclaudit != NULL) {
3816 int needed;
3817
3818 /*
3819 * Yes, I realize this is a waste of memory for clusters
3820 * that never get transformed into mbufs, as we may end
3821 * up with NMBPG-1 unused audit structures per cluster.
3822 * But doing so tremendously simplifies the allocation
3823 * strategy, since at this point we are not holding the
3824 * mbuf lock and the caller is okay to be blocked.
3825 */
3826 if (bufsize == PAGE_SIZE) {
3827 needed = numpages * NMBPG;
3828
3829 i = mcache_alloc_ext(mcl_audit_con_cache,
3830 &con_list, needed, MCR_SLEEP);
3831
3832 VERIFY(con_list != NULL && i == needed);
3833 } else {
3834 /*
3835 * Multiple 4K pages are being used for each
3836 * 16K cluster in this configuration.
3837 */
3838 needed = numpages / NSLABSP16KB;
3839 }
3840
3841 i = mcache_alloc_ext(mcache_audit_cache,
3842 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
3843
3844 VERIFY(mca_list != NULL && i == needed);
3845 }
3846
3847 lck_mtx_lock(mbuf_mlock);
3848
3849 for (i = 0; i < numpages; i++, page += PAGE_SIZE) {
3850 ppnum_t offset =
3851 ((unsigned char *)page - mbutl) >> PAGE_SHIFT;
3852 ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
3853
3854 /*
3855 * If there is a mapper, the appropriate I/O page is
3856 * returned; zero out the page to discard its past
3857 * contents to prevent exposing leftover kernel memory.
3858 */
3859 VERIFY(offset < mcl_pages);
3860 if (mcl_paddr_base != 0) {
3861 bzero((void *)(uintptr_t) page, PAGE_SIZE);
3862 new_page = IOMapperInsertPage(mcl_paddr_base,
3863 offset, new_page);
3864 }
3865 mcl_paddr[offset] = new_page;
3866
3867 /* Pattern-fill this fresh page */
3868 if (mclverify) {
3869 mcache_set_pattern(MCACHE_FREE_PATTERN,
3870 (caddr_t)page, PAGE_SIZE);
3871 }
3872 if (bufsize == PAGE_SIZE) {
3873 mcache_obj_t *buf;
3874 /* One for the entire page */
3875 sp = slab_get((void *)page);
3876 if (mclaudit != NULL) {
3877 mcl_audit_init((void *)page,
3878 &mca_list, &con_list,
3879 AUDIT_CONTENTS_SIZE, NMBPG);
3880 }
3881 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3882 slab_init(sp, class, SLF_MAPPED, (void *)page,
3883 (void *)page, PAGE_SIZE, 0, 1);
3884 buf = (mcache_obj_t *)page;
3885 buf->obj_next = NULL;
3886
3887 /* Insert this slab */
3888 slab_insert(sp, class);
3889
3890 /* Update stats now since slab_get drops the lock */
3891 ++m_infree(class);
3892 ++m_total(class);
3893 VERIFY(m_total(class) <= m_maxlimit(class));
3894 if (class == MC_BIGCL) {
3895 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
3896 m_infree(MC_MBUF_BIGCL);
3897 mbstat.m_bigclusters = m_total(MC_BIGCL);
3898 }
3899 ++count;
3900 } else if ((bufsize > PAGE_SIZE) &&
3901 (i % NSLABSP16KB) == 0) {
3902 union m16kcluster *m16kcl = (union m16kcluster *)page;
3903 mcl_slab_t *nsp;
3904 int k;
3905
3906 /* One for the entire 16KB */
3907 sp = slab_get(m16kcl);
3908 if (mclaudit != NULL) {
3909 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
3910 }
3911
3912 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
3913 slab_init(sp, MC_16KCL, SLF_MAPPED,
3914 m16kcl, m16kcl, bufsize, 0, 1);
3915 m16kcl->m16kcl_next = NULL;
3916
3917 /*
3918 * 2nd-Nth page's slab is part of the first one,
3919 * where N is NSLABSP16KB.
3920 */
3921 for (k = 1; k < NSLABSP16KB; k++) {
3922 nsp = slab_get(((union mbigcluster *)page) + k);
3923 VERIFY(nsp->sl_refcnt == 0 &&
3924 nsp->sl_flags == 0);
3925 slab_init(nsp, MC_16KCL,
3926 SLF_MAPPED | SLF_PARTIAL,
3927 m16kcl, NULL, 0, 0, 0);
3928 }
3929 /* Insert this slab */
3930 slab_insert(sp, MC_16KCL);
3931
3932 /* Update stats now since slab_get drops the lock */
3933 ++m_infree(MC_16KCL);
3934 ++m_total(MC_16KCL);
3935 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3936 ++count;
3937 }
3938 }
3939 VERIFY(mca_list == NULL && con_list == NULL);
3940
3941 /* We're done; let others enter */
3942 mb_clalloc_busy = FALSE;
3943 if (mb_clalloc_waiters > 0) {
3944 mb_clalloc_waiters = 0;
3945 wakeup(mb_clalloc_waitchan);
3946 }
3947
3948 return count;
3949out:
3950 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3951
3952 mtracelarge_register(size);
3953
3954 /* We're done; let others enter */
3955 mb_clalloc_busy = FALSE;
3956 if (mb_clalloc_waiters > 0) {
3957 mb_clalloc_waiters = 0;
3958 wakeup(mb_clalloc_waitchan);
3959 }
3960
3961 /*
3962 * When non-blocking, we kick the worker thread if we have to grow the
3963 * pool or if the number of free clusters is less than requested.
3964 */
3965 if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
3966 mbwdog_logger("waking up the worker thread to to grow %s by %d",
3967 m_cname(class), i);
3968 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
3969 mbuf_worker_needs_wakeup = FALSE;
3970 }
3971 if (class == MC_BIGCL) {
3972 if (i > 0) {
3973 /*
3974 * Remember total number of 4KB clusters needed
3975 * at this time.
3976 */
3977 i += m_total(MC_BIGCL);
3978 if (i > m_region_expand(MC_BIGCL)) {
3979 m_region_expand(MC_BIGCL) = i;
3980 }
3981 }
3982 if (m_infree(MC_BIGCL) >= num) {
3983 return 1;
3984 }
3985 } else {
3986 if (i > 0) {
3987 /*
3988 * Remember total number of 16KB clusters needed
3989 * at this time.
3990 */
3991 i += m_total(MC_16KCL);
3992 if (i > m_region_expand(MC_16KCL)) {
3993 m_region_expand(MC_16KCL) = i;
3994 }
3995 }
3996 if (m_infree(MC_16KCL) >= num) {
3997 return 1;
3998 }
3999 }
4000 return 0;
4001}
4002
4003/*
4004 * Populate the global freelist of the corresponding buffer class.
4005 */
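/*
 * Worked example (sketch only, assuming 4KB pages): populating MC_CL
 * carves each page into PAGE_SIZE / m_maxsize(MC_CL) == 2 clusters
 * (NCLPG), while populating MC_MBUF yields NMBPG mbufs per page. The
 * page is first accounted against the "super class" (MC_BIGCL in this
 * configuration) and then re-credited to the target class, as done in
 * the loop below.
 */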
4006static int
4007freelist_populate(mbuf_class_t class, unsigned int num, int wait)
4008{
4009 mcache_obj_t *o = NULL;
4010 int i, numpages = 0, count;
4011 mbuf_class_t super_class;
4012
4013 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
4014 class == MC_16KCL);
4015
4016 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4017
4018 VERIFY(PAGE_SIZE == m_maxsize(MC_BIGCL) ||
4019 PAGE_SIZE == m_maxsize(MC_16KCL));
4020
4021 if (m_maxsize(class) >= PAGE_SIZE) {
4022 return m_clalloc(num, wait, m_maxsize(class)) != 0;
4023 }
4024
4025 /*
4026 * The rest of this function allocates pages and slices them
4027 * up into objects of the requested size.
4028 */
4029
4030 numpages = (num * m_size(class) + PAGE_SIZE - 1) / PAGE_SIZE;
4031
4032 /* Currently assume that pages are 4K or 16K */
4033 if (PAGE_SIZE == m_maxsize(MC_BIGCL)) {
4034 super_class = MC_BIGCL;
4035 } else {
4036 super_class = MC_16KCL;
4037 }
4038
4039 i = m_clalloc(numpages, wait, m_maxsize(super_class));
4040
4041 /* how many objects will we cut the page into? */
4042 int numobj = PAGE_SIZE / m_maxsize(class);
4043
4044 for (count = 0; count < numpages; count++) {
4045 /* respect totals, minlimit, maxlimit */
4046 if (m_total(super_class) <= m_minlimit(super_class) ||
4047 m_total(class) >= m_maxlimit(class)) {
4048 break;
4049 }
4050
4051 if ((o = slab_alloc(super_class, wait)) == NULL) {
4052 break;
4053 }
4054
4055 struct mbuf *m = (struct mbuf *)o;
4056 union mcluster *c = (union mcluster *)o;
4057 union mbigcluster *mbc = (union mbigcluster *)o;
4058 mcl_slab_t *sp = slab_get(o);
4059 mcache_audit_t *mca = NULL;
4060
4061 /*
4062 * Since one full page will be converted to MC_MBUF or
4063 * MC_CL objects, verify that the reference count matches
4064 * that assumption.
4065 */
4066 VERIFY(sp->sl_refcnt == 1 && slab_is_detached(sp));
4067 VERIFY((sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
4068 /*
4069 * Make sure that the cluster is unmolested
4070 * while in the freelist.
4071 */
4072 if (mclverify) {
4073 mca = mcl_audit_buf2mca(super_class,
4074 (mcache_obj_t *)o);
4075 mcache_audit_free_verify(mca,
4076 (mcache_obj_t *)o, 0, m_maxsize(super_class));
4077 }
4078
4079 /* Reinitialize it as an mbuf or 2K or 4K slab */
4080 slab_init(sp, class, sp->sl_flags,
4081 sp->sl_base, NULL, PAGE_SIZE, 0, numobj);
4082
4083 VERIFY(sp->sl_head == NULL);
4084
4085 VERIFY(m_total(super_class) >= 1);
4086 m_total(super_class)--;
4087
4088 if (super_class == MC_BIGCL) {
4089 mbstat.m_bigclusters = m_total(MC_BIGCL);
4090 }
4091
4092 m_total(class) += numobj;
4093 VERIFY(m_total(class) <= m_maxlimit(class));
4094 m_infree(class) += numobj;
4095
4096 i = numobj;
4097 if (class == MC_MBUF) {
4098 mbstat.m_mbufs = m_total(MC_MBUF);
4099 mtype_stat_add(MT_FREE, NMBPG);
4100 while (i--) {
4101 /*
4102 * If auditing is enabled, construct the
4103 * shadow mbuf in the audit structure
4104 * instead of the actual one.
4105 * mbuf_slab_audit() will take care of
4106 * restoring the contents after the
4107 * integrity check.
4108 */
4109 if (mclaudit != NULL) {
4110 struct mbuf *ms;
4111 mca = mcl_audit_buf2mca(MC_MBUF,
4112 (mcache_obj_t *)m);
4113 ms = MCA_SAVED_MBUF_PTR(mca);
4114 ms->m_type = MT_FREE;
4115 } else {
4116 m->m_type = MT_FREE;
4117 }
4118 m->m_next = sp->sl_head;
4119 sp->sl_head = (void *)m++;
4120 }
4121 } else if (class == MC_CL) { /* MC_CL */
4122 mbstat.m_clfree =
4123 m_infree(MC_CL) + m_infree(MC_MBUF_CL);
4124 mbstat.m_clusters = m_total(MC_CL);
4125 while (i--) {
4126 c->mcl_next = sp->sl_head;
4127 sp->sl_head = (void *)c++;
4128 }
4129 } else {
4130 VERIFY(class == MC_BIGCL);
4131 mbstat.m_bigclusters = m_total(MC_BIGCL);
4132 mbstat.m_bigclfree = m_infree(MC_BIGCL) +
4133 m_infree(MC_MBUF_BIGCL);
4134 while (i--) {
4135 mbc->mbc_next = sp->sl_head;
4136 sp->sl_head = (void *)mbc++;
4137 }
4138 }
4139
4140 /* Insert into the mbuf or 2k or 4k slab list */
4141 slab_insert(sp, class);
4142
4143 if ((i = mb_waiters) > 0) {
4144 mb_waiters = 0;
4145 }
4146 if (i != 0) {
4147 mbwdog_logger("waking up all threads");
4148 wakeup(mb_waitchan);
4149 }
4150 }
4151 return count != 0;
4152}
4153
4154/*
4155 * For each class, initialize the freelist to hold m_minlimit() objects.
4156 */
4157static void
4158freelist_init(mbuf_class_t class)
4159{
4160 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4161
4162 VERIFY(class == MC_CL || class == MC_BIGCL);
4163 VERIFY(m_total(class) == 0);
4164 VERIFY(m_minlimit(class) > 0);
4165
4166 while (m_total(class) < m_minlimit(class)) {
4167 (void) freelist_populate(class, m_minlimit(class), M_WAIT);
4168 }
4169
4170 VERIFY(m_total(class) >= m_minlimit(class));
4171}
4172
4173/*
4174 * (Inaccurately) check if it might be worth a trip back to the
4175 * mcache layer due to the availability of objects there. We'll
4176 * end up back here if there's nothing up there.
4177 */
4178static boolean_t
4179mbuf_cached_above(mbuf_class_t class, int wait)
4180{
4181 switch (class) {
4182 case MC_MBUF:
4183 if (wait & MCR_COMP) {
4184 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
4185 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4186 }
4187 break;
4188
4189 case MC_CL:
4190 if (wait & MCR_COMP) {
4191 return !mcache_bkt_isempty(m_cache(MC_MBUF_CL));
4192 }
4193 break;
4194
4195 case MC_BIGCL:
4196 if (wait & MCR_COMP) {
4197 return !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL));
4198 }
4199 break;
4200
4201 case MC_16KCL:
4202 if (wait & MCR_COMP) {
4203 return !mcache_bkt_isempty(m_cache(MC_MBUF_16KCL));
4204 }
4205 break;
4206
4207 case MC_MBUF_CL:
4208 case MC_MBUF_BIGCL:
4209 case MC_MBUF_16KCL:
4210 break;
4211
4212 default:
4213 VERIFY(0);
4214 /* NOTREACHED */
4215 }
4216
4217 return !mcache_bkt_isempty(m_cache(class));
4218}
4219
4220/*
4221 * If possible, convert constructed objects to raw ones.
4222 */
4223static boolean_t
4224mbuf_steal(mbuf_class_t class, unsigned int num)
4225{
4226 mcache_obj_t *top = NULL;
4227 mcache_obj_t **list = &top;
4228 unsigned int tot = 0;
4229
4230 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4231
4232 switch (class) {
4233 case MC_MBUF:
4234 case MC_CL:
4235 case MC_BIGCL:
4236 case MC_16KCL:
4237 return FALSE;
4238
4239 case MC_MBUF_CL:
4240 case MC_MBUF_BIGCL:
4241 case MC_MBUF_16KCL:
4242 /* Get the required number of constructed objects if possible */
4243 if (m_infree(class) > m_minlimit(class)) {
4244 tot = cslab_alloc(class, &list,
4245 MIN(num, m_infree(class)));
4246 }
4247
4248 /* And destroy them to get back the raw objects */
4249 if (top != NULL) {
4250 (void) cslab_free(class, top, 1);
4251 }
4252 break;
4253
4254 default:
4255 VERIFY(0);
4256 /* NOTREACHED */
4257 }
4258
4259 return tot == num;
4260}
4261
4262static void
4263m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
4264{
4265 int m, bmap = 0;
4266
4267 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
4268
4269 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
4270 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
4271 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
4272
4273 /*
4274 * This logic can be made smarter; for now, simply mark
4275 * all other related classes as potential victims.
4276 */
4277 switch (class) {
4278 case MC_MBUF:
4279 m_wantpurge(MC_CL)++;
4280 m_wantpurge(MC_BIGCL)++;
4281 m_wantpurge(MC_MBUF_CL)++;
4282 m_wantpurge(MC_MBUF_BIGCL)++;
4283 break;
4284
4285 case MC_CL:
4286 m_wantpurge(MC_MBUF)++;
4287 m_wantpurge(MC_BIGCL)++;
4288 m_wantpurge(MC_MBUF_BIGCL)++;
4289 if (!comp) {
4290 m_wantpurge(MC_MBUF_CL)++;
4291 }
4292 break;
4293
4294 case MC_BIGCL:
4295 m_wantpurge(MC_MBUF)++;
4296 m_wantpurge(MC_CL)++;
4297 m_wantpurge(MC_MBUF_CL)++;
4298 if (!comp) {
4299 m_wantpurge(MC_MBUF_BIGCL)++;
4300 }
4301 break;
4302
4303 case MC_16KCL:
4304 if (!comp) {
4305 m_wantpurge(MC_MBUF_16KCL)++;
4306 }
4307 break;
4308
4309 default:
4310 VERIFY(0);
4311 /* NOTREACHED */
4312 }
4313
4314 /*
4315 * Run through each marked class and check if we really need to
4316 * purge (and therefore temporarily disable) the per-CPU caches
4317 * layer used by the class. If so, remember the classes since
4318 * we are going to drop the lock below prior to purging.
4319 */
4320 for (m = 0; m < NELEM(mbuf_table); m++) {
4321 if (m_wantpurge(m) > 0) {
4322 m_wantpurge(m) = 0;
4323 /*
4324 * Try hard to steal the required number of objects
4325 * from the freelist of other mbuf classes. Only
4326 * purge and disable the per-CPU caches layer when
4327 * we don't have enough; it's the last resort.
4328 */
4329 if (!mbuf_steal(m, num)) {
4330 bmap |= (1 << m);
4331 }
4332 }
4333 }
4334
4335 lck_mtx_unlock(mbuf_mlock);
4336
4337 if (bmap != 0) {
4338 /* signal the domains to drain */
4339 net_drain_domains();
4340
4341 /* Sigh; we have no other choices but to ask mcache to purge */
4342 for (m = 0; m < NELEM(mbuf_table); m++) {
4343 if ((bmap & (1 << m)) &&
4344 mcache_purge_cache(m_cache(m), TRUE)) {
4345 lck_mtx_lock(mbuf_mlock);
4346 m_purge_cnt(m)++;
4347 mbstat.m_drain++;
4348 lck_mtx_unlock(mbuf_mlock);
4349 }
4350 }
4351 } else {
4352 /*
4353 * Request mcache to reap extra elements from all of its caches;
4354 * note that all reaps are serialized and happen only at a fixed
4355 * interval.
4356 */
4357 mcache_reap();
4358 }
4359 lck_mtx_lock(mbuf_mlock);
4360}
4361#endif /* CONFIG_MBUF_MCACHE */
4362
4363static inline struct mbuf *
4364m_get_common(int wait, short type, int hdr)
4365{
4366 struct mbuf *m;
4367
4368#if CONFIG_MBUF_MCACHE
4369 int mcflags = MSLEEPF(wait);
4370
4371 /* Is this due to a non-blocking retry? If so, then try harder */
4372 if (mcflags & MCR_NOSLEEP) {
4373 mcflags |= MCR_TRYHARD;
4374 }
4375
4376 m = mcache_alloc(m_cache(MC_MBUF), mcflags);
4377#else
4378 m = mz_alloc(wait);
4379#endif /* CONFIG_MBUF_MCACHE */
4380 if (m != NULL) {
4381 MBUF_INIT(m, hdr, type);
4382 mtype_stat_inc(type);
4383 mtype_stat_dec(MT_FREE);
4384 }
4385 return m;
4386}
4387
4388/*
4389 * Space allocation routines; these are also available as macros
4390 * for critical paths.
4391 */
4392#define _M_GET(wait, type) m_get_common(wait, type, 0)
4393#define _M_GETHDR(wait, type) m_get_common(wait, type, 1)
4394#define _M_RETRY(wait, type) _M_GET(wait, type)
4395#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type)
4396#define _MGET(m, how, type) ((m) = _M_GET(how, type))
4397#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type))
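/*
 * Typical usage of the allocators below (illustrative only):
 *
 *     struct mbuf *m = m_gethdr(M_DONTWAIT, MT_DATA);
 *     if (m == NULL)
 *             return ENOBUFS;
 *
 * m_get() returns a plain mbuf and m_gethdr() one with a packet header;
 * both may return NULL when M_DONTWAIT is passed and memory is tight.
 */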
4398
4399struct mbuf *
4400m_get(int wait, int type)
4401{
4402 return _M_GET(wait, type);
4403}
4404
4405struct mbuf *
4406m_gethdr(int wait, int type)
4407{
4408 return _M_GETHDR(wait, type);
4409}
4410
4411struct mbuf *
4412m_retry(int wait, int type)
4413{
4414 return _M_RETRY(wait, type);
4415}
4416
4417struct mbuf *
4418m_retryhdr(int wait, int type)
4419{
4420 return _M_RETRYHDR(wait, type);
4421}
4422
4423struct mbuf *
4424m_getclr(int wait, int type)
4425{
4426 struct mbuf *m;
4427
4428 _MGET(m, wait, type);
4429 if (m != NULL) {
4430 bzero(MTOD(m, caddr_t), MLEN);
4431 }
4432 return m;
4433}
4434
4435static int
4436m_free_paired(struct mbuf *m)
4437{
4438 VERIFY((m->m_flags & M_EXT) && (MEXT_FLAGS(m) & EXTF_PAIRED));
4439
4440 os_atomic_thread_fence(seq_cst);
4441 if (MEXT_PMBUF(m) == m) {
4442 /*
4443 * The paired ref count might be negative if we lose a race
4444 * against another thread clearing MEXT_PMBUF after the
4445 * memory barrier sync above. In that case just ignore it,
4446 * as things have already been unpaired.
4447 */
4448 int16_t prefcnt = os_atomic_dec(&MEXT_PREF(m), acq_rel);
4449 if (prefcnt > 1) {
4450 return 1;
4451 } else if (prefcnt == 1) {
4452 m_ext_free_func_t m_free_func = m_get_ext_free(m);
4453 VERIFY(m_free_func != NULL);
4454 (*m_free_func)(m->m_ext.ext_buf,
4455 m->m_ext.ext_size, m_get_ext_arg(m));
4456 return 1;
4457 } else if (prefcnt == 0) {
4458 VERIFY(MBUF_IS_PAIRED(m));
4459
4460 /*
4461 * Restore minref to its natural value, so that
4462 * the caller will be able to free the cluster
4463 * as appropriate.
4464 */
4465 MEXT_MINREF(m) = 0;
4466
4467 /*
4468 * Clear MEXT_PMBUF, but leave EXTF_PAIRED intact
4469 * as it is immutable. The release store below also
4470 * provides the memory barrier sync.
4471 */
4472 os_atomic_store(&MEXT_PMBUF(m), NULL, release);
4473
4474 switch (m->m_ext.ext_size) {
4475 case MCLBYTES:
4476 m_set_ext(m, m_get_rfa(m), NULL, NULL);
4477 break;
4478
4479 case MBIGCLBYTES:
4480 m_set_ext(m, m_get_rfa(m), m_bigfree, NULL);
4481 break;
4482
4483 case M16KCLBYTES:
4484 m_set_ext(m, m_get_rfa(m), m_16kfree, NULL);
4485 break;
4486
4487 default:
4488 VERIFY(0);
4489 /* NOTREACHED */
4490 }
4491 }
4492 }
4493
4494 /*
4495 * Tell caller the unpair has occurred, and that the reference
4496 * count on the external cluster held for the paired mbuf should
4497 * now be dropped.
4498 */
4499 return 0;
4500}
4501
4502struct mbuf *
4503m_free(struct mbuf *m)
4504{
4505 struct mbuf *n = m->m_next;
4506
4507 if (m->m_type == MT_FREE) {
4508 panic("m_free: freeing an already freed mbuf");
4509 }
4510
4511 if (m->m_flags & M_PKTHDR) {
4512 /* Check for scratch area overflow */
4513 m_redzone_verify(m);
4514 /* Free the aux data and tags if there is any */
4515 m_tag_delete_chain(m);
4516
4517 m_do_tx_compl_callback(m, NULL);
4518 }
4519
4520 if (m->m_flags & M_EXT) {
4521 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
4522 return n;
4523 }
4524 /*
4525 * Make sure that we don't touch any ext_ref
4526 * member after we decrement the reference count
4527 * since that may lead to use-after-free
4528 * when we do not hold the last reference.
4529 */
4530 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
4531 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
4532 const uint16_t minref = MEXT_MINREF(m);
4533 const uint16_t refcnt = m_decref(m);
4534
4535 if (refcnt == minref && !composite) {
4536#if CONFIG_MBUF_MCACHE
4537 if (m_free_func == NULL) {
4538 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
4539 } else if (m_free_func == m_bigfree) {
4540 mcache_free(m_cache(MC_BIGCL),
4541 m->m_ext.ext_buf);
4542 } else if (m_free_func == m_16kfree) {
4543 mcache_free(m_cache(MC_16KCL),
4544 m->m_ext.ext_buf);
4545 } else {
4546 (*m_free_func)(m->m_ext.ext_buf,
4547 m->m_ext.ext_size, m_get_ext_arg(m));
4548 }
4549 mcache_free(ref_cache, m_get_rfa(m));
4550#else
4551 if (m_free_func == NULL) {
4552 mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
4553 } else if (m_free_func == m_bigfree) {
4554 mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
4555 } else if (m_free_func == m_16kfree) {
4556 mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
4557 } else {
4558 (*m_free_func)(m->m_ext.ext_buf,
4559 m->m_ext.ext_size, m_get_ext_arg(m));
4560 }
4561 mz_ref_free(m_get_rfa(m));
4562#endif /* CONFIG_MBUF_MCACHE */
4563 m_set_ext(m, NULL, NULL, NULL);
4564 } else if (refcnt == minref && composite) {
4565 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
4566
4567 mtype_stat_dec(m->m_type);
4568 mtype_stat_inc(MT_FREE);
4569
4570 m->m_type = MT_FREE;
4571 m->m_flags = M_EXT;
4572 m->m_len = 0;
4573 m->m_next = m->m_nextpkt = NULL;
4574 /*
4575 * MEXT_FLAGS is safe to access here
4576 * since we are now sure that we held
4577 * the last reference to ext_ref.
4578 */
4579 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4580
4581#if CONFIG_MBUF_MCACHE
4582 /* "Free" into the intermediate cache */
4583 if (m_free_func == NULL) {
4584 mcache_free(m_cache(MC_MBUF_CL), m);
4585 } else if (m_free_func == m_bigfree) {
4586 mcache_free(m_cache(MC_MBUF_BIGCL), m);
4587 } else {
4588 VERIFY(m_free_func == m_16kfree);
4589 mcache_free(m_cache(MC_MBUF_16KCL), m);
4590 }
4591#else
4592 /* "Free" into the intermediate cache */
4593 if (m_free_func == NULL) {
4594 mz_composite_free(MC_MBUF_CL, m);
4595 } else if (m_free_func == m_bigfree) {
4596 mz_composite_free(MC_MBUF_BIGCL, m);
4597 } else {
4598 VERIFY(m_free_func == m_16kfree);
4599 mz_composite_free(MC_MBUF_16KCL, m);
4600 }
4601#endif /* CONFIG_MBUF_MCACHE */
4602 return n;
4603 }
4604 }
4605
4606 mtype_stat_dec(m->m_type);
4607 mtype_stat_inc(MT_FREE);
4608
4609 m->m_type = MT_FREE;
4610 m->m_flags = m->m_len = 0;
4611 m->m_next = m->m_nextpkt = NULL;
4612
4613#if CONFIG_MBUF_MCACHE
4614 mcache_free(m_cache(MC_MBUF), m);
4615#else
4616 mz_free(m);
4617#endif /* CONFIG_MBUF_MCACHE */
4618
4619 return n;
4620}
4621
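/*
 * m_clattach() attaches a caller-supplied external buffer to an mbuf
 * (allocating the mbuf and the ext_ref as needed) and registers a free
 * routine invoked when the last reference goes away. A minimal sketch,
 * with a hypothetical driver buffer and free routine:
 *
 *     static void my_extfree(caddr_t buf, u_int size, caddr_t arg);
 *
 *     m = m_clattach(NULL, MT_DATA, (caddr_t)my_buf, my_extfree,
 *         my_buf_size, NULL, M_DONTWAIT, 0);
 *
 * Passing pair != 0 instead creates a paired mbuf/cluster (EXTF_PAIRED);
 * see m_free_paired() above.
 */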
4622__private_extern__ struct mbuf *
4623m_clattach(struct mbuf *m, int type, caddr_t extbuf,
4624 void (*extfree)(caddr_t, u_int, caddr_t), size_t extsize, caddr_t extarg,
4625 int wait, int pair)
4626{
4627 struct ext_ref *rfa = NULL;
4628
4629 /*
4630 * If pairing is requested and an existing mbuf is provided, reject
4631 * it if it's already been paired to another cluster. Otherwise,
4632 * allocate a new one or free any existing below.
4633 */
4634 if ((m != NULL && MBUF_IS_PAIRED(m)) ||
4635 (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)) {
4636 return NULL;
4637 }
4638
4639 if (m->m_flags & M_EXT) {
4640 /*
4641 * Make sure that we don't touch any ext_ref
4642 * member after we decrement the reference count
4643 * since that may lead to use-after-free
4644 * when we do not hold the last reference.
4645 */
4646 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
4647 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED) && MEXT_PMBUF(m) == NULL);
4648 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
4649 const uint16_t minref = MEXT_MINREF(m);
4650 const uint16_t refcnt = m_decref(m);
4651
4652 if (refcnt == minref && !composite) {
4653#if CONFIG_MBUF_MCACHE
4654 if (m_free_func == NULL) {
4655 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
4656 } else if (m_free_func == m_bigfree) {
4657 mcache_free(m_cache(MC_BIGCL),
4658 m->m_ext.ext_buf);
4659 } else if (m_free_func == m_16kfree) {
4660 mcache_free(m_cache(MC_16KCL),
4661 m->m_ext.ext_buf);
4662 } else {
4663 (*m_free_func)(m->m_ext.ext_buf,
4664 m->m_ext.ext_size, m_get_ext_arg(m));
4665 }
4666#else
4667 if (m_free_func == NULL) {
4668 mz_cl_free(ZONE_ID_CLUSTER_2K, m->m_ext.ext_buf);
4669 } else if (m_free_func == m_bigfree) {
4670 mz_cl_free(ZONE_ID_CLUSTER_4K, m->m_ext.ext_buf);
4671 } else if (m_free_func == m_16kfree) {
4672 mz_cl_free(ZONE_ID_CLUSTER_16K, m->m_ext.ext_buf);
4673 } else {
4674 (*m_free_func)(m->m_ext.ext_buf,
4675 m->m_ext.ext_size, m_get_ext_arg(m));
4676 }
4677#endif /* CONFIG_MBUF_MCACHE */
4678 /* Re-use the reference structure */
4679 rfa = m_get_rfa(m);
4680 } else if (refcnt == minref && composite) {
4681 VERIFY(m->m_type != MT_FREE);
4682
4683 mtype_stat_dec(m->m_type);
4684 mtype_stat_inc(MT_FREE);
4685
4686 m->m_type = MT_FREE;
4687 m->m_flags = M_EXT;
4688 m->m_len = 0;
4689 m->m_next = m->m_nextpkt = NULL;
4690
4691 /*
4692 * MEXT_FLAGS is safe to access here
4693 * since we are now sure that we held
4694 * the last reference to ext_ref.
4695 */
4696 MEXT_FLAGS(m) &= ~EXTF_READONLY;
4697
4698 /* "Free" into the intermediate cache */
4699#if CONFIG_MBUF_MCACHE
4700 if (m_free_func == NULL) {
4701 mcache_free(m_cache(MC_MBUF_CL), m);
4702 } else if (m_free_func == m_bigfree) {
4703 mcache_free(m_cache(MC_MBUF_BIGCL), m);
4704 } else {
4705 VERIFY(m_free_func == m_16kfree);
4706 mcache_free(m_cache(MC_MBUF_16KCL), m);
4707 }
4708#else
4709 if (m_free_func == NULL) {
4710 mz_composite_free(MC_MBUF_CL, m);
4711 } else if (m_free_func == m_bigfree) {
4712 mz_composite_free(MC_MBUF_BIGCL, m);
4713 } else {
4714 VERIFY(m_free_func == m_16kfree);
4715 mz_composite_free(MC_MBUF_16KCL, m);
4716 }
4717#endif /* CONFIG_MBUF_MCACHE */
4718 /*
4719 * Allocate a new mbuf, since we didn't divorce
4720 * the composite mbuf + cluster pair above.
4721 */
4722 if ((m = _M_GETHDR(wait, type)) == NULL) {
4723 return NULL;
4724 }
4725 }
4726 }
4727
4728#if CONFIG_MBUF_MCACHE
4729 if (rfa == NULL &&
4730 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4731 m_free(m);
4732 return NULL;
4733 }
4734#else
4735 if (rfa == NULL &&
4736 (rfa = mz_ref_alloc(wait)) == NULL) {
4737 m_free(m);
4738 return NULL;
4739 }
4740#endif /* CONFIG_MBUF_MCACHE */
4741
4742 if (!pair) {
4743 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa,
4744 0, 1, 0, 0, 0, NULL);
4745 } else {
4746 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
4747 1, 1, 1, EXTF_PAIRED, 0, m);
4748 }
4749
4750 return m;
4751}
4752
4753/*
4754 * Perform `fast' allocation of mbuf clusters from a cache of recently-freed
4755 * clusters. (If the cache is empty, new clusters are allocated en masse.)
4756 */
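/*
 * For example (illustrative only), a caller wanting a packet-header
 * mbuf with a 2KB cluster already attached might do:
 *
 *     struct mbuf *m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
 *
 * which draws a composite mbuf+cluster element in one shot rather than
 * calling m_gethdr() followed by m_mclget().
 */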
4757struct mbuf *
4758m_getcl(int wait, int type, int flags)
4759{
4760 struct mbuf *m = NULL;
4761 int hdr = (flags & M_PKTHDR);
4762
4763#if CONFIG_MBUF_MCACHE
4764 int mcflags = MSLEEPF(wait);
4765
4766 /* Is this due to a non-blocking retry? If so, then try harder */
4767 if (mcflags & MCR_NOSLEEP) {
4768 mcflags |= MCR_TRYHARD;
4769 }
4770
4771 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
4772#else
4773 m = mz_composite_alloc(MC_MBUF_CL, wait);
4774#endif /* CONFIG_MBUF_MCACHE */
4775 if (m != NULL) {
4776 u_int16_t flag;
4777 struct ext_ref *rfa;
4778 void *cl;
4779
4780 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
4781 cl = m->m_ext.ext_buf;
4782 rfa = m_get_rfa(m);
4783
4784 ASSERT(cl != NULL && rfa != NULL);
4785 VERIFY(MBUF_IS_COMPOSITE(m) && m_get_ext_free(m) == NULL);
4786
4787 flag = MEXT_FLAGS(m);
4788
4789 MBUF_INIT(m, hdr, type);
4790 MBUF_CL_INIT(m, cl, rfa, 1, flag);
4791
4792 mtype_stat_inc(type);
4793 mtype_stat_dec(MT_FREE);
4794 }
4795 return m;
4796}
4797
4798 /* m_mclget() adds an mbuf cluster to a normal mbuf */
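/*
 * Note that m_mclget() returns the original mbuf even when the cluster
 * cannot be allocated, so callers must test M_EXT afterwards; a sketch:
 *
 *     m = m_mclget(m, M_DONTWAIT);
 *     if (!(m->m_flags & M_EXT))
 *             m_free(m);              (or take another recovery path)
 */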
4799struct mbuf *
4800m_mclget(struct mbuf *m, int wait)
4801{
4802 struct ext_ref *rfa = NULL;
4803
4804#if CONFIG_MBUF_MCACHE
4805 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4806 return m;
4807 }
4808#else
4809 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4810 return m;
4811 }
4812#endif /* CONFIG_MBUF_MCACHE */
4813 m->m_ext.ext_buf = m_mclalloc(wait);
4814 if (m->m_ext.ext_buf != NULL) {
4815 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4816 } else {
4817#if CONFIG_MBUF_MCACHE
4818 mcache_free(ref_cache, rfa);
4819#else
4820 mz_ref_free(rfa);
4821#endif /* CONFIG_MBUF_MCACHE */
4822 }
4823
4824 return m;
4825}
4826
4827/* Allocate an mbuf cluster */
4828caddr_t
4829m_mclalloc(int wait)
4830{
4831#if CONFIG_MBUF_MCACHE
4832 int mcflags = MSLEEPF(wait);
4833
4834 /* Is this due to a non-blocking retry? If so, then try harder */
4835 if (mcflags & MCR_NOSLEEP) {
4836 mcflags |= MCR_TRYHARD;
4837 }
4838
4839 return mcache_alloc(m_cache(MC_CL), mcflags);
4840#else
4841 return mz_cl_alloc(ZONE_ID_CLUSTER_2K, wait);
4842#endif /* CONFIG_MBUF_MCACHE */
4843}
4844
4845/* Free an mbuf cluster */
4846void
4847m_mclfree(caddr_t p)
4848{
4849#if CONFIG_MBUF_MCACHE
4850 mcache_free(m_cache(MC_CL), p);
4851#else
4852 mz_cl_free(ZONE_ID_CLUSTER_2K, p);
4853#endif /* CONFIG_MBUF_MCACHE */
4854}
4855
4856/*
4857 * m_mclhasreference() checks if the cluster of an mbuf is referenced by
4858 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
4859 */
4860int
4861m_mclhasreference(struct mbuf *m)
4862{
4863 if (!(m->m_flags & M_EXT)) {
4864 return 0;
4865 }
4866
4867 ASSERT(m_get_rfa(m) != NULL);
4868
4869 return (MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0;
4870}
4871
4872__private_extern__ caddr_t
4873m_bigalloc(int wait)
4874{
4875#if CONFIG_MBUF_MCACHE
4876 int mcflags = MSLEEPF(wait);
4877
4878 /* Is this due to a non-blocking retry? If so, then try harder */
4879 if (mcflags & MCR_NOSLEEP) {
4880 mcflags |= MCR_TRYHARD;
4881 }
4882
4883 return mcache_alloc(m_cache(MC_BIGCL), mcflags);
4884#else
4885 return mz_cl_alloc(ZONE_ID_CLUSTER_4K, wait);
4886#endif /* CONFIG_MBUF_MCACHE */
4887}
4888
4889__private_extern__ void
4890m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4891{
4892#if CONFIG_MBUF_MCACHE
4893 mcache_free(m_cache(MC_BIGCL), p);
4894#else
4895 mz_cl_free(ZONE_ID_CLUSTER_4K, p);
4896#endif /* CONFIG_MBUF_MCACHE */
4897}
4898
4899 /* m_mbigget() adds a 4KB mbuf cluster to a normal mbuf */
4900__private_extern__ struct mbuf *
4901m_mbigget(struct mbuf *m, int wait)
4902{
4903 struct ext_ref *rfa = NULL;
4904
4905#if CONFIG_MBUF_MCACHE
4906 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4907 return m;
4908 }
4909#else
4910 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4911 return m;
4912 }
4913#endif /* CONFIG_MBUF_MCACHE */
4914 m->m_ext.ext_buf = m_bigalloc(wait);
4915 if (m->m_ext.ext_buf != NULL) {
4916 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4917 } else {
4918#if CONFIG_MBUF_MCACHE
4919 mcache_free(ref_cache, rfa);
4920#else
4921 mz_ref_free(rfa);
4922#endif /* CONFIG_MBUF_MCACHE */
4923 }
4924 return m;
4925}
4926
4927__private_extern__ caddr_t
4928m_16kalloc(int wait)
4929{
4930#if CONFIG_MBUF_MCACHE
4931 int mcflags = MSLEEPF(wait);
4932
4933 /* Is this due to a non-blocking retry? If so, then try harder */
4934 if (mcflags & MCR_NOSLEEP) {
4935 mcflags |= MCR_TRYHARD;
4936 }
4937
4938 return mcache_alloc(m_cache(MC_16KCL), mcflags);
4939#else
4940 return mz_cl_alloc(ZONE_ID_CLUSTER_16K, wait);
4941#endif /* CONFIG_MBUF_MCACHE */
4942}
4943
4944__private_extern__ void
4945m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
4946{
4947#if CONFIG_MBUF_MCACHE
4948 mcache_free(m_cache(MC_16KCL), p);
4949#else
4950 mz_cl_free(ZONE_ID_CLUSTER_16K, p);
4951#endif /* CONFIG_MBUF_MCACHE */
4952}
4953
4954 /* m_m16kget() adds a 16KB mbuf cluster to a normal mbuf */
4955__private_extern__ struct mbuf *
4956m_m16kget(struct mbuf *m, int wait)
4957{
4958 struct ext_ref *rfa = NULL;
4959
4960#if CONFIG_MBUF_MCACHE
4961 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
4962 return m;
4963 }
4964#else
4965 if ((rfa = mz_ref_alloc(wait)) == NULL) {
4966 return m;
4967 }
4968#endif /* CONFIG_MBUF_MCACHE */
4969 m->m_ext.ext_buf = m_16kalloc(wait);
4970 if (m->m_ext.ext_buf != NULL) {
4971 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
4972 } else {
4973#if CONFIG_MBUF_MCACHE
4974 mcache_free(ref_cache, rfa);
4975#else
4976 mz_ref_free(rfa);
4977#endif /* CONFIG_MBUF_MCACHE */
4978 }
4979
4980 return m;
4981}
4982
4983/*
4984 * "Move" mbuf pkthdr from "from" to "to".
4985 * "from" must have M_PKTHDR set, and "to" must be empty.
4986 */
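/*
 * Usage sketch (illustrative): after
 *
 *     m_copy_pkthdr(to, from);
 *
 * "to" owns the packet header (including the tag chain), while "from"
 * has had its tags, classifier state and scratch area purged, as done
 * in the body below.
 */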
4987void
4988m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
4989{
4990 VERIFY(from->m_flags & M_PKTHDR);
4991
4992 /* Check for scratch area overflow */
4993 m_redzone_verify(from);
4994
4995 if (to->m_flags & M_PKTHDR) {
4996 /* Check for scratch area overflow */
4997 m_redzone_verify(to);
4998 /* We will be taking over the tags of 'to' */
4999 m_tag_delete_chain(to);
5000 }
5001 to->m_pkthdr = from->m_pkthdr; /* especially tags */
5002 m_classifier_init(from, 0); /* purge classifier info */
5003 m_tag_init(from, 1); /* purge all tags from src */
5004 m_scratch_init(from); /* clear src scratch area */
5005 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5006 if ((to->m_flags & M_EXT) == 0) {
5007 to->m_data = (uintptr_t)to->m_pktdat;
5008 }
5009 m_redzone_init(to); /* setup red zone on dst */
5010}
5011
5012/*
5013 * Duplicate "from"'s mbuf pkthdr in "to".
5014 * "from" must have M_PKTHDR set, and "to" must be empty.
5015 * In particular, this does a deep copy of the packet tags.
5016 */
5017int
5018m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
5019{
5020 VERIFY(from->m_flags & M_PKTHDR);
5021
5022 /* Check for scratch area overflow */
5023 m_redzone_verify(from);
5024
5025 if (to->m_flags & M_PKTHDR) {
5026 /* Check for scratch area overflow */
5027 m_redzone_verify(to);
5028 /* We will be taking over the tags of 'to' */
5029 m_tag_delete_chain(to);
5030 }
5031 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
5032 if ((to->m_flags & M_EXT) == 0) {
5033 to->m_data = (uintptr_t)to->m_pktdat;
5034 }
5035 to->m_pkthdr = from->m_pkthdr;
5036 /* clear TX completion flag so the callback is not called in the copy */
5037 to->m_pkthdr.pkt_flags &= ~PKTF_TX_COMPL_TS_REQ;
5038 m_redzone_init(to); /* setup red zone on dst */
5039 m_tag_init(to, 0); /* preserve dst static tags */
5040 return m_tag_copy_chain(to, from, how);
5041}
5042
5043void
5044m_copy_pftag(struct mbuf *to, struct mbuf *from)
5045{
5046 memcpy(m_pftag(to), m_pftag(from), sizeof(struct pf_mtag));
5047#if PF_ECN
5048 m_pftag(to)->pftag_hdr = NULL;
5049 m_pftag(to)->pftag_flags &= ~(PF_TAG_HDR_INET | PF_TAG_HDR_INET6);
5050#endif /* PF_ECN */
5051}
5052
5053void
5054m_copy_necptag(struct mbuf *to, struct mbuf *from)
5055{
5056 memcpy(m_necptag(to), m_necptag(from), sizeof(struct necp_mtag_));
5057}
5058
5059void
5060m_classifier_init(struct mbuf *m, uint32_t pktf_mask)
5061{
5062 VERIFY(m->m_flags & M_PKTHDR);
5063
5064 m->m_pkthdr.pkt_proto = 0;
5065 m->m_pkthdr.pkt_flowsrc = 0;
5066 m->m_pkthdr.pkt_flowid = 0;
5067 m->m_pkthdr.pkt_ext_flags = 0;
5068 m->m_pkthdr.pkt_flags &= pktf_mask; /* caller-defined mask */
5069 /* preserve service class and interface info for loopback packets */
5070 if (!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
5071 (void) m_set_service_class(m, MBUF_SC_BE);
5072 }
5073 if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
5074 m->m_pkthdr.pkt_ifainfo = 0;
5075 }
5076 /*
5077 * Preserve timestamp if requested
5078 */
5079 if (!(m->m_pkthdr.pkt_flags & PKTF_TS_VALID)) {
5080 m->m_pkthdr.pkt_timestamp = 0;
5081 }
5082}
5083
5084void
5085m_copy_classifier(struct mbuf *to, struct mbuf *from)
5086{
5087 VERIFY(to->m_flags & M_PKTHDR);
5088 VERIFY(from->m_flags & M_PKTHDR);
5089
5090 to->m_pkthdr.pkt_proto = from->m_pkthdr.pkt_proto;
5091 to->m_pkthdr.pkt_flowsrc = from->m_pkthdr.pkt_flowsrc;
5092 to->m_pkthdr.pkt_flowid = from->m_pkthdr.pkt_flowid;
5093 to->m_pkthdr.pkt_mpriv_srcid = from->m_pkthdr.pkt_mpriv_srcid;
5094 to->m_pkthdr.pkt_flags = from->m_pkthdr.pkt_flags;
5095 to->m_pkthdr.pkt_ext_flags = from->m_pkthdr.pkt_ext_flags;
5096 (void) m_set_service_class(to, from->m_pkthdr.pkt_svc);
5097 to->m_pkthdr.pkt_ifainfo = from->m_pkthdr.pkt_ifainfo;
5098}
5099
5100/*
5101 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5102 * if wantall is not set, return however many are available. Set up the
5103 * first num_with_pkthdrs mbuf hdrs as packet headers; these are chained
5104 * on the m_nextpkt field. Any mbufs requested beyond this are chained
5105 * onto the last packet header's m_next field. The size of the cluster
5106 * attached to each mbuf is controlled by the parameter bufsize.
5107 */
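/*
 * For instance, a request with *num_needed == 4 and num_with_pkthdrs == 2
 * would come back shaped roughly like this (sketch only):
 *
 *     pkt1 (pkthdr) -m_nextpkt-> pkt2 (pkthdr) -m_next-> m3 -m_next-> m4
 *
 * i.e. the first num_with_pkthdrs elements are chained via m_nextpkt and
 * any remaining ones hang off the last packet header via m_next.
 */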
5108__private_extern__ struct mbuf *
5109m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
5110 int wait, int wantall, size_t bufsize)
5111{
5112 struct mbuf *m = NULL;
5113 struct mbuf **np, *top;
5114 unsigned int pnum, needed = *num_needed;
5115#if CONFIG_MBUF_MCACHE
5116 mcache_obj_t *mp_list = NULL;
5117 int mcflags = MSLEEPF(wait);
5118 mcache_t *cp;
5119#else
5120 zstack_t mp_list = {};
5121 mbuf_class_t class = MC_MBUF_CL;
5122#endif /* CONFIG_MBUF_MCACHE */
5123 u_int16_t flag;
5124 struct ext_ref *rfa;
5125 void *cl;
5126
5127 ASSERT(bufsize == m_maxsize(MC_CL) ||
5128 bufsize == m_maxsize(MC_BIGCL) ||
5129 bufsize == m_maxsize(MC_16KCL));
5130
5131 /*
5132 * Caller must first check for njcl because this
5133 * routine is internal and not exposed/used via KPI.
5134 */
5135 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
5136
5137 top = NULL;
5138 np = &top;
5139 pnum = 0;
5140
5141 /*
5142 * The caller doesn't want all the requested buffers; only some.
5143 * Try hard to get what we can, but don't block. This effectively
5144 * overrides MCR_SLEEP, since this thread will not go to sleep
5145 * if we can't get all the buffers.
5146 */
5147#if CONFIG_MBUF_MCACHE
5148 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5149 mcflags |= MCR_TRYHARD;
5150 }
5151
5152 /* Allocate the composite mbuf + cluster elements from the cache */
5153 if (bufsize == m_maxsize(MC_CL)) {
5154 cp = m_cache(MC_MBUF_CL);
5155 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5156 cp = m_cache(MC_MBUF_BIGCL);
5157 } else {
5158 cp = m_cache(MC_MBUF_16KCL);
5159 }
5160 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
5161#else
5162 if (!wantall || (wait & Z_NOWAIT)) {
5163 wait &= ~Z_NOWAIT;
5164 wait |= Z_NOPAGEWAIT;
5165 }
5166
5167 /* Allocate the composite mbuf + cluster elements from the cache */
5168 if (bufsize == m_maxsize(MC_CL)) {
5169 class = MC_MBUF_CL;
5170 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5171 class = MC_MBUF_BIGCL;
5172 } else {
5173 class = MC_MBUF_16KCL;
5174 }
5175 mp_list = mz_composite_alloc_n(class, needed, wait);
5176 needed = zstack_count(mp_list);
5177#endif /* CONFIG_MBUF_MCACHE */
5178
5179 for (pnum = 0; pnum < needed; pnum++) {
5180#if CONFIG_MBUF_MCACHE
5181 m = (struct mbuf *)mp_list;
5182 mp_list = mp_list->obj_next;
5183#else
5184 m = zstack_pop(&mp_list);
5185#endif /* CONFIG_MBUF_MCACHE */
5186
5187 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5188 cl = m->m_ext.ext_buf;
5189 rfa = m_get_rfa(m);
5190
5191 ASSERT(cl != NULL && rfa != NULL);
5192 VERIFY(MBUF_IS_COMPOSITE(m));
5193
5194 flag = MEXT_FLAGS(m);
5195
5196 MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
5197 if (bufsize == m_maxsize(MC_16KCL)) {
5198 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5199 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5200 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5201 } else {
5202 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5203 }
5204
5205 if (num_with_pkthdrs > 0) {
5206 --num_with_pkthdrs;
5207 }
5208
5209 *np = m;
5210 if (num_with_pkthdrs > 0) {
5211 np = &m->m_nextpkt;
5212 } else {
5213 np = &m->m_next;
5214 }
5215 }
5216#if CONFIG_MBUF_MCACHE
5217 ASSERT(pnum != *num_needed || mp_list == NULL);
5218 if (mp_list != NULL) {
5219 mcache_free_ext(cp, mp_list);
5220 }
5221#else
5222 ASSERT(pnum != *num_needed || zstack_empty(mp_list));
5223 if (!zstack_empty(mp_list)) {
5224 mz_composite_free_n(class, mp_list);
5225 }
5226#endif /* CONFIG_MBUF_MCACHE */
5227 if (pnum > 0) {
5228 mtype_stat_add(MT_DATA, pnum);
5229 mtype_stat_sub(MT_FREE, pnum);
5230 }
5231
5232 if (wantall && (pnum != *num_needed)) {
5233 if (top != NULL) {
5234 m_freem_list(top);
5235 }
5236 return NULL;
5237 }
5238
5239 if (pnum > *num_needed) {
5240 printf("%s: File a radar related to <rdar://10146739>. \
5241 needed = %u, pnum = %u, num_needed = %u \n",
5242 __func__, needed, pnum, *num_needed);
5243 }
5244 *num_needed = pnum;
5245
5246 return top;
5247}
5248
5249/*
5250 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if
5251 * wantall is not set, return however many are available. The size of
5252 * each packet in the list is controlled by the parameter packetlen. Each
5253 * packet may itself be a chain of mbufs linked by m_next; each mbuf in
5254 * that chain is called a segment. If maxsegments is not NULL and the
5255 * value it points to is not zero, it specifies the maximum number of
5256 * segments allowed per chain. If maxsegments is NULL or the value it
5257 * points to is zero, the caller places no restriction on the number of
5258 * segments. The actual number of segments per chain is returned in the
5259 * value pointed to by maxsegments.
5260 */
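/*
 * Sizing example (sketch only, assuming the usual 2KB/4KB cluster sizes):
 * a request for *numlist == 8 packets of packetlen == 3000 bytes with
 * wantsize == 0 selects bufsize = m_maxsize(MC_BIGCL) and nsegs == 1, so
 * each list element is a single composite mbuf + 4KB cluster. A packetlen
 * of 100 bytes would instead be satisfied with plain mbufs (bufsize <=
 * MHLEN, nsegs == 1).
 */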
5261__private_extern__ struct mbuf *
5262m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
5263 unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
5264{
5265 struct mbuf **np, *top, *first = NULL;
5266 size_t bufsize, r_bufsize;
5267 unsigned int num = 0;
5268 unsigned int nsegs = 0;
5269 unsigned int needed = 0, resid;
5270#if CONFIG_MBUF_MCACHE
5271 int mcflags = MSLEEPF(wait);
5272 mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
5273 mcache_t *cp = NULL, *rcp = NULL;
5274#else
5275 zstack_t mp_list = {}, rmp_list = {};
5276 mbuf_class_t class = MC_MBUF, rclass = MC_MBUF_CL;
5277#endif /* CONFIG_MBUF_MCACHE */
5278
5279 if (*numlist == 0) {
5280 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal *numlist is 0");
5281 return NULL;
5282 }
5283
5284 top = NULL;
5285 np = &top;
5286
5287 if (wantsize == 0) {
5288 if (packetlen <= MINCLSIZE) {
5289 bufsize = packetlen;
5290 } else if (packetlen > m_maxsize(MC_CL)) {
5291 /* Use 4KB if jumbo cluster pool isn't available */
5292 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) {
5293 bufsize = m_maxsize(MC_BIGCL);
5294 } else {
5295 bufsize = m_maxsize(MC_16KCL);
5296 }
5297 } else {
5298 bufsize = m_maxsize(MC_CL);
5299 }
5300 } else if (wantsize == m_maxsize(MC_CL) ||
5301 wantsize == m_maxsize(MC_BIGCL) ||
5302 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
5303 bufsize = wantsize;
5304 } else {
5305 *numlist = 0;
5306 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal wantsize unsupported");
5307 return NULL;
5308 }
5309
5310 if (bufsize <= MHLEN) {
5311 nsegs = 1;
5312 } else if (bufsize <= MINCLSIZE) {
5313 if (maxsegments != NULL && *maxsegments == 1) {
5314 bufsize = m_maxsize(MC_CL);
5315 nsegs = 1;
5316 } else {
5317 nsegs = 2;
5318 }
5319 } else if (bufsize == m_maxsize(MC_16KCL)) {
5320 VERIFY(njcl > 0);
5321 nsegs = ((packetlen - 1) >> M16KCLSHIFT) + 1;
5322 } else if (bufsize == m_maxsize(MC_BIGCL)) {
5323 nsegs = ((packetlen - 1) >> MBIGCLSHIFT) + 1;
5324 } else {
5325 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
5326 }
5327 if (maxsegments != NULL) {
5328 if (*maxsegments && nsegs > *maxsegments) {
5329 *maxsegments = nsegs;
5330 *numlist = 0;
5331 os_log(OS_LOG_DEFAULT, "m_allocpacket_internal nsegs > *maxsegments");
5332 return NULL;
5333 }
5334 *maxsegments = nsegs;
5335 }
5336
5337 /*
5338 * The caller doesn't want all the requested buffers; only some.
5339 * Try hard to get what we can, but don't block. This effectively
5340 * overrides MCR_SLEEP, since this thread will not go to sleep
5341 * if we can't get all the buffers.
5342 */
5343#if CONFIG_MBUF_MCACHE
5344 if (!wantall || (mcflags & MCR_NOSLEEP)) {
5345 mcflags |= MCR_TRYHARD;
5346 }
5347#else
5348 if (!wantall || (wait & Z_NOWAIT)) {
5349 wait &= ~Z_NOWAIT;
5350 wait |= Z_NOPAGEWAIT;
5351 }
5352#endif /* !CONFIG_MBUF_MCACHE */
5353
5354 /*
5355 * Simple case where all elements in the lists/chains are mbufs.
5356 * Unless bufsize is greater than MHLEN, each segment chain is made
5357 * up of exactly 1 mbuf. Otherwise, each segment chain is made up
5358 * of 2 mbufs; the second one is used for the residual data, i.e.
5359 * the remaining data that cannot fit into the first mbuf.
5360 */
5361 if (bufsize <= MINCLSIZE) {
5362 /* Allocate the elements in one shot from the mbuf cache */
5363 ASSERT(bufsize <= MHLEN || nsegs == 2);
5364#if CONFIG_MBUF_MCACHE
5365 cp = m_cache(MC_MBUF);
5366 needed = mcache_alloc_ext(cp, &mp_list,
5367 (*numlist) * nsegs, mcflags);
5368#else
5369 class = MC_MBUF;
5370 mp_list = mz_alloc_n((*numlist) * nsegs, wait);
5371 needed = zstack_count(mp_list);
5372#endif /* CONFIG_MBUF_MCACHE */
5373
5374 /*
5375 * The number of elements must be even if we are to use an
5376 * mbuf (instead of a cluster) to store the residual data.
5377 * If we couldn't allocate the requested number of mbufs,
5378 * trim the number down (if it's odd) in order to avoid
5379 * creating a partial segment chain.
5380 */
5381 if (bufsize > MHLEN && (needed & 0x1)) {
5382 needed--;
5383 }
5384
5385 while (num < needed) {
5386 struct mbuf *m = NULL;
5387
5388#if CONFIG_MBUF_MCACHE
5389 m = (struct mbuf *)mp_list;
5390 mp_list = mp_list->obj_next;
5391#else
5392 m = zstack_pop(stack: &mp_list);
5393#endif /* CONFIG_MBUF_MCACHE */
5394 ASSERT(m != NULL);
5395
5396 MBUF_INIT(m, 1, MT_DATA);
5397 num++;
5398 if (bufsize > MHLEN) {
5399 /* A second mbuf for this segment chain */
5400#if CONFIG_MBUF_MCACHE
5401 m->m_next = (struct mbuf *)mp_list;
5402 mp_list = mp_list->obj_next;
5403#else
5404 m->m_next = zstack_pop(stack: &mp_list);
5405#endif /* CONFIG_MBUF_MCACHE */
5406
5407 ASSERT(m->m_next != NULL);
5408
5409 MBUF_INIT(m->m_next, 0, MT_DATA);
5410 num++;
5411 }
5412 *np = m;
5413 np = &m->m_nextpkt;
5414 }
5415#if CONFIG_MBUF_MCACHE
5416 ASSERT(num != *numlist || mp_list == NULL);
5417#else
5418 ASSERT(num != *numlist || zstack_empty(mp_list));
5419#endif /* CONFIG_MBUF_MCACHE */
5420
5421 if (num > 0) {
5422 mtype_stat_add(MT_DATA, num);
5423 mtype_stat_sub(MT_FREE, num);
5424 }
5425 num /= nsegs;
5426
5427 /* We've got them all; return to caller */
5428 if (num == *numlist) {
5429 return top;
5430 }
5431
5432 goto fail;
5433 }
5434
5435 /*
5436 * Complex cases where elements are made up of one or more composite
5437 * mbufs + cluster, depending on packetlen. Each N-segment chain can
5438 * be illustrated as follows:
5439 *
5440 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
5441 *
5442 * Every composite mbuf + cluster element comes from the intermediate
5443 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency,
5444 * the last composite element will come from the MC_MBUF_CL cache,
5445 * unless the residual data is larger than 2KB where we use the
5446 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual
5447 * data is defined as extra data beyond the first element that cannot
5448 * fit into the previous element, i.e. there is no residual data if
5449 * the chain only has 1 segment.
5450 */
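	/*
	 * Illustrative example (assuming the usual 2 KB/4 KB/16 KB cluster
	 * sizes and njcl > 0): for wantsize == 0 and packetlen == 20000,
	 * bufsize is the 16 KB cluster size, so
	 * nsegs == ((20000 - 1) >> M16KCLSHIFT) + 1 == 2.  The residual is
	 * 20000 % 16384 == 3616 bytes, which is larger than a 2 KB cluster
	 * but fits in a 4 KB one, so r_bufsize becomes m_maxsize(MC_BIGCL)
	 * and each chain ends up as one MC_MBUF_16KCL element followed by
	 * one MC_MBUF_BIGCL element for the tail.
	 */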
5451 r_bufsize = bufsize;
5452 resid = packetlen > bufsize ? packetlen % bufsize : 0;
5453 if (resid > 0) {
5454 /* There is residual data; figure out the cluster size */
5455 if (wantsize == 0 && packetlen > MINCLSIZE) {
5456 /*
5457 * Caller didn't request that all of the segments
5458 * in the chain use the same cluster size; use the
5459			 * smallest cluster size that can hold the residual.
5460 */
5461 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) {
5462 r_bufsize = m_maxsize(MC_16KCL);
5463 } else if (resid > m_maxsize(MC_CL)) {
5464 r_bufsize = m_maxsize(MC_BIGCL);
5465 } else {
5466 r_bufsize = m_maxsize(MC_CL);
5467 }
5468 } else {
5469 /* Use the same cluster size as the other segments */
5470 resid = 0;
5471 }
5472 }
5473
5474 needed = *numlist;
5475 if (resid > 0) {
5476 /*
5477 * Attempt to allocate composite mbuf + cluster elements for
5478 * the residual data in each chain; record the number of such
5479 * elements that can be allocated so that we know how many
5480 * segment chains we can afford to create.
5481 */
5482#if CONFIG_MBUF_MCACHE
5483 if (r_bufsize <= m_maxsize(MC_CL)) {
5484 rcp = m_cache(MC_MBUF_CL);
5485 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5486 rcp = m_cache(MC_MBUF_BIGCL);
5487 } else {
5488 rcp = m_cache(MC_MBUF_16KCL);
5489 }
5490 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
5491#else
5492 if (r_bufsize <= m_maxsize(MC_CL)) {
5493 rclass = MC_MBUF_CL;
5494 } else if (r_bufsize <= m_maxsize(MC_BIGCL)) {
5495 rclass = MC_MBUF_BIGCL;
5496 } else {
5497 rclass = MC_MBUF_16KCL;
5498 }
5499 rmp_list = mz_composite_alloc_n(class: rclass, n: *numlist, flags: wait);
5500 needed = zstack_count(stack: rmp_list);
5501#endif /* CONFIG_MBUF_MCACHE */
5502 if (needed == 0) {
5503 goto fail;
5504 }
5505
5506		/* nsegs is temporarily reduced for the calculation; it is restored below */
5507 ASSERT(nsegs > 1);
5508 nsegs--;
5509 }
5510
5511 /*
5512 * Attempt to allocate the rest of the composite mbuf + cluster
5513 * elements for the number of segment chains that we need.
5514 */
5515#if CONFIG_MBUF_MCACHE
5516 if (bufsize <= m_maxsize(MC_CL)) {
5517 cp = m_cache(MC_MBUF_CL);
5518 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5519 cp = m_cache(MC_MBUF_BIGCL);
5520 } else {
5521 cp = m_cache(MC_MBUF_16KCL);
5522 }
5523 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
5524#else
5525 if (bufsize <= m_maxsize(MC_CL)) {
5526 class = MC_MBUF_CL;
5527 } else if (bufsize <= m_maxsize(MC_BIGCL)) {
5528 class = MC_MBUF_BIGCL;
5529 } else {
5530 class = MC_MBUF_16KCL;
5531 }
5532 mp_list = mz_composite_alloc_n(class, n: needed * nsegs, flags: wait);
5533 needed = zstack_count(stack: mp_list);
5534#endif /* CONFIG_MBUF_MCACHE */
5535
5536 /* Round it down to avoid creating a partial segment chain */
5537 needed = (needed / nsegs) * nsegs;
5538 if (needed == 0) {
5539 goto fail;
5540 }
5541
5542 if (resid > 0) {
5543 /*
5544 * We're about to construct the chain(s); take into account
5545 * the number of segments we have created above to hold the
5546 * residual data for each chain, as well as restore the
5547 * original count of segments per chain.
5548 */
5549 ASSERT(nsegs > 0);
5550 needed += needed / nsegs;
5551 nsegs++;
5552 }
5553
5554 for (;;) {
5555 struct mbuf *m = NULL;
5556 u_int16_t flag;
5557 struct ext_ref *rfa;
5558 void *cl;
5559 int pkthdr;
5560 m_ext_free_func_t m_free_func;
5561
5562 ++num;
5563
5564 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
5565#if CONFIG_MBUF_MCACHE
5566 m = (struct mbuf *)mp_list;
5567 mp_list = mp_list->obj_next;
5568#else
5569 m = zstack_pop(stack: &mp_list);
5570#endif /* CONFIG_MBUF_MCACHE */
5571 } else {
5572#if CONFIG_MBUF_MCACHE
5573 m = (struct mbuf *)rmp_list;
5574 rmp_list = rmp_list->obj_next;
5575#else
5576 m = zstack_pop(stack: &rmp_list);
5577#endif /* CONFIG_MBUF_MCACHE */
5578 }
5579 m_free_func = m_get_ext_free(m);
5580 ASSERT(m != NULL);
5581 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
5582 VERIFY(m_free_func == NULL || m_free_func == m_bigfree ||
5583 m_free_func == m_16kfree);
5584
5585 cl = m->m_ext.ext_buf;
5586 rfa = m_get_rfa(m);
5587
5588 ASSERT(cl != NULL && rfa != NULL);
5589 VERIFY(MBUF_IS_COMPOSITE(m));
5590
5591 flag = MEXT_FLAGS(m);
5592
5593 pkthdr = (nsegs == 1 || (num % nsegs) == 1);
5594 if (pkthdr) {
5595 first = m;
5596 }
5597 MBUF_INIT(m, pkthdr, MT_DATA);
5598 if (m_free_func == m_16kfree) {
5599 MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
5600 } else if (m_free_func == m_bigfree) {
5601 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
5602 } else {
5603 MBUF_CL_INIT(m, cl, rfa, 1, flag);
5604 }
5605
5606 *np = m;
5607 if ((num % nsegs) == 0) {
5608 np = &first->m_nextpkt;
5609 } else {
5610 np = &m->m_next;
5611 }
5612
5613 if (num == needed) {
5614 break;
5615 }
5616 }
5617
5618 if (num > 0) {
5619 mtype_stat_add(MT_DATA, num);
5620 mtype_stat_sub(MT_FREE, num);
5621 }
5622
5623 num /= nsegs;
5624
5625 /* We've got them all; return to caller */
5626 if (num == *numlist) {
5627#if CONFIG_MBUF_MCACHE
5628 ASSERT(mp_list == NULL && rmp_list == NULL);
5629#else
5630 ASSERT(zstack_empty(mp_list) && zstack_empty(rmp_list));
5631#endif /* CONFIG_MBUF_MCACHE */
5632 return top;
5633 }
5634
5635fail:
5636 /* Free up what's left of the above */
5637#if CONFIG_MBUF_MCACHE
5638 if (mp_list != NULL) {
5639 mcache_free_ext(cp, mp_list);
5640 }
5641 if (rmp_list != NULL) {
5642 mcache_free_ext(rcp, rmp_list);
5643 }
5644#else
5645 if (!zstack_empty(stack: mp_list)) {
5646 if (class == MC_MBUF) {
5647 /* No need to elide, these mbufs came from the cache. */
5648 mz_free_n(list: mp_list);
5649 } else {
5650 mz_composite_free_n(class, list: mp_list);
5651 }
5652 }
5653 if (!zstack_empty(stack: rmp_list)) {
5654 mz_composite_free_n(class: rclass, list: rmp_list);
5655 }
5656#endif /* CONFIG_MBUF_MCACHE */
5657 if (wantall && top != NULL) {
5658 m_freem_list(top);
5659 *numlist = 0;
5660 return NULL;
5661 }
5662 *numlist = num;
5663 return top;
5664}
5665
5666/*
5667 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate
5668 * packets on the receive ring.
5669 */
5670__private_extern__ struct mbuf *
5671m_getpacket_how(int wait)
5672{
5673 unsigned int num_needed = 1;
5674
5675 return m_getpackets_internal(num_needed: &num_needed, num_with_pkthdrs: 1, wait, wantall: 1,
5676 m_maxsize(MC_CL));
5677}
5678
5679/*
5680 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocate
5681 * packets on the receive ring.
5682 */
5683struct mbuf *
5684m_getpacket(void)
5685{
5686 unsigned int num_needed = 1;
5687
5688 return m_getpackets_internal(num_needed: &num_needed, num_with_pkthdrs: 1, M_WAIT, wantall: 1,
5689 m_maxsize(MC_CL));
5690}
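/*
 * Illustrative usage sketch (not taken from a specific driver): a receive
 * ring refill path might do something like
 *
 *	struct mbuf *m = m_getpacket_how(M_DONTWAIT);
 *	if (m == NULL)
 *		<leave the ring slot empty and retry later>
 *
 * Both m_getpacket() and m_getpacket_how() return a single packet header
 * mbuf backed by a 2 KB cluster (m_maxsize(MC_CL)); the difference is that
 * m_getpacket() always uses M_WAIT.
 */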
5691
5692/*
5693 * Return a list of mbuf hdrs that point to clusters. Try for num_needed;
5694 * if this can't be met, return whatever number is available. Set up the
5695 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These
5696 * are chained on the m_nextpkt field. Any packets requested beyond this are
5697 * chained onto the last packet header's m_next field.
5698 */
5699struct mbuf *
5700m_getpackets(int num_needed, int num_with_pkthdrs, int how)
5701{
5702 unsigned int n = num_needed;
5703
5704 return m_getpackets_internal(num_needed: &n, num_with_pkthdrs, wait: how, wantall: 0,
5705 m_maxsize(MC_CL));
5706}
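/*
 * Illustrative example: m_getpackets(4, 2, M_DONTWAIT) asks for 4
 * cluster-backed mbufs, the first 2 of which are initialized as packet
 * headers and chained via m_nextpkt; the remaining ones are appended to
 * the last packet header via m_next.  Fewer may be returned if the pools
 * are short, since wantall is 0 on this path.
 */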
5707
5708/*
5709 * Return a list of mbuf hdrs set up as packet hdrs chained together
5710 * on the m_nextpkt field
5711 */
5712struct mbuf *
5713m_getpackethdrs(int num_needed, int how)
5714{
5715 struct mbuf *m;
5716 struct mbuf **np, *top;
5717
5718 top = NULL;
5719 np = &top;
5720
5721 while (num_needed--) {
5722 m = _M_RETRYHDR(how, MT_DATA);
5723 if (m == NULL) {
5724 break;
5725 }
5726
5727 *np = m;
5728 np = &m->m_nextpkt;
5729 }
5730
5731 return top;
5732}
5733
5734/*
5735 * Free an mbuf list (m_nextpkt) while following m_next. Returns the number
5736 * of packets freed. Used by the drivers.
5737 */
5738int
5739m_freem_list(struct mbuf *m)
5740{
5741 struct mbuf *nextpkt;
5742#if CONFIG_MBUF_MCACHE
5743 mcache_obj_t *mp_list = NULL;
5744 mcache_obj_t *mcl_list = NULL;
5745 mcache_obj_t *mbc_list = NULL;
5746 mcache_obj_t *m16k_list = NULL;
5747 mcache_obj_t *m_mcl_list = NULL;
5748 mcache_obj_t *m_mbc_list = NULL;
5749 mcache_obj_t *m_m16k_list = NULL;
5750 mcache_obj_t *ref_list = NULL;
5751#else
5752 zstack_t mp_list = {}, mcl_list = {}, mbc_list = {},
5753 m16k_list = {}, m_mcl_list = {},
5754 m_mbc_list = {}, m_m16k_list = {}, ref_list = {};
5755#endif /* CONFIG_MBUF_MCACHE */
5756 int pktcount = 0;
5757 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
5758
5759 while (m != NULL) {
5760 pktcount++;
5761
5762 nextpkt = m->m_nextpkt;
5763 m->m_nextpkt = NULL;
5764
5765 while (m != NULL) {
5766 struct mbuf *next = m->m_next;
5767#if CONFIG_MBUF_MCACHE
5768 mcache_obj_t *o, *rfa;
5769#else
5770 void *cl = NULL;
5771#endif /* CONFIG_MBUF_MCACHE */
5772 if (m->m_type == MT_FREE) {
5773 panic("m_free: freeing an already freed mbuf");
5774 }
5775
5776 if (m->m_flags & M_PKTHDR) {
5777 /* Check for scratch area overflow */
5778 m_redzone_verify(m);
5779 /* Free the aux data and tags if there is any */
5780 m_tag_delete_chain(m);
5781 m_do_tx_compl_callback(m, NULL);
5782 }
5783
5784 if (!(m->m_flags & M_EXT)) {
5785 mt_free++;
5786 goto simple_free;
5787 }
5788
5789 if (MBUF_IS_PAIRED(m) && m_free_paired(m)) {
5790 m = next;
5791 continue;
5792 }
5793
5794 mt_free++;
5795
5796#if CONFIG_MBUF_MCACHE
5797 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
5798#else
5799 cl = m->m_ext.ext_buf;
5800#endif /* CONFIG_MBUF_MCACHE */
5801 /*
5802 * Make sure that we don't touch any ext_ref
5803 * member after we decrement the reference count
5804 * since that may lead to use-after-free
5805 * when we do not hold the last reference.
5806 */
5807 const bool composite = !!(MEXT_FLAGS(m) & EXTF_COMPOSITE);
5808 const m_ext_free_func_t m_free_func = m_get_ext_free(m);
5809 const uint16_t minref = MEXT_MINREF(m);
5810 const uint16_t refcnt = m_decref(m);
5811 if (refcnt == minref && !composite) {
5812#if CONFIG_MBUF_MCACHE
5813 if (m_free_func == NULL) {
5814 o->obj_next = mcl_list;
5815 mcl_list = o;
5816 } else if (m_free_func == m_bigfree) {
5817 o->obj_next = mbc_list;
5818 mbc_list = o;
5819 } else if (m_free_func == m_16kfree) {
5820 o->obj_next = m16k_list;
5821 m16k_list = o;
5822 } else {
5823 (*(m_free_func))((caddr_t)o,
5824 m->m_ext.ext_size,
5825 m_get_ext_arg(m));
5826 }
5827 rfa = (mcache_obj_t *)(void *)m_get_rfa(m);
5828 rfa->obj_next = ref_list;
5829 ref_list = rfa;
5830#else
5831 if (m_free_func == NULL) {
5832 zstack_push(stack: &mcl_list, elem: cl);
5833 } else if (m_free_func == m_bigfree) {
5834 zstack_push(stack: &mbc_list, elem: cl);
5835 } else if (m_free_func == m_16kfree) {
5836 zstack_push(stack: &m16k_list, elem: cl);
5837 } else {
5838 (*(m_free_func))((caddr_t)cl,
5839 m->m_ext.ext_size,
5840 m_get_ext_arg(m));
5841 }
5842 zstack_push(stack: &ref_list, elem: m_get_rfa(m));
5843#endif /* CONFIG_MBUF_MCACHE */
5844 m_set_ext(m, NULL, NULL, NULL);
5845 } else if (refcnt == minref && composite) {
5846 VERIFY(!(MEXT_FLAGS(m) & EXTF_PAIRED));
5847 /*
5848 * Amortize the costs of atomic operations
5849 * by doing them at the end, if possible.
5850 */
5851 if (m->m_type == MT_DATA) {
5852 mt_data++;
5853 } else if (m->m_type == MT_HEADER) {
5854 mt_header++;
5855 } else if (m->m_type == MT_SONAME) {
5856 mt_soname++;
5857 } else if (m->m_type == MT_TAG) {
5858 mt_tag++;
5859 } else {
5860 mtype_stat_dec(m->m_type);
5861 }
5862
5863 m->m_type = MT_FREE;
5864 m->m_flags = M_EXT;
5865 m->m_len = 0;
5866 m->m_next = m->m_nextpkt = NULL;
5867
5868 /*
5869 * MEXT_FLAGS is safe to access here
5870 * since we are now sure that we held
5871 * the last reference to ext_ref.
5872 */
5873 MEXT_FLAGS(m) &= ~EXTF_READONLY;
5874
5875 /* "Free" into the intermediate cache */
5876#if CONFIG_MBUF_MCACHE
5877 o = (mcache_obj_t *)m;
5878 if (m_free_func == NULL) {
5879 o->obj_next = m_mcl_list;
5880 m_mcl_list = o;
5881 } else if (m_free_func == m_bigfree) {
5882 o->obj_next = m_mbc_list;
5883 m_mbc_list = o;
5884 } else {
5885 VERIFY(m_free_func == m_16kfree);
5886 o->obj_next = m_m16k_list;
5887 m_m16k_list = o;
5888 }
5889#else
5890 if (m_free_func == NULL) {
5891 zstack_push(stack: &m_mcl_list, elem: m);
5892 } else if (m_free_func == m_bigfree) {
5893 zstack_push(stack: &m_mbc_list, elem: m);
5894 } else {
5895 VERIFY(m_free_func == m_16kfree);
5896 zstack_push(stack: &m_m16k_list, elem: m);
5897 }
5898#endif /* CONFIG_MBUF_MCACHE */
5899 m = next;
5900 continue;
5901 }
5902simple_free:
5903 /*
5904 * Amortize the costs of atomic operations
5905 * by doing them at the end, if possible.
5906 */
5907 if (m->m_type == MT_DATA) {
5908 mt_data++;
5909 } else if (m->m_type == MT_HEADER) {
5910 mt_header++;
5911 } else if (m->m_type == MT_SONAME) {
5912 mt_soname++;
5913 } else if (m->m_type == MT_TAG) {
5914 mt_tag++;
5915 } else if (m->m_type != MT_FREE) {
5916 mtype_stat_dec(m->m_type);
5917 }
5918
5919 m->m_type = MT_FREE;
5920 m->m_flags = m->m_len = 0;
5921 m->m_next = m->m_nextpkt = NULL;
5922
5923#if CONFIG_MBUF_MCACHE
5924 ((mcache_obj_t *)m)->obj_next = mp_list;
5925 mp_list = (mcache_obj_t *)m;
5926#else
5927 m_elide(m);
5928 zstack_push(stack: &mp_list, elem: m);
5929#endif /* CONFIG_MBUF_MCACHE */
5930
5931 m = next;
5932 }
5933
5934 m = nextpkt;
5935 }
5936
5937 if (mt_free > 0) {
5938 mtype_stat_add(MT_FREE, mt_free);
5939 }
5940 if (mt_data > 0) {
5941 mtype_stat_sub(MT_DATA, mt_data);
5942 }
5943 if (mt_header > 0) {
5944 mtype_stat_sub(MT_HEADER, mt_header);
5945 }
5946 if (mt_soname > 0) {
5947 mtype_stat_sub(MT_SONAME, mt_soname);
5948 }
5949 if (mt_tag > 0) {
5950 mtype_stat_sub(MT_TAG, mt_tag);
5951 }
5952#if CONFIG_MBUF_MCACHE
5953 if (mp_list != NULL) {
5954 mcache_free_ext(m_cache(MC_MBUF), mp_list);
5955 }
5956 if (mcl_list != NULL) {
5957 mcache_free_ext(m_cache(MC_CL), mcl_list);
5958 }
5959 if (mbc_list != NULL) {
5960 mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
5961 }
5962 if (m16k_list != NULL) {
5963 mcache_free_ext(m_cache(MC_16KCL), m16k_list);
5964 }
5965 if (m_mcl_list != NULL) {
5966 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
5967 }
5968 if (m_mbc_list != NULL) {
5969 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
5970 }
5971 if (m_m16k_list != NULL) {
5972 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
5973 }
5974 if (ref_list != NULL) {
5975 mcache_free_ext(ref_cache, ref_list);
5976 }
5977#else
5978 if (!zstack_empty(stack: mp_list)) {
5979 /* mbufs elided above. */
5980 mz_free_n(list: mp_list);
5981 }
5982 if (!zstack_empty(stack: mcl_list)) {
5983 zfree_nozero_n(ZONE_ID_CLUSTER_2K, mcl_list);
5984 }
5985 if (!zstack_empty(stack: mbc_list)) {
5986 zfree_nozero_n(ZONE_ID_CLUSTER_4K, mbc_list);
5987 }
5988 if (!zstack_empty(stack: m16k_list)) {
5989 zfree_nozero_n(ZONE_ID_CLUSTER_16K, m16k_list);
5990 }
5991 if (!zstack_empty(stack: m_mcl_list)) {
5992 mz_composite_free_n(class: MC_MBUF_CL, list: m_mcl_list);
5993 }
5994 if (!zstack_empty(stack: m_mbc_list)) {
5995 mz_composite_free_n(class: MC_MBUF_BIGCL, list: m_mbc_list);
5996 }
5997 if (!zstack_empty(stack: m_m16k_list)) {
5998 mz_composite_free_n(class: MC_MBUF_16KCL, list: m_m16k_list);
5999 }
6000 if (!zstack_empty(stack: ref_list)) {
6001 zfree_nozero_n(ZONE_ID_MBUF_REF, ref_list);
6002 }
6003#endif /* CONFIG_MBUF_MCACHE */
6004
6005 return pktcount;
6006}
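/*
 * Illustrative usage: a transmit-completion path holding a chain of
 * finished packets linked through m_nextpkt can release them in one call
 * and learn how many packets were freed, e.g.
 *
 *	int freed = m_freem_list(done_head);
 *
 * where done_head is a hypothetical driver-private list head.  Compared
 * with calling m_freem() per packet, the batched path above defers the
 * per-type statistics updates and frees the mbufs, clusters and ext_ref
 * structures back to their pools in bulk.
 */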
6007
6008void
6009m_freem(struct mbuf *m)
6010{
6011 while (m != NULL) {
6012 m = m_free(m);
6013 }
6014}
6015
6016/*
6017 * Mbuffer utility routines.
6018 */
6019/*
6020 * Set the m_data pointer of a newly allocated mbuf to place an object of the
6021 * specified size at the end of the mbuf, longword aligned.
6022 *
6023 * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
6024 * separate macros, each asserting that it was called at the proper moment.
6025 * This required callers to themselves test the storage type and call the
6026 * right one. Rather than require callers to be aware of those layout
6027 * decisions, we centralize here.
6028 */
6029void
6030m_align(struct mbuf *m, int len)
6031{
6032 int adjust = 0;
6033
6034 /* At this point data must point to start */
6035 VERIFY(m->m_data == (uintptr_t)M_START(m));
6036 VERIFY(len >= 0);
6037 VERIFY(len <= M_SIZE(m));
6038 adjust = M_SIZE(m) - len;
6039 m->m_data += adjust & ~(sizeof(long) - 1);
6040}
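/*
 * Illustrative example (assuming a plain mbuf, where M_SIZE(m) is MLEN):
 * for len == 14, adjust == MLEN - 14, and m_data is advanced by that
 * amount rounded down to a multiple of sizeof(long).  The 14-byte object
 * therefore ends up longword aligned at the tail of the buffer, leaving
 * the leading space free for headers prepended later.
 */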
6041
6042/*
6043 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
6044 * copy junk along. Does not adjust packet header length.
6045 */
6046struct mbuf *
6047m_prepend(struct mbuf *m, int len, int how)
6048{
6049 struct mbuf *mn;
6050
6051 _MGET(mn, how, m->m_type);
6052 if (mn == NULL) {
6053 m_freem(m);
6054 return NULL;
6055 }
6056 if (m->m_flags & M_PKTHDR) {
6057 M_COPY_PKTHDR(mn, m);
6058 m->m_flags &= ~M_PKTHDR;
6059 }
6060 mn->m_next = m;
6061 m = mn;
6062 if (m->m_flags & M_PKTHDR) {
6063 VERIFY(len <= MHLEN);
6064 MH_ALIGN(m, len);
6065 } else {
6066 VERIFY(len <= MLEN);
6067 M_ALIGN(m, len);
6068 }
6069 m->m_len = len;
6070 return m;
6071}
6072
6073/*
6074 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
6075 * chain, copy junk along, and adjust length.
6076 */
6077struct mbuf *
6078m_prepend_2(struct mbuf *m, int len, int how, int align)
6079{
6080 if (M_LEADINGSPACE(m) >= len &&
6081 (!align || IS_P2ALIGNED((m->m_data - len), sizeof(u_int32_t)))) {
6082 m->m_data -= len;
6083 m->m_len += len;
6084 } else {
6085 m = m_prepend(m, len, how);
6086 }
6087 if ((m) && (m->m_flags & M_PKTHDR)) {
6088 m->m_pkthdr.len += len;
6089 }
6090 return m;
6091}
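/*
 * Illustrative usage: prepending a 14-byte Ethernet header in an output
 * path might look like
 *
 *	m = m_prepend_2(m, ETHER_HDR_LEN, M_DONTWAIT, 0);
 *	if (m == NULL)
 *		<drop the packet>
 *
 * If there is enough leading space in the first mbuf the data pointer is
 * simply moved back; otherwise a fresh mbuf is prepended.  In either case
 * m_pkthdr.len is grown by len.
 */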
6092
6093/*
6094 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
6095 * continuing for "len" bytes. If len is M_COPYALL, copy to the end of the chain.
6096 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
6097 *
6098 * The last mbuf and offset accessed are passed in and adjusted on return to
6099 * avoid having to iterate over the entire mbuf chain each time.
6100 */
6101struct mbuf *
6102m_copym_mode(struct mbuf *m, int off0, int len0, int wait,
6103 struct mbuf **m_lastm, int *m_off, uint32_t mode)
6104{
6105 struct mbuf *n, *mhdr = NULL, **np;
6106 int off = off0, len = len0;
6107 struct mbuf *top;
6108 int copyhdr = 0;
6109
6110 if (off < 0 || len < 0) {
6111 panic("m_copym: invalid offset %d or len %d", off, len);
6112 }
6113
6114 VERIFY((mode != M_COPYM_MUST_COPY_HDR &&
6115 mode != M_COPYM_MUST_MOVE_HDR) || (m->m_flags & M_PKTHDR));
6116
6117 if ((off == 0 && (m->m_flags & M_PKTHDR)) ||
6118 mode == M_COPYM_MUST_COPY_HDR || mode == M_COPYM_MUST_MOVE_HDR) {
6119 mhdr = m;
6120 copyhdr = 1;
6121 }
6122
6123 if (m_lastm != NULL && *m_lastm != NULL) {
6124 if (off0 >= *m_off) {
6125 m = *m_lastm;
6126 off = off0 - *m_off;
6127 }
6128 }
6129
6130 while (off >= m->m_len) {
6131 off -= m->m_len;
6132 m = m->m_next;
6133 }
6134 np = &top;
6135 top = NULL;
6136
6137 while (len > 0) {
6138 if (m == NULL) {
6139 if (len != M_COPYALL) {
6140 panic("m_copym: len != M_COPYALL");
6141 }
6142 break;
6143 }
6144
6145 if (copyhdr) {
6146 n = _M_RETRYHDR(wait, m->m_type);
6147 } else {
6148 n = _M_RETRY(wait, m->m_type);
6149 }
6150 *np = n;
6151
6152 if (n == NULL) {
6153 goto nospace;
6154 }
6155
6156 if (copyhdr != 0) {
6157 if ((mode == M_COPYM_MOVE_HDR) ||
6158 (mode == M_COPYM_MUST_MOVE_HDR)) {
6159 M_COPY_PKTHDR(n, mhdr);
6160 } else if ((mode == M_COPYM_COPY_HDR) ||
6161 (mode == M_COPYM_MUST_COPY_HDR)) {
6162 if (m_dup_pkthdr(to: n, from: mhdr, how: wait) == 0) {
6163 goto nospace;
6164 }
6165 }
6166 if (len == M_COPYALL) {
6167 n->m_pkthdr.len -= off0;
6168 } else {
6169 n->m_pkthdr.len = len;
6170 }
6171 copyhdr = 0;
6172 /*
6173			 * If the packet header mbuf is empty or lies entirely
6174			 * before the starting offset, there is no data to copy
			 * from it; move on to the data mbufs
6175 */
6176 if (mhdr != m) {
6177 np = &n->m_next;
6178 continue;
6179 }
6180 }
6181 n->m_len = MIN(len, (m->m_len - off));
6182 if (m->m_flags & M_EXT) {
6183 n->m_ext = m->m_ext;
6184 m_incref(m);
6185 n->m_data = m->m_data + off;
6186 n->m_flags |= M_EXT;
6187 } else {
6188 /*
6189 * Limit to the capacity of the destination
6190 */
6191 if (n->m_flags & M_PKTHDR) {
6192 n->m_len = MIN(n->m_len, MHLEN);
6193 } else {
6194 n->m_len = MIN(n->m_len, MLEN);
6195 }
6196
6197 if (MTOD(n, char *) + n->m_len > ((char *)n) + _MSIZE) {
6198 panic("%s n %p copy overflow",
6199 __func__, n);
6200 }
6201
6202 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
6203 n: (unsigned)n->m_len);
6204 }
6205 if (len != M_COPYALL) {
6206 len -= n->m_len;
6207 }
6208
6209 if (len == 0) {
6210 if (m_lastm != NULL) {
6211 *m_lastm = m;
6212 *m_off = off0 + len0 - (off + n->m_len);
6213 }
6214 }
6215 off = 0;
6216 m = m->m_next;
6217 np = &n->m_next;
6218 }
6219
6220 return top;
6221nospace:
6222 m_freem(m: top);
6223
6224 return NULL;
6225}
6226
6227
6228struct mbuf *
6229m_copym(struct mbuf *m, int off0, int len, int wait)
6230{
6231 return m_copym_mode(m, off0, len0: len, wait, NULL, NULL, M_COPYM_MOVE_HDR);
6232}
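/*
 * Illustrative usage: n = m_copym(m, 0, M_COPYALL, M_DONTWAIT) produces a
 * copy of the whole chain.  For M_EXT mbufs the cluster itself is not
 * duplicated; the copy shares it by bumping the reference count
 * (m_incref), so the result should be treated as read-only until
 * m_makewritable() or m_dup() is used.
 */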
6233
6234/*
6235 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
6236 * within this routine also.
6237 *
6238 * The last mbuf and offset accessed are passed in and adjusted on return to
6239 * avoid having to iterate over the entire mbuf chain each time.
6240 */
6241struct mbuf *
6242m_copym_with_hdrs(struct mbuf *m0, int off0, int len0, int wait,
6243 struct mbuf **m_lastm, int *m_off, uint32_t mode)
6244{
6245 struct mbuf *m = m0, *n, **np = NULL;
6246 int off = off0, len = len0;
6247 struct mbuf *top = NULL;
6248#if CONFIG_MBUF_MCACHE
6249 int mcflags = MSLEEPF(wait);
6250 mcache_obj_t *list = NULL;
6251#else
6252 zstack_t list = {};
6253#endif /* CONFIG_MBUF_MCACHE */
6254 int copyhdr = 0;
6255 int type = 0;
6256 int needed = 0;
6257
6258 if (off == 0 && (m->m_flags & M_PKTHDR)) {
6259 copyhdr = 1;
6260 }
6261
6262 if (m_lastm != NULL && *m_lastm != NULL) {
6263 if (off0 >= *m_off) {
6264 m = *m_lastm;
6265 off = off0 - *m_off;
6266 }
6267 }
6268
6269 while (off >= m->m_len) {
6270 off -= m->m_len;
6271 m = m->m_next;
6272 }
6273
6274 n = m;
6275 while (len > 0) {
6276 needed++;
6277 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
6278 n = n->m_next;
6279 }
6280 needed++;
6281 len = len0;
6282
6283#if CONFIG_MBUF_MCACHE
6284 /*
6285 * If the caller doesn't want to be put to sleep, mark it with
6286 * MCR_TRYHARD so that we may reclaim buffers from other places
6287 * before giving up.
6288 */
6289 if (mcflags & MCR_NOSLEEP) {
6290 mcflags |= MCR_TRYHARD;
6291 }
6292
6293 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
6294 mcflags) != needed) {
6295 goto nospace;
6296 }
6297#else
6298 list = mz_alloc_n(count: needed, flags: wait);
6299 if (zstack_count(stack: list) != needed) {
6300 goto nospace;
6301 }
6302#endif /* CONFIG_MBUF_MCACHE */
6303
6304 needed = 0;
6305 while (len > 0) {
6306#if CONFIG_MBUF_MCACHE
6307 n = (struct mbuf *)list;
6308 list = list->obj_next;
6309#else
6310 n = zstack_pop(stack: &list);
6311#endif /* CONFIG_MBUF_MCACHE */
6312 ASSERT(n != NULL && m != NULL);
6313
6314 type = (top == NULL) ? MT_HEADER : m->m_type;
6315 MBUF_INIT(n, (top == NULL), type);
6316
6317 if (top == NULL) {
6318 top = n;
6319 np = &top->m_next;
6320 continue;
6321 } else {
6322 needed++;
6323 *np = n;
6324 }
6325
6326 if (copyhdr) {
6327 if ((mode == M_COPYM_MOVE_HDR) ||
6328 (mode == M_COPYM_MUST_MOVE_HDR)) {
6329 M_COPY_PKTHDR(n, m);
6330 } else if ((mode == M_COPYM_COPY_HDR) ||
6331 (mode == M_COPYM_MUST_COPY_HDR)) {
6332 if (m_dup_pkthdr(to: n, from: m, how: wait) == 0) {
6333#if !CONFIG_MBUF_MCACHE
6334 m_elide(m: n);
6335#endif
6336 goto nospace;
6337 }
6338 }
6339 n->m_pkthdr.len = len;
6340 copyhdr = 0;
6341 }
6342 n->m_len = MIN(len, (m->m_len - off));
6343
6344 if (m->m_flags & M_EXT) {
6345 n->m_ext = m->m_ext;
6346 m_incref(m);
6347 n->m_data = m->m_data + off;
6348 n->m_flags |= M_EXT;
6349 } else {
6350 if (m_mtod_end(m: n) > m_mtod_upper_bound(m: n)) {
6351 panic("%s n %p copy overflow",
6352 __func__, n);
6353 }
6354
6355 bcopy(MTOD(m, caddr_t) + off, MTOD(n, caddr_t),
6356 n: (unsigned)n->m_len);
6357 }
6358 len -= n->m_len;
6359
6360 if (len == 0) {
6361 if (m_lastm != NULL) {
6362 *m_lastm = m;
6363 *m_off = off0 + len0 - (off + n->m_len);
6364 }
6365 break;
6366 }
6367 off = 0;
6368 m = m->m_next;
6369 np = &n->m_next;
6370 }
6371
6372 mtype_stat_inc(MT_HEADER);
6373 mtype_stat_add(type, needed);
6374 mtype_stat_sub(MT_FREE, needed + 1);
6375
6376#if CONFIG_MBUF_MCACHE
6377 ASSERT(list == NULL);
6378#else
6379 ASSERT(zstack_empty(list));
6380#endif /* CONFIG_MBUF_MCACHE */
6381
6382 return top;
6383
6384nospace:
6385#if CONFIG_MBUF_MCACHE
6386 if (list != NULL) {
6387 mcache_free_ext(m_cache(MC_MBUF), list);
6388 }
6389#else
6390 if (!zstack_empty(stack: list)) {
6391 /* No need to elide, these mbufs came from the cache. */
6392 mz_free_n(list);
6393 }
6394#endif /* CONFIG_MBUF_MCACHE */
6395 if (top != NULL) {
6396 m_freem(m: top);
6397 }
6398 return NULL;
6399}
6400
6401/*
6402 * Copy data from an mbuf chain starting "off" bytes from the beginning,
6403 * continuing for "len" bytes, into the indicated buffer.
6404 */
6405void
6406m_copydata(struct mbuf *m, int off, int len, void *vp)
6407{
6408 int off0 = off, len0 = len;
6409 struct mbuf *m0 = m;
6410 unsigned count;
6411 char *cp = vp;
6412
6413 if (__improbable(off < 0 || len < 0)) {
6414 panic("%s: invalid offset %d or len %d", __func__, off, len);
6415 /* NOTREACHED */
6416 }
6417
6418 while (off > 0) {
6419 if (__improbable(m == NULL)) {
6420 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6421 __func__, m0, off0, len0);
6422 /* NOTREACHED */
6423 }
6424 if (off < m->m_len) {
6425 break;
6426 }
6427 off -= m->m_len;
6428 m = m->m_next;
6429 }
6430 while (len > 0) {
6431 if (__improbable(m == NULL)) {
6432 panic("%s: invalid mbuf chain %p [off %d, len %d]",
6433 __func__, m0, off0, len0);
6434 /* NOTREACHED */
6435 }
6436 count = MIN(m->m_len - off, len);
6437 bcopy(MTOD(m, caddr_t) + off, dst: cp, n: count);
6438 len -= count;
6439 cp += count;
6440 off = 0;
6441 m = m->m_next;
6442 }
6443}
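/*
 * Illustrative usage: pulling a fixed-size header out of a chain into a
 * local buffer without modifying the chain, e.g.
 *
 *	struct tcphdr th;
 *	m_copydata(m, hdr_off, sizeof(th), &th);
 *
 * where hdr_off is a hypothetical offset known to lie within the chain;
 * the routine panics if the chain is shorter than off + len.
 */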
6444
6445/*
6446 * Concatenate mbuf chain n to m. Both chains must be of the same type
6447 * (e.g. MT_DATA). Any m_pkthdr is not updated.
6448 */
6449void
6450m_cat(struct mbuf *m, struct mbuf *n)
6451{
6452 while (m->m_next) {
6453 m = m->m_next;
6454 }
6455 while (n) {
6456 if ((m->m_flags & M_EXT) ||
6457 m->m_data + m->m_len + n->m_len >= (uintptr_t)&m->m_dat[MLEN]) {
6458 /* just join the two chains */
6459 m->m_next = n;
6460 return;
6461 }
6462 /* splat the data from one into the other */
6463 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6464 n: (u_int)n->m_len);
6465 m->m_len += n->m_len;
6466 n = m_free(m: n);
6467 }
6468}
6469
6470void
6471m_adj(struct mbuf *mp, int req_len)
6472{
6473 int len = req_len;
6474 struct mbuf *m;
6475 int count;
6476
6477 if ((m = mp) == NULL) {
6478 return;
6479 }
6480 if (len >= 0) {
6481 /*
6482 * Trim from head.
6483 */
6484 while (m != NULL && len > 0) {
6485 if (m->m_len <= len) {
6486 len -= m->m_len;
6487 m->m_len = 0;
6488 m = m->m_next;
6489 } else {
6490 m->m_len -= len;
6491 m->m_data += len;
6492 len = 0;
6493 }
6494 }
6495 m = mp;
6496 if (m->m_flags & M_PKTHDR) {
6497 m->m_pkthdr.len -= (req_len - len);
6498 }
6499 } else {
6500 /*
6501 * Trim from tail. Scan the mbuf chain,
6502 * calculating its length and finding the last mbuf.
6503 * If the adjustment only affects this mbuf, then just
6504 * adjust and return. Otherwise, rescan and truncate
6505 * after the remaining size.
6506 */
6507 len = -len;
6508 count = 0;
6509 for (;;) {
6510 count += m->m_len;
6511 if (m->m_next == (struct mbuf *)0) {
6512 break;
6513 }
6514 m = m->m_next;
6515 }
6516 if (m->m_len >= len) {
6517 m->m_len -= len;
6518 m = mp;
6519 if (m->m_flags & M_PKTHDR) {
6520 m->m_pkthdr.len -= len;
6521 }
6522 return;
6523 }
6524 count -= len;
6525 if (count < 0) {
6526 count = 0;
6527 }
6528 /*
6529 * Correct length for chain is "count".
6530 * Find the mbuf with last data, adjust its length,
6531 * and toss data from remaining mbufs on chain.
6532 */
6533 m = mp;
6534 if (m->m_flags & M_PKTHDR) {
6535 m->m_pkthdr.len = count;
6536 }
6537 for (; m; m = m->m_next) {
6538 if (m->m_len >= count) {
6539 m->m_len = count;
6540 break;
6541 }
6542 count -= m->m_len;
6543 }
6544 while ((m = m->m_next)) {
6545 m->m_len = 0;
6546 }
6547 }
6548}
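/*
 * Illustrative usage: m_adj(m, ETHER_HDR_LEN) trims a link-layer header
 * from the front of the packet, while m_adj(m, -ETHER_CRC_LEN) trims a
 * trailing CRC from the tail; in both cases m_pkthdr.len is adjusted when
 * the first mbuf carries a packet header.
 */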
6549
6550/*
6551 * Rearrange an mbuf chain so that len bytes are contiguous
6552 * and in the data area of an mbuf (so that mtod
6553 * will work for a structure of size len). Returns the resulting
6554 * mbuf chain on success, frees it and returns null on failure.
6555 * If there is room, it will add up to max_protohdr-len extra bytes to the
6556 * contiguous region in an attempt to avoid being called next time.
6557 */
6558struct mbuf *
6559m_pullup(struct mbuf *n, int len)
6560{
6561 struct mbuf *m;
6562 int count;
6563 int space;
6564
6565 /* check invalid arguments */
6566 if (n == NULL) {
6567 panic("%s: n == NULL", __func__);
6568 }
6569 if (len < 0) {
6570 os_log_info(OS_LOG_DEFAULT, "%s: failed negative len %d",
6571 __func__, len);
6572 goto bad;
6573 }
6574 if (len > MLEN) {
6575 os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
6576 __func__, len);
6577 goto bad;
6578 }
6579 if ((n->m_flags & M_EXT) == 0 &&
6580 m_mtod_current(m: n) >= m_mtod_upper_bound(m: n)) {
6581 os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
6582 __func__);
6583 goto bad;
6584 }
6585
6586 /*
6587 * If first mbuf has no cluster, and has room for len bytes
6588 * without shifting current data, pullup into it,
6589 * otherwise allocate a new mbuf to prepend to the chain.
6590 */
6591 if ((n->m_flags & M_EXT) == 0 &&
6592 len < m_mtod_upper_bound(m: n) - m_mtod_current(m: n) && n->m_next != NULL) {
6593 if (n->m_len >= len) {
6594 return n;
6595 }
6596 m = n;
6597 n = n->m_next;
6598 len -= m->m_len;
6599 } else {
6600 if (len > MHLEN) {
6601 goto bad;
6602 }
6603 _MGET(m, M_DONTWAIT, n->m_type);
6604 if (m == 0) {
6605 goto bad;
6606 }
6607 m->m_len = 0;
6608 if (n->m_flags & M_PKTHDR) {
6609 M_COPY_PKTHDR(m, n);
6610 n->m_flags &= ~M_PKTHDR;
6611 }
6612 }
6613 space = m_mtod_upper_bound(m) - m_mtod_end(m);
6614 do {
6615 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
6616 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
6617 n: (unsigned)count);
6618 len -= count;
6619 m->m_len += count;
6620 n->m_len -= count;
6621 space -= count;
6622 if (n->m_len != 0) {
6623 n->m_data += count;
6624 } else {
6625 n = m_free(m: n);
6626 }
6627 } while (len > 0 && n != NULL);
6628 if (len > 0) {
6629 (void) m_free(m);
6630 goto bad;
6631 }
6632 m->m_next = n;
6633 return m;
6634bad:
6635 m_freem(m: n);
6636 return 0;
6637}
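/*
 * Illustrative usage: the classic pattern before dereferencing a header
 * through mtod() is
 *
 *	if (m->m_len < sizeof(struct ip) &&
 *	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 *		return;
 *
 * On failure m_pullup() has already freed the chain, so the old pointer
 * must not be touched; requests larger than MLEN always fail.
 */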
6638
6639/*
6640 * Like m_pullup(), except a new mbuf is always allocated, and we allow
6641 * the amount of empty space before the data in the new mbuf to be specified
6642 * (in the event that the caller expects to prepend later).
6643 */
6644__private_extern__ struct mbuf *
6645m_copyup(struct mbuf *n, int len, int dstoff)
6646{
6647 struct mbuf *m;
6648 int count, space;
6649
6650 VERIFY(len >= 0 && dstoff >= 0);
6651
6652 if (len > (MHLEN - dstoff)) {
6653 goto bad;
6654 }
6655 MGET(m, M_DONTWAIT, n->m_type);
6656 if (m == NULL) {
6657 goto bad;
6658 }
6659 m->m_len = 0;
6660 if (n->m_flags & M_PKTHDR) {
6661 m_copy_pkthdr(to: m, from: n);
6662 n->m_flags &= ~M_PKTHDR;
6663 }
6664 m->m_data += dstoff;
6665 space = m_mtod_upper_bound(m) - m_mtod_end(m);
6666 do {
6667 count = min(a: min(a: max(a: len, b: max_protohdr), b: space), b: n->m_len);
6668 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
6669 n: (unsigned)count);
6670 len -= count;
6671 m->m_len += count;
6672 n->m_len -= count;
6673 space -= count;
6674 if (n->m_len) {
6675 n->m_data += count;
6676 } else {
6677 n = m_free(m: n);
6678 }
6679 } while (len > 0 && n);
6680 if (len > 0) {
6681 (void) m_free(m);
6682 goto bad;
6683 }
6684 m->m_next = n;
6685 return m;
6686bad:
6687 m_freem(m: n);
6688
6689 return NULL;
6690}
6691
6692/*
6693 * Partition an mbuf chain into two pieces, returning the tail --
6694 * all but the first len0 bytes. In case of failure, it returns NULL and
6695 * attempts to restore the chain to its original state.
6696 */
6697struct mbuf *
6698m_split(struct mbuf *m0, int len0, int wait)
6699{
6700 return m_split0(m0, len0, wait, 1);
6701}
6702
6703static struct mbuf *
6704m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
6705{
6706 struct mbuf *m, *n;
6707 unsigned len = len0, remain;
6708
6709 /*
6710 * First iterate to the mbuf which contains the first byte of
6711 * data at offset len0
6712 */
6713 for (m = m0; m && len > m->m_len; m = m->m_next) {
6714 len -= m->m_len;
6715 }
6716 if (m == NULL) {
6717 return NULL;
6718 }
6719 /*
6720 * len effectively is now the offset in the current
6721 * mbuf where we have to perform split.
6722 *
6723 * remain becomes the tail length.
6724 * Note that len can also be == m->m_len
6725 */
6726 remain = m->m_len - len;
6727
6728 /*
6729	 * If the split point falls exactly at the end of the current mbuf
6730	 * (remain == 0), just point the second chain at the next mbuf
6731	 * onwards and return after making the necessary adjustments
6732 */
6733 if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
6734 _MGETHDR(n, wait, m0->m_type);
6735 if (n == NULL) {
6736 return NULL;
6737 }
6738 n->m_next = m->m_next;
6739 m->m_next = NULL;
6740 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
6741 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
6742 m0->m_pkthdr.len = len0;
6743 return n;
6744 }
6745 if (copyhdr && (m0->m_flags & M_PKTHDR)) {
6746 _MGETHDR(n, wait, m0->m_type);
6747 if (n == NULL) {
6748 return NULL;
6749 }
6750 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
6751 n->m_pkthdr.len = m0->m_pkthdr.len - len0;
6752 m0->m_pkthdr.len = len0;
6753
6754 /*
6755		 * If the current mbuf points to external storage,
6756		 * then that storage can be shared by having the last mbuf
6757		 * of the head chain and the first mbuf of the tail chain
6758		 * point at different data offsets
6759 */
6760 if (m->m_flags & M_EXT) {
6761 goto extpacket;
6762 }
6763 if (remain > MHLEN) {
6764 /* m can't be the lead packet */
6765 MH_ALIGN(n, 0);
6766 n->m_next = m_split(m0: m, len0: len, wait);
6767 if (n->m_next == NULL) {
6768 (void) m_free(m: n);
6769 return NULL;
6770 } else {
6771 return n;
6772 }
6773 } else {
6774 MH_ALIGN(n, remain);
6775 }
6776 } else if (remain == 0) {
6777 n = m->m_next;
6778 m->m_next = NULL;
6779 return n;
6780 } else {
6781 _MGET(n, wait, m->m_type);
6782 if (n == NULL) {
6783 return NULL;
6784 }
6785
6786 if ((m->m_flags & M_EXT) == 0) {
6787 VERIFY(remain <= MLEN);
6788 M_ALIGN(n, remain);
6789 }
6790 }
6791extpacket:
6792 if (m->m_flags & M_EXT) {
6793 n->m_flags |= M_EXT;
6794 n->m_ext = m->m_ext;
6795 m_incref(m);
6796 n->m_data = m->m_data + len;
6797 } else {
6798 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), n: remain);
6799 }
6800 n->m_len = remain;
6801 m->m_len = len;
6802 n->m_next = m->m_next;
6803 m->m_next = NULL;
6804 return n;
6805}
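/*
 * Illustrative usage: tail = m_split(m, len0, M_DONTWAIT) leaves the first
 * len0 bytes in m and returns the remainder as a separate chain; when m
 * carries a packet header, m_pkthdr.len is divided accordingly and the
 * tail gets its own header.  A NULL return means the split failed; per the
 * note above m_split(), the chain is restored to its original state.
 */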
6806
6807/*
6808 * Routine to copy from device local memory into mbufs.
6809 */
6810struct mbuf *
6811m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
6812 void (*copy)(const void *, void *, size_t))
6813{
6814 struct mbuf *m;
6815 struct mbuf *top = NULL, **mp = &top;
6816 int off = off0, len;
6817 char *cp;
6818 char *epkt;
6819
6820 cp = buf;
6821 epkt = cp + totlen;
6822 if (off) {
6823 /*
6824 * If 'off' is non-zero, packet is trailer-encapsulated,
6825 * so we have to skip the type and length fields.
6826 */
6827 cp += off + 2 * sizeof(u_int16_t);
6828 totlen -= 2 * sizeof(u_int16_t);
6829 }
6830 _MGETHDR(m, M_DONTWAIT, MT_DATA);
6831 if (m == NULL) {
6832 return NULL;
6833 }
6834 m->m_pkthdr.rcvif = ifp;
6835 m->m_pkthdr.len = totlen;
6836 m->m_len = MHLEN;
6837
6838 while (totlen > 0) {
6839 if (top != NULL) {
6840 _MGET(m, M_DONTWAIT, MT_DATA);
6841 if (m == NULL) {
6842 m_freem(m: top);
6843 return NULL;
6844 }
6845 m->m_len = MLEN;
6846 }
6847 len = MIN(totlen, epkt - cp);
6848 if (len >= MINCLSIZE) {
6849 MCLGET(m, M_DONTWAIT);
6850 if (m->m_flags & M_EXT) {
6851 m->m_len = len = MIN(len, m_maxsize(MC_CL));
6852 } else {
6853 /* give up when it's out of cluster mbufs */
6854 if (top != NULL) {
6855 m_freem(m: top);
6856 }
6857 m_freem(m);
6858 return NULL;
6859 }
6860 } else {
6861 /*
6862 * Place initial small packet/header at end of mbuf.
6863 */
6864 if (len < m->m_len) {
6865 if (top == NULL &&
6866 len + max_linkhdr <= m->m_len) {
6867 m->m_data += max_linkhdr;
6868 }
6869 m->m_len = len;
6870 } else {
6871 len = m->m_len;
6872 }
6873 }
6874 if (copy) {
6875 copy(cp, MTOD(m, caddr_t), (unsigned)len);
6876 } else {
6877 bcopy(src: cp, MTOD(m, caddr_t), n: (unsigned)len);
6878 }
6879 cp += len;
6880 *mp = m;
6881 mp = &m->m_next;
6882 totlen -= len;
6883 if (cp == epkt) {
6884 cp = buf;
6885 }
6886 }
6887 return top;
6888}
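/*
 * Illustrative usage: a driver copying a received frame out of device
 * memory might call
 *
 *	m = m_devget(frame_base, frame_len, 0, ifp, NULL);
 *
 * where frame_base/frame_len describe a hypothetical contiguous buffer.
 * Passing a NULL copy routine makes m_devget() fall back to bcopy(), and
 * the resulting chain has rcvif and pkthdr.len already filled in.
 */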
6889
6890#if CONFIG_MBUF_MCACHE
6891#ifndef MBUF_GROWTH_NORMAL_THRESH
6892#define MBUF_GROWTH_NORMAL_THRESH 25
6893#endif
6894
6895/*
6896 * Cluster freelist allocation check.
6897 */
6898static int
6899m_howmany(int num, size_t bufsize)
6900{
6901 int i = 0, j = 0;
6902 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
6903 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
6904 u_int32_t sumclusters, freeclusters;
6905 u_int32_t percent_pool, percent_kmem;
6906 u_int32_t mb_growth, mb_growth_thresh;
6907
6908 VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
6909 bufsize == m_maxsize(MC_16KCL));
6910
6911 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6912
6913 /* Numbers in 2K cluster units */
6914 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
6915 m_clusters = m_total(MC_CL);
6916 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
6917 m_16kclusters = m_total(MC_16KCL);
6918 sumclusters = m_mbclusters + m_clusters + m_bigclusters;
6919
6920 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
6921 m_clfree = m_infree(MC_CL);
6922 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
6923 m_16kclfree = m_infree(MC_16KCL);
6924 freeclusters = m_mbfree + m_clfree + m_bigclfree;
6925
6926 /* Bail if we've maxed out the mbuf memory map */
6927 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
6928 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
6929 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
6930 mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
6931 sumclusters, nclusters,
6932 (m_16kclusters << NCLPJCLSHIFT), njcl);
6933 return 0;
6934 }
6935
6936 if (bufsize == m_maxsize(MC_BIGCL)) {
6937 /* Under minimum */
6938 if (m_bigclusters < m_minlimit(MC_BIGCL)) {
6939 return m_minlimit(MC_BIGCL) - m_bigclusters;
6940 }
6941
6942 percent_pool =
6943 ((sumclusters - freeclusters) * 100) / sumclusters;
6944 percent_kmem = (sumclusters * 100) / nclusters;
6945
6946 /*
6947 * If a light/normal user, grow conservatively (75%)
6948 * If a heavy user, grow aggressively (50%)
6949 */
6950 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) {
6951 mb_growth = MB_GROWTH_NORMAL;
6952 } else {
6953 mb_growth = MB_GROWTH_AGGRESSIVE;
6954 }
6955
6956 if (percent_kmem < 5) {
6957 /* For initial allocations */
6958 i = num;
6959 } else {
6960 /* Return if >= MBIGCL_LOWAT clusters available */
6961 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
6962 m_total(MC_BIGCL) >=
6963 MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) {
6964 return 0;
6965 }
6966
6967 /* Ensure at least num clusters are accessible */
6968 if (num >= m_infree(MC_BIGCL)) {
6969 i = num - m_infree(MC_BIGCL);
6970 }
6971 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) {
6972 j = num - (m_total(MC_BIGCL) -
6973 m_minlimit(MC_BIGCL));
6974 }
6975
6976 i = MAX(i, j);
6977
6978 /*
6979 * Grow pool if percent_pool > 75 (normal growth)
6980 * or percent_pool > 50 (aggressive growth).
6981 */
6982 mb_growth_thresh = 100 - (100 / (1 << mb_growth));
6983 if (percent_pool > mb_growth_thresh) {
6984 j = ((sumclusters + num) >> mb_growth) -
6985 freeclusters;
6986 }
6987 i = MAX(i, j);
6988 }
6989
6990 /* Check to ensure we didn't go over limits */
6991 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) {
6992 i = m_maxlimit(MC_BIGCL) - m_bigclusters;
6993 }
6994 if ((i << 1) + sumclusters >= nclusters) {
6995 i = (nclusters - sumclusters) >> 1;
6996 }
6997 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
6998 VERIFY(sumclusters + (i << 1) <= nclusters);
6999 } else { /* 16K CL */
7000 VERIFY(njcl > 0);
7001 /* Ensure at least num clusters are available */
7002 if (num >= m_16kclfree) {
7003 i = num - m_16kclfree;
7004 }
7005
7006 /* Always grow 16KCL pool aggressively */
7007 if (((m_16kclusters + num) >> 1) > m_16kclfree) {
7008 j = ((m_16kclusters + num) >> 1) - m_16kclfree;
7009 }
7010 i = MAX(i, j);
7011
7012 /* Check to ensure we don't go over limit */
7013 if ((i + m_total(MC_16KCL)) >= m_maxlimit(MC_16KCL)) {
7014 i = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
7015 }
7016 }
7017 return i;
7018}
7019#endif /* CONFIG_MBUF_MCACHE */
7020/*
7021 * Return the number of bytes in the mbuf chain, m.
7022 */
7023unsigned int
7024m_length(struct mbuf *m)
7025{
7026 struct mbuf *m0;
7027 unsigned int pktlen;
7028
7029 if (m->m_flags & M_PKTHDR) {
7030 return m->m_pkthdr.len;
7031 }
7032
7033 pktlen = 0;
7034 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
7035 pktlen += m0->m_len;
7036 }
7037 return pktlen;
7038}
7039
7040/*
7041 * Copy data from a buffer back into the indicated mbuf chain,
7042 * starting "off" bytes from the beginning, extending the mbuf
7043 * chain if necessary.
7044 */
7045void
7046m_copyback(struct mbuf *m0, int off, int len, const void *cp)
7047{
7048#if DEBUG
7049 struct mbuf *origm = m0;
7050 int error;
7051#endif /* DEBUG */
7052
7053 if (m0 == NULL) {
7054 return;
7055 }
7056
7057#if DEBUG
7058 error =
7059#endif /* DEBUG */
7060 m_copyback0(&m0, off, len, cp,
7061 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
7062
7063#if DEBUG
7064 if (error != 0 || (m0 != NULL && origm != m0)) {
7065 panic("m_copyback");
7066 }
7067#endif /* DEBUG */
7068}
7069
7070struct mbuf *
7071m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
7072{
7073 int error;
7074
7075 /* don't support chain expansion */
7076 VERIFY(off + len <= m_length(m0));
7077
7078 error = m_copyback0(&m0, off, len, cp,
7079 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
7080 if (error) {
7081 /*
7082 * no way to recover from partial success.
7083 * just free the chain.
7084 */
7085 m_freem(m: m0);
7086 return NULL;
7087 }
7088 return m0;
7089}
7090
7091/*
7092 * m_makewritable: ensure the specified range is writable.
7093 */
7094int
7095m_makewritable(struct mbuf **mp, int off, int len, int how)
7096{
7097 int error;
7098#if DEBUG
7099 struct mbuf *n;
7100 int origlen, reslen;
7101
7102 origlen = m_length(*mp);
7103#endif /* DEBUG */
7104
7105#if 0 /* M_COPYALL is large enough */
7106 if (len == M_COPYALL) {
7107 len = m_length(*mp) - off; /* XXX */
7108 }
7109#endif
7110
7111 error = m_copyback0(mp, off, len, NULL,
7112 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
7113
7114#if DEBUG
7115 reslen = 0;
7116 for (n = *mp; n; n = n->m_next) {
7117 reslen += n->m_len;
7118 }
7119 if (origlen != reslen) {
7120 panic("m_makewritable: length changed");
7121 }
7122 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) {
7123 panic("m_makewritable: inconsist");
7124 }
7125#endif /* DEBUG */
7126
7127 return error;
7128}
7129
7130static int
7131m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
7132 int how)
7133{
7134 int mlen;
7135 struct mbuf *m, *n;
7136 struct mbuf **mp;
7137 int totlen = 0;
7138 const char *cp = vp;
7139
7140 VERIFY(mp0 != NULL);
7141 VERIFY(*mp0 != NULL);
7142 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
7143 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
7144
7145 /*
7146 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
7147 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
7148 */
7149
7150 VERIFY((~flags & (M_COPYBACK0_EXTEND | M_COPYBACK0_COW)) != 0);
7151
7152 mp = mp0;
7153 m = *mp;
7154 while (off > (mlen = m->m_len)) {
7155 off -= mlen;
7156 totlen += mlen;
7157 if (m->m_next == NULL) {
7158 int tspace;
7159extend:
7160 if (!(flags & M_COPYBACK0_EXTEND)) {
7161 goto out;
7162 }
7163
7164 /*
7165 * try to make some space at the end of "m".
7166 */
7167
7168 mlen = m->m_len;
7169 if (off + len >= MINCLSIZE &&
7170 !(m->m_flags & M_EXT) && m->m_len == 0) {
7171 MCLGET(m, how);
7172 }
7173 tspace = M_TRAILINGSPACE(m);
7174 if (tspace > 0) {
7175 tspace = MIN(tspace, off + len);
7176 VERIFY(tspace > 0);
7177 bzero(mtod(m, char *) + m->m_len,
7178 MIN(off, tspace));
7179 m->m_len += tspace;
7180 off += mlen;
7181 totlen -= mlen;
7182 continue;
7183 }
7184
7185 /*
7186 * need to allocate an mbuf.
7187 */
7188
7189 if (off + len >= MINCLSIZE) {
7190 n = m_getcl(wait: how, type: m->m_type, flags: 0);
7191 } else {
7192 n = _M_GET(how, m->m_type);
7193 }
7194 if (n == NULL) {
7195 goto out;
7196 }
7197 n->m_len = 0;
7198 n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
7199 bzero(mtod(n, char *), MIN(n->m_len, off));
7200 m->m_next = n;
7201 }
7202 mp = &m->m_next;
7203 m = m->m_next;
7204 }
7205 while (len > 0) {
7206 mlen = m->m_len - off;
7207 if (mlen != 0 && m_mclhasreference(m)) {
7208 char *datap;
7209 int eatlen;
7210
7211 /*
7212 * this mbuf is read-only.
7213 * allocate a new writable mbuf and try again.
7214 */
7215
7216#if DIAGNOSTIC
7217 if (!(flags & M_COPYBACK0_COW)) {
7218 panic("m_copyback0: read-only");
7219 }
7220#endif /* DIAGNOSTIC */
7221
7222 /*
7223 * if we're going to write into the middle of
7224 * a mbuf, split it first.
7225 */
7226 if (off > 0 && len < mlen) {
7227 n = m_split0(m0: m, len0: off, wait: how, copyhdr: 0);
7228 if (n == NULL) {
7229 goto enobufs;
7230 }
7231 m->m_next = n;
7232 mp = &m->m_next;
7233 m = n;
7234 off = 0;
7235 continue;
7236 }
7237
7238 /*
7239 * XXX TODO coalesce into the trailingspace of
7240 * the previous mbuf when possible.
7241 */
7242
7243 /*
7244 * allocate a new mbuf. copy packet header if needed.
7245 */
7246 n = _M_GET(how, m->m_type);
7247 if (n == NULL) {
7248 goto enobufs;
7249 }
7250 if (off == 0 && (m->m_flags & M_PKTHDR)) {
7251 M_COPY_PKTHDR(n, m);
7252 n->m_len = MHLEN;
7253 } else {
7254 if (len >= MINCLSIZE) {
7255 MCLGET(n, M_DONTWAIT);
7256 }
7257 n->m_len =
7258 (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
7259 }
7260 if (n->m_len > len) {
7261 n->m_len = len;
7262 }
7263
7264 /*
7265			 * free the region which has been overwritten,
7266			 * copying data from the old mbufs if requested.
7267 */
7268 if (flags & M_COPYBACK0_PRESERVE) {
7269 datap = mtod(n, char *);
7270 } else {
7271 datap = NULL;
7272 }
7273 eatlen = n->m_len;
7274 VERIFY(off == 0 || eatlen >= mlen);
7275 if (off > 0) {
7276 VERIFY(len >= mlen);
7277 m->m_len = off;
7278 m->m_next = n;
7279 if (datap) {
7280 m_copydata(m, off, len: mlen, vp: datap);
7281 datap += mlen;
7282 }
7283 eatlen -= mlen;
7284 mp = &m->m_next;
7285 m = m->m_next;
7286 }
7287 while (m != NULL && m_mclhasreference(m) &&
7288 n->m_type == m->m_type && eatlen > 0) {
7289 mlen = MIN(eatlen, m->m_len);
7290 if (datap) {
7291 m_copydata(m, off: 0, len: mlen, vp: datap);
7292 datap += mlen;
7293 }
7294 m->m_data += mlen;
7295 m->m_len -= mlen;
7296 eatlen -= mlen;
7297 if (m->m_len == 0) {
7298 *mp = m = m_free(m);
7299 }
7300 }
7301 if (eatlen > 0) {
7302 n->m_len -= eatlen;
7303 }
7304 n->m_next = m;
7305 *mp = m = n;
7306 continue;
7307 }
7308 mlen = MIN(mlen, len);
7309 if (flags & M_COPYBACK0_COPYBACK) {
7310 bcopy(src: cp, mtod(m, caddr_t) + off, n: (unsigned)mlen);
7311 cp += mlen;
7312 }
7313 len -= mlen;
7314 mlen += off;
7315 off = 0;
7316 totlen += mlen;
7317 if (len == 0) {
7318 break;
7319 }
7320 if (m->m_next == NULL) {
7321 goto extend;
7322 }
7323 mp = &m->m_next;
7324 m = m->m_next;
7325 }
7326out:
7327 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
7328 VERIFY(flags & M_COPYBACK0_EXTEND);
7329 m->m_pkthdr.len = totlen;
7330 }
7331
7332 return 0;
7333
7334enobufs:
7335 return ENOBUFS;
7336}
7337
7338uint64_t
7339mcl_to_paddr(char *addr)
7340{
7341#if CONFIG_MBUF_MCACHE
7342 vm_offset_t base_phys;
7343
7344 if (!MBUF_IN_MAP(addr)) {
7345 return 0;
7346 }
7347 base_phys = mcl_paddr[atop_64(addr - (char *)mbutl)];
7348
7349 if (base_phys == 0) {
7350 return 0;
7351 }
7352 return (uint64_t)(ptoa_64(base_phys) | ((uint64_t)addr & PAGE_MASK));
7353#else
7354 extern addr64_t kvtophys(vm_offset_t va);
7355
7356 return kvtophys(va: (vm_offset_t)addr);
7357#endif /* CONFIG_MBUF_MCACHE */
7358}
7359
7360/*
7361 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft.
7362 * And really copy the thing. That way, we don't "precompute" checksums
7363 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for
7364 * small packets, don't dup into a cluster. That way received packets
7365 * don't take up too much room in the sockbuf (cf. sbspace()).
7366 */
7367struct mbuf *
7368m_dup(struct mbuf *m, int how)
7369{
7370 struct mbuf *n, **np;
7371 struct mbuf *top;
7372 int copyhdr = 0;
7373
7374 np = &top;
7375 top = NULL;
7376 if (m->m_flags & M_PKTHDR) {
7377 copyhdr = 1;
7378 }
7379
7380 /*
7381 * Quick check: if we have one mbuf and its data fits in an
7382 * mbuf with packet header, just copy and go.
7383 */
7384 if (m->m_next == NULL) {
7385 /* Then just move the data into an mbuf and be done... */
7386 if (copyhdr) {
7387 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
7388 if ((n = _M_GETHDR(how, m->m_type)) == NULL) {
7389 return NULL;
7390 }
7391 n->m_len = m->m_len;
7392 m_dup_pkthdr(to: n, from: m, how);
7393 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), n: m->m_len);
7394 return n;
7395 }
7396 } else if (m->m_len <= MLEN) {
7397 if ((n = _M_GET(how, m->m_type)) == NULL) {
7398 return NULL;
7399 }
7400 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), n: m->m_len);
7401 n->m_len = m->m_len;
7402 return n;
7403 }
7404 }
7405 while (m != NULL) {
7406#if BLUE_DEBUG
7407 printf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
7408 m->m_data);
7409#endif
7410 if (copyhdr) {
7411 n = _M_GETHDR(how, m->m_type);
7412 } else {
7413 n = _M_GET(how, m->m_type);
7414 }
7415 if (n == NULL) {
7416 goto nospace;
7417 }
7418 if (m->m_flags & M_EXT) {
7419 if (m->m_len <= m_maxsize(MC_CL)) {
7420 MCLGET(n, how);
7421 } else if (m->m_len <= m_maxsize(MC_BIGCL)) {
7422 n = m_mbigget(m: n, wait: how);
7423 } else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) {
7424 n = m_m16kget(m: n, wait: how);
7425 }
7426 if (!(n->m_flags & M_EXT)) {
7427 (void) m_free(m: n);
7428 goto nospace;
7429 }
7430 } else {
7431 VERIFY((copyhdr == 1 && m->m_len <= MHLEN) ||
7432 (copyhdr == 0 && m->m_len <= MLEN));
7433 }
7434 *np = n;
7435 if (copyhdr) {
7436 /* Don't use M_COPY_PKTHDR: preserve m_data */
7437 m_dup_pkthdr(to: n, from: m, how);
7438 copyhdr = 0;
7439 if (!(n->m_flags & M_EXT)) {
7440 n->m_data = (uintptr_t)n->m_pktdat;
7441 }
7442 }
7443 n->m_len = m->m_len;
7444 /*
7445		 * Get the dup on the same boundary as the original
7446 * Assume that the two mbufs have the same offset to data area
7447 * (up to word boundaries)
7448 */
7449 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), n: (unsigned)n->m_len);
7450 m = m->m_next;
7451 np = &n->m_next;
7452#if BLUE_DEBUG
7453 printf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
7454 n->m_data);
7455#endif
7456 }
7457
7458 return top;
7459
7460nospace:
7461 m_freem(m: top);
7462 return NULL;
7463}
7464
7465#define MBUF_MULTIPAGES(m) \
7466 (((m)->m_flags & M_EXT) && \
7467 ((IS_P2ALIGNED((m)->m_data, PAGE_SIZE) \
7468 && (m)->m_len > PAGE_SIZE) || \
7469 (!IS_P2ALIGNED((m)->m_data, PAGE_SIZE) && \
7470 P2ROUNDUP((m)->m_data, PAGE_SIZE) < ((uintptr_t)(m)->m_data + (m)->m_len))))
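/*
 * Illustrative example (assuming 4 KB pages): an M_EXT mbuf whose m_data
 * starts 512 bytes before a page boundary with m_len == 2048 spans two
 * pages, so MBUF_MULTIPAGES() is true and m_normalize()/m_expand() below
 * will carve it into one 512-byte mbuf and one 1536-byte mbuf, each
 * confined to a single page and all sharing the same cluster via
 * m_incref().
 */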
7471
7472static struct mbuf *
7473m_expand(struct mbuf *m, struct mbuf **last)
7474{
7475 struct mbuf *top = NULL;
7476 struct mbuf **nm = &top;
7477 uintptr_t data0, data;
7478 unsigned int len0, len;
7479
7480 VERIFY(MBUF_MULTIPAGES(m));
7481 VERIFY(m->m_next == NULL);
7482 data0 = (uintptr_t)m->m_data;
7483 len0 = m->m_len;
7484 *last = top;
7485
7486 for (;;) {
7487 struct mbuf *n;
7488
7489 data = data0;
7490 if (IS_P2ALIGNED(data, PAGE_SIZE) && len0 > PAGE_SIZE) {
7491 len = PAGE_SIZE;
7492 } else if (!IS_P2ALIGNED(data, PAGE_SIZE) &&
7493 P2ROUNDUP(data, PAGE_SIZE) < (data + len0)) {
7494 len = P2ROUNDUP(data, PAGE_SIZE) - data;
7495 } else {
7496 len = len0;
7497 }
7498
7499 VERIFY(len > 0);
7500 VERIFY(m->m_flags & M_EXT);
7501 m->m_data = data;
7502 m->m_len = len;
7503
7504 *nm = *last = m;
7505 nm = &m->m_next;
7506 m->m_next = NULL;
7507
7508 data0 += len;
7509 len0 -= len;
7510 if (len0 == 0) {
7511 break;
7512 }
7513
7514 n = _M_RETRY(M_DONTWAIT, MT_DATA);
7515 if (n == NULL) {
7516 m_freem(m: top);
7517 top = *last = NULL;
7518 break;
7519 }
7520
7521 n->m_ext = m->m_ext;
7522 m_incref(m);
7523 n->m_flags |= M_EXT;
7524 m = n;
7525 }
7526 return top;
7527}
7528
7529struct mbuf *
7530m_normalize(struct mbuf *m)
7531{
7532 struct mbuf *top = NULL;
7533 struct mbuf **nm = &top;
7534 boolean_t expanded = FALSE;
7535
7536 while (m != NULL) {
7537 struct mbuf *n;
7538
7539 n = m->m_next;
7540 m->m_next = NULL;
7541
7542 /* Does the data cross one or more page boundaries? */
7543 if (MBUF_MULTIPAGES(m)) {
7544 struct mbuf *last;
7545 if ((m = m_expand(m, last: &last)) == NULL) {
7546 m_freem(m: n);
7547 m_freem(m: top);
7548 top = NULL;
7549 break;
7550 }
7551 *nm = m;
7552 nm = &last->m_next;
7553 expanded = TRUE;
7554 } else {
7555 *nm = m;
7556 nm = &m->m_next;
7557 }
7558 m = n;
7559 }
7560 if (expanded) {
7561 os_atomic_inc(&mb_normalized, relaxed);
7562 }
7563 return top;
7564}
7565
7566/*
7567 * Append the specified data to the indicated mbuf chain,
7568 * Extend the mbuf chain if the new data does not fit in
7569 * existing space.
7570 *
7571 * Return 1 if able to complete the job; otherwise 0.
7572 */
7573int
7574m_append(struct mbuf *m0, int len, caddr_t cp)
7575{
7576 struct mbuf *m, *n;
7577 int remainder, space;
7578
7579 for (m = m0; m->m_next != NULL; m = m->m_next) {
7580 ;
7581 }
7582 remainder = len;
7583 space = M_TRAILINGSPACE(m);
7584 if (space > 0) {
7585 /*
7586 * Copy into available space.
7587 */
7588 if (space > remainder) {
7589 space = remainder;
7590 }
7591 bcopy(src: cp, mtod(m, caddr_t) + m->m_len, n: space);
7592 m->m_len += space;
7593 cp += space;
7594 remainder -= space;
7595 }
7596 while (remainder > 0) {
7597 /*
7598 * Allocate a new mbuf; could check space
7599 * and allocate a cluster instead.
7600 */
7601 n = m_get(M_WAITOK, m->m_type);
7602 if (n == NULL) {
7603 break;
7604 }
7605 n->m_len = min(MLEN, remainder);
7606 bcopy(cp, mtod(n, caddr_t), n->m_len);
7607 cp += n->m_len;
7608 remainder -= n->m_len;
7609 m->m_next = n;
7610 m = n;
7611 }
7612 if (m0->m_flags & M_PKTHDR) {
7613 m0->m_pkthdr.len += len - remainder;
7614 }
7615 return remainder == 0;
7616}
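
/*
 * Illustrative sketch (not part of the build): appending a small trailer
 * to a packet with m_append().  The helper and trailer are hypothetical.
 */
#if 0
static int
example_append_trailer(struct mbuf *m0)
{
	uint8_t trailer[4] = { 0xde, 0xad, 0xbe, 0xef };

	/* m_append() returns 1 on success, 0 if not all data could be added. */
	if (m_append(m0, (int)sizeof(trailer), (caddr_t)trailer) == 0) {
		return ENOBUFS;
	}
	/* If m0 has M_PKTHDR, m_pkthdr.len has already been adjusted above. */
	return 0;
}
#endif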
7617
7618struct mbuf *
7619m_last(struct mbuf *m)
7620{
7621 while (m->m_next != NULL) {
7622 m = m->m_next;
7623 }
7624 return m;
7625}
7626
7627unsigned int
7628m_fixhdr(struct mbuf *m0)
7629{
7630 u_int len;
7631
7632 VERIFY(m0->m_flags & M_PKTHDR);
7633
7634 len = m_length2(m0, NULL);
7635 m0->m_pkthdr.len = len;
7636 return len;
7637}
7638
7639unsigned int
7640m_length2(struct mbuf *m0, struct mbuf **last)
7641{
7642 struct mbuf *m;
7643 u_int len;
7644
7645 len = 0;
7646 for (m = m0; m != NULL; m = m->m_next) {
7647 len += m->m_len;
7648 if (m->m_next == NULL) {
7649 break;
7650 }
7651 }
7652 if (last != NULL) {
7653 *last = m;
7654 }
7655 return len;
7656}
7657
7658/*
7659 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
7660 * and clusters. If allocation fails and this cannot be completed, NULL will
7661 * be returned, but the passed in chain will be unchanged. Upon success,
7662 * the original chain will be freed, and the new chain will be returned.
7663 *
7664 * If a non-packet header is passed in, the original mbuf (or chain) will
7665 * be returned unharmed.
7666 *
7667 * If offset is specified, the first mbuf in the chain will have a leading
7668 * space of the amount stated by the "off" parameter.
7669 *
7670 * This routine requires that the m_pkthdr.header field of the original
7671 * mbuf chain is cleared by the caller.
7672 */
7673struct mbuf *
7674m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
7675{
7676 struct mbuf *m_new = NULL, *m_final = NULL;
7677 int progress = 0, length, pktlen;
7678
7679 if (!(m0->m_flags & M_PKTHDR)) {
7680 return m0;
7681 }
7682
7683 VERIFY(off < MHLEN);
7684 m_fixhdr(m0); /* Needed sanity check */
7685
7686 pktlen = m0->m_pkthdr.len + off;
7687 if (pktlen > MHLEN) {
7688 m_final = m_getcl(how, MT_DATA, M_PKTHDR);
7689 } else {
7690 m_final = m_gethdr(how, MT_DATA);
7691 }
7692
7693 if (m_final == NULL) {
7694 goto nospace;
7695 }
7696
7697 if (off > 0) {
7698 pktlen -= off;
7699 m_final->m_data += off;
7700 }
7701
7702 /*
7703 * Caller must have handled the contents pointed to by this
7704 * pointer before coming here, as otherwise it will point to
7705 * the original mbuf which will get freed upon success.
7706 */
7707 VERIFY(m0->m_pkthdr.pkt_hdr == NULL);
7708
7709 if (m_dup_pkthdr(m_final, m0, how) == 0) {
7710 goto nospace;
7711 }
7712
7713 m_new = m_final;
7714
7715 while (progress < pktlen) {
7716 length = pktlen - progress;
7717 if (length > MCLBYTES) {
7718 length = MCLBYTES;
7719 }
7720 length -= ((m_new == m_final) ? off : 0);
7721 if (length < 0) {
7722 goto nospace;
7723 }
7724
7725 if (m_new == NULL) {
7726 if (length > MLEN) {
7727 m_new = m_getcl(how, MT_DATA, 0);
7728 } else {
7729 m_new = m_get(how, MT_DATA);
7730 }
7731 if (m_new == NULL) {
7732 goto nospace;
7733 }
7734 }
7735
7736 m_copydata(m0, progress, length, mtod(m_new, caddr_t));
7737 progress += length;
7738 m_new->m_len = length;
7739 if (m_new != m_final) {
7740 m_cat(m_final, m_new);
7741 }
7742 m_new = NULL;
7743 }
7744 m_freem(m0);
7745 m0 = m_final;
7746 return m0;
7747nospace:
7748 if (m_final) {
7749 m_freem(m_final);
7750 }
7751 return NULL;
7752}
7753
7754struct mbuf *
7755m_defrag(struct mbuf *m0, int how)
7756{
7757 return m_defrag_offset(m0, 0, how);
7758}
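
/*
 * Illustrative sketch (not part of the build): compacting a chain while
 * reserving leading space for a header to be prepended later.  The helper
 * name is hypothetical; as noted in the block comment above, the caller
 * must clear m_pkthdr.pkt_hdr first.
 */
#if 0
static struct mbuf *
example_compact(struct mbuf *m)
{
	struct mbuf *n;

	m->m_pkthdr.pkt_hdr = NULL;
	/* Reserve 16 bytes of leading space in the first mbuf (16 < MHLEN). */
	n = m_defrag_offset(m, 16, M_DONTWAIT);
	if (n == NULL) {
		/* Allocation failed; the original chain is unchanged. */
		return m;
	}
	return n;
}
#endif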
7759
7760void
7761m_mchtype(struct mbuf *m, int t)
7762{
7763 mtype_stat_inc(t);
7764 mtype_stat_dec(m->m_type);
7765 (m)->m_type = t;
7766}
7767
7768void *__unsafe_indexable
7769m_mtod(struct mbuf *m)
7770{
7771 return m_mtod_current(m);
7772}
7773
7774void
7775m_mcheck(struct mbuf *m)
7776{
7777 _MCHECK(m);
7778}
7779
7780/*
7781 * Return a pointer to mbuf/offset of location in mbuf chain.
7782 */
7783struct mbuf *
7784m_getptr(struct mbuf *m, int loc, int *off)
7785{
7786 while (loc >= 0) {
7787 /* Normal end of search. */
7788 if (m->m_len > loc) {
7789 *off = loc;
7790 return m;
7791 } else {
7792 loc -= m->m_len;
7793 if (m->m_next == NULL) {
7794 if (loc == 0) {
7795 /* Point at the end of valid data. */
7796 *off = m->m_len;
7797 return m;
7798 }
7799 return NULL;
7800 }
7801 m = m->m_next;
7802 }
7803 }
7804 return NULL;
7805}
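
/*
 * Illustrative sketch (not part of the build): reading byte `loc` of a
 * chain via m_getptr().  The helper name is hypothetical.
 */
#if 0
static int
example_peek_byte(struct mbuf *m, int loc, uint8_t *out)
{
	int off;
	struct mbuf *p = m_getptr(m, loc, &off);

	/* m_getptr() may legally point at the end of valid data. */
	if (p == NULL || off >= p->m_len) {
		return ERANGE;
	}
	*out = *(mtod(p, uint8_t *) + off);
	return 0;
}
#endif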
7806
7807#if CONFIG_MBUF_MCACHE
7808/*
7809 * Inform the corresponding mcache(s) that there's a waiter below.
7810 */
7811static void
7812mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
7813{
7814 mcache_waiter_inc(m_cache(class));
7815 if (comp) {
7816 if (class == MC_CL) {
7817 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7818 } else if (class == MC_BIGCL) {
7819 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7820 } else if (class == MC_16KCL) {
7821 mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
7822 } else {
7823 mcache_waiter_inc(m_cache(MC_MBUF_CL));
7824 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
7825 }
7826 }
7827}
7828
7829/*
7830 * Inform the corresponding mcache(s) that there's no more waiter below.
7831 */
7832static void
7833mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
7834{
7835 mcache_waiter_dec(m_cache(class));
7836 if (comp) {
7837 if (class == MC_CL) {
7838 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7839 } else if (class == MC_BIGCL) {
7840 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7841 } else if (class == MC_16KCL) {
7842 mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
7843 } else {
7844 mcache_waiter_dec(m_cache(MC_MBUF_CL));
7845 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
7846 }
7847 }
7848}
7849
7850static bool mbuf_watchdog_defunct_active = false;
7851
7852#endif /* CONFIG_MBUF_MCACHE */
7853
7854static uint32_t
7855mbuf_watchdog_socket_space(struct socket *so)
7856{
7857 uint32_t space = 0;
7858
7859 if (so == NULL) {
7860 return 0;
7861 }
7862
7863 space = so->so_snd.sb_mbcnt + so->so_rcv.sb_mbcnt;
7864
7865#if INET
7866 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
7867 SOCK_PROTO(so) == IPPROTO_TCP) {
7868 space += tcp_reass_qlen_space(so);
7869 }
7870#endif /* INET */
7871
7872 return space;
7873}
7874
7875struct mbuf_watchdog_defunct_args {
7876 struct proc *top_app;
7877 uint32_t top_app_space_used;
7878 bool non_blocking;
7879};
7880
7881static bool
7882proc_fd_trylock(proc_t p)
7883{
7884 return lck_mtx_try_lock(&p->p_fd.fd_lock);
7885}
7886
7887static int
7888mbuf_watchdog_defunct_iterate(proc_t p, void *arg)
7889{
7890 struct fileproc *fp = NULL;
7891 struct mbuf_watchdog_defunct_args *args =
7892 (struct mbuf_watchdog_defunct_args *)arg;
7893 uint32_t space_used = 0;
7894
7895 /*
7896 * Non-blocking is only used when dumping the mbuf usage from the watchdog
7897 */
7898 if (args->non_blocking) {
7899 if (!proc_fd_trylock(p)) {
7900 return PROC_RETURNED;
7901 }
7902 } else {
7903 proc_fdlock(p);
7904 }
7905 fdt_foreach(fp, p) {
7906 struct fileglob *fg = fp->fp_glob;
7907 struct socket *so = NULL;
7908
7909 if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
7910 continue;
7911 }
7912 so = fg_get_data(fg);
7913 /*
7914 * We calculate the space without the socket
7915 * lock because we don't want to be blocked
7916 * by another process that called send() and
7917 * is stuck waiting for mbufs.
7918 *
7919 * These variables are 32-bit so we don't have
7920 * to worry about incomplete reads.
7921 */
7922 space_used += mbuf_watchdog_socket_space(so);
7923 }
7924 proc_fdunlock(p);
7925 if (space_used > args->top_app_space_used) {
7926 if (args->top_app != NULL) {
7927 proc_rele(args.top_app);
7928 }
7929 args->top_app = p;
7930 args->top_app_space_used = space_used;
7931
7932 return PROC_CLAIMED;
7933 } else {
7934 return PROC_RETURNED;
7935 }
7936}
7937
7938extern char *proc_name_address(void *p);
7939
7940static void
7941mbuf_watchdog_defunct(thread_call_param_t arg0, thread_call_param_t arg1)
7942{
7943#pragma unused(arg0, arg1)
7944 struct mbuf_watchdog_defunct_args args = {};
7945 struct fileproc *fp = NULL;
7946
7947 args.non_blocking = false;
7948 proc_iterate(PROC_ALLPROCLIST,
7949 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
7950
7951 /*
7952 * Defunct all sockets from this app.
7953 */
7954 if (args.top_app != NULL) {
7955#if CONFIG_MBUF_MCACHE
7956 /* Restart the watchdog count. */
7957 lck_mtx_lock(mbuf_mlock);
7958 microuptime(&mb_wdtstart);
7959 lck_mtx_unlock(mbuf_mlock);
7960#endif
7961 os_log(OS_LOG_DEFAULT, "%s: defuncting all sockets from %s.%d",
7962 __func__,
7963 proc_name_address(args.top_app),
7964 proc_pid(args.top_app));
7965 proc_fdlock(args.top_app);
7966 fdt_foreach(fp, args.top_app) {
7967 struct fileglob *fg = fp->fp_glob;
7968 struct socket *so = NULL;
7969
7970 if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET) {
7971 continue;
7972 }
7973 so = (struct socket *)fp_get_data(fp);
7974 if (!socket_try_lock(so)) {
7975 continue;
7976 }
7977 if (sosetdefunct(args.top_app, so,
7978 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
7979 TRUE) == 0) {
7980 sodefunct(args.top_app, so,
7981 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
7982 }
7983 socket_unlock(so, 0);
7984 }
7985 proc_fdunlock(args.top_app);
7986 proc_rele(args.top_app);
7987 mbstat.m_forcedefunct++;
7988#if !CONFIG_MBUF_MCACHE
7989 zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
7990 zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
7991 zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
7992 zone_drain(zone_by_id(ZONE_ID_MBUF));
7993 zone_drain(zone_by_id(ZONE_ID_CLUSTER_2K));
7994 zone_drain(zone_by_id(ZONE_ID_CLUSTER_4K));
7995 zone_drain(zone_by_id(ZONE_ID_CLUSTER_16K));
7996 zone_drain(zone_by_id(ZONE_ID_MBUF_REF));
7997#endif
7998 }
7999#if CONFIG_MBUF_MCACHE
8000 mbuf_watchdog_defunct_active = false;
8001#endif
8002}
8003
8004#if !CONFIG_MBUF_MCACHE
8005static LCK_GRP_DECLARE(mbuf_exhausted_grp, "mbuf-exhausted");
8006static LCK_TICKET_DECLARE(mbuf_exhausted_lock, &mbuf_exhausted_grp);
8007static uint32_t mbuf_exhausted_mask;
8008
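/*
 * When any zone backing a class in MBUF_EXHAUSTED_DRAIN_MASK becomes
 * exhausted, mbuf_zone_exhausted_start() schedules the composite-cache
 * drain after MB_WDT_MAXTIME/10; when any zone in
 * MBUF_EXHAUSTED_DEFUNCT_MASK becomes exhausted, it schedules the
 * socket-defuncting pass after MB_WDT_MAXTIME/2.  Leaving the exhausted
 * state cancels the corresponding thread call.
 */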
8009#define MBUF_EXHAUSTED_DRAIN_MASK (\
8010 (1u << MC_MBUF) | \
8011 (1u << MC_CL) | \
8012 (1u << MC_BIGCL) | \
8013 (1u << MC_16KCL))
8014
8015#define MBUF_EXHAUSTED_DEFUNCT_MASK (\
8016 (1u << MC_MBUF) | \
8017 (1u << MC_MBUF_CL) | \
8018 (1u << MC_MBUF_BIGCL) | \
8019 (1u << MC_MBUF_16KCL))
8020
8021static void
8022mbuf_watchdog_drain_composite(thread_call_param_t arg0, thread_call_param_t arg1)
8023{
8024#pragma unused(arg0, arg1)
8025 zcache_drain(ZONE_ID_MBUF_CLUSTER_2K);
8026 zcache_drain(ZONE_ID_MBUF_CLUSTER_4K);
8027 zcache_drain(ZONE_ID_MBUF_CLUSTER_16K);
8028}
8029
8030static void
8031mbuf_zone_exhausted_start(uint32_t bit)
8032{
8033 uint64_t deadline;
8034 uint32_t mask;
8035
8036 mask = mbuf_exhausted_mask;
8037 mbuf_exhausted_mask = mask | bit;
8038
8039 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8040 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8041 clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 10,
8042 NSEC_PER_MSEC, &deadline);
8043 thread_call_enter_delayed(mbuf_drain_tcall, deadline);
8044 }
8045
8046 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8047 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8048 clock_interval_to_deadline(MB_WDT_MAXTIME * 1000 / 2,
8049 NSEC_PER_MSEC, &deadline);
8050 thread_call_enter_delayed(mbuf_defunct_tcall, deadline);
8051 }
8052}
8053
8054static void
8055mbuf_zone_exhausted_end(uint32_t bit)
8056{
8057 uint32_t mask;
8058
8059 mask = (mbuf_exhausted_mask &= ~bit);
8060
8061 if ((mask & MBUF_EXHAUSTED_DRAIN_MASK) == 0 &&
8062 (bit & MBUF_EXHAUSTED_DRAIN_MASK)) {
8063 thread_call_cancel(mbuf_drain_tcall);
8064 }
8065
8066 if ((mask & MBUF_EXHAUSTED_DEFUNCT_MASK) == 0 &&
8067 (bit & MBUF_EXHAUSTED_DEFUNCT_MASK)) {
8068 thread_call_cancel(mbuf_defunct_tcall);
8069 }
8070}
8071
8072static void
8073mbuf_zone_exhausted(zone_id_t zid, zone_t zone __unused, bool exhausted)
8074{
8075 uint32_t bit;
8076
8077 if (zid < m_class_to_zid(MBUF_CLASS_MIN) ||
8078 zid > m_class_to_zid(MBUF_CLASS_MAX)) {
8079 return;
8080 }
8081
8082 bit = 1u << m_class_from_zid(zid);
8083
8084 lck_ticket_lock_nopreempt(&mbuf_exhausted_lock, &mbuf_exhausted_grp);
8085
8086 if (exhausted) {
8087 mbuf_zone_exhausted_start(bit);
8088 } else {
8089 mbuf_zone_exhausted_end(bit);
8090 }
8091
8092 lck_ticket_unlock_nopreempt(&mbuf_exhausted_lock);
8093}
8094EVENT_REGISTER_HANDLER(ZONE_EXHAUSTED, mbuf_zone_exhausted);
8095#endif /* !CONFIG_MBUF_MCACHE */
8096
8097#if CONFIG_MBUF_MCACHE
8098/*
8099 * Called during slab (blocking and non-blocking) allocation. If there
8100 * is at least one waiter, and the time since the first waiter is blocked
8101 * is greater than the watchdog timeout, panic the system.
8102 */
8103static void
8104mbuf_watchdog(void)
8105{
8106 struct timeval now;
8107 unsigned int since;
8108 static thread_call_t defunct_tcall = NULL;
8109
8110 if (mb_waiters == 0 || !mb_watchdog) {
8111 return;
8112 }
8113
8114 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8115
8116 microuptime(&now);
8117 since = now.tv_sec - mb_wdtstart.tv_sec;
8118
8119 if (mbuf_watchdog_defunct_active) {
8120 /*
8121 * Don't panic the system while we are trying
8122 * to find sockets to defunct.
8123 */
8124 return;
8125 }
8126 if (since >= MB_WDT_MAXTIME) {
8127 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
8128 mb_waiters, since, mbuf_dump());
8129 /* NOTREACHED */
8130 }
8131 /*
8132 * Check if we are about to panic the system due
8133 * to lack of mbufs and start defuncting sockets
8134 * from processes that use too many sockets.
8135 *
8136 * We're always called with the mbuf_mlock held,
8137 * so that also protects mbuf_watchdog_defunct_active.
8138 */
8139 if (since >= MB_WDT_MAXTIME / 2) {
8140 /*
8141 * Start a thread to defunct sockets
8142 * from apps that are over-using their socket
8143 * buffers.
8144 */
8145 if (defunct_tcall == NULL) {
8146 defunct_tcall =
8147 thread_call_allocate_with_options(mbuf_watchdog_defunct,
8148 NULL,
8149 THREAD_CALL_PRIORITY_KERNEL,
8150 THREAD_CALL_OPTIONS_ONCE);
8151 }
8152 if (defunct_tcall != NULL) {
8153 mbuf_watchdog_defunct_active = true;
8154 thread_call_enter(defunct_tcall);
8155 }
8156 }
8157}
8158
8159/*
8160 * Called during blocking allocation. Returns TRUE if one or more objects
8161 * are available at the per-CPU caches layer and that allocation should be
8162 * retried at that level.
8163 */
8164static boolean_t
8165mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
8166{
8167 boolean_t mcache_retry = FALSE;
8168
8169 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8170
8171 /* Check if there's anything at the cache layer */
8172 if (mbuf_cached_above(class, wait)) {
8173 mcache_retry = TRUE;
8174 goto done;
8175 }
8176
8177 /* Nothing? Then try hard to get it from somewhere */
8178 m_reclaim(class, num, (wait & MCR_COMP));
8179
8180 /* We tried hard and got something? */
8181 if (m_infree(class) > 0) {
8182 mbstat.m_wait++;
8183 goto done;
8184 } else if (mbuf_cached_above(class, wait)) {
8185 mbstat.m_wait++;
8186 mcache_retry = TRUE;
8187 goto done;
8188 } else if (wait & MCR_TRYHARD) {
8189 mcache_retry = TRUE;
8190 goto done;
8191 }
8192
8193 /*
8194 * There's really nothing for us right now; inform the
8195 * cache(s) that there is a waiter below and go to sleep.
8196 */
8197 mbuf_waiter_inc(class, (wait & MCR_COMP));
8198
8199 VERIFY(!(wait & MCR_NOSLEEP));
8200
8201 /*
8202 * If this is the first waiter, arm the watchdog timer. Otherwise
8203 * check if we need to panic the system due to watchdog timeout.
8204 */
8205 if (mb_waiters == 0) {
8206 microuptime(&mb_wdtstart);
8207 } else {
8208 mbuf_watchdog();
8209 }
8210
8211 mb_waiters++;
8212 m_region_expand(class) += m_total(class) + num;
8213 /* wake up the worker thread */
8214 if (mbuf_worker_ready &&
8215 mbuf_worker_needs_wakeup) {
8216 wakeup((caddr_t)&mbuf_worker_needs_wakeup);
8217 mbuf_worker_needs_wakeup = FALSE;
8218 }
8219 mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
8220 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO - 1), m_cname(class), NULL);
8221 mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));
8222
8223 /* We are now up; stop getting notified until next round */
8224 mbuf_waiter_dec(class, (wait & MCR_COMP));
8225
8226 /* We waited and got something */
8227 if (m_infree(class) > 0) {
8228 mbstat.m_wait++;
8229 goto done;
8230 } else if (mbuf_cached_above(class, wait)) {
8231 mbstat.m_wait++;
8232 mcache_retry = TRUE;
8233 }
8234done:
8235 return mcache_retry;
8236}
8237
8238__attribute__((noreturn))
8239static void
8240mbuf_worker_thread(void)
8241{
8242 int mbuf_expand;
8243
8244 while (1) {
8245 lck_mtx_lock(mbuf_mlock);
8246 mbwdog_logger("worker thread running");
8247 mbuf_worker_run_cnt++;
8248 mbuf_expand = 0;
8249 /*
8250 * Allocations are based on page size, so if we have depleted
8251 * the reserved spaces, try to free mbufs from the major classes.
8252 */
8253#if PAGE_SIZE == 4096
8254 uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
8255 uint32_t m_clusters = m_total(MC_CL);
8256 uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
8257 uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
8258 if (sumclusters >= nclusters) {
8259 mbwdog_logger("reclaiming bigcl");
8260 mbuf_drain_locked(TRUE);
8261 m_reclaim(MC_BIGCL, 4, FALSE);
8262 }
8263#else
8264 uint32_t m_16kclusters = m_total(MC_16KCL);
8265 if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
8266 mbwdog_logger("reclaiming 16kcl");
8267 mbuf_drain_locked(TRUE);
8268 m_reclaim(MC_16KCL, 4, FALSE);
8269 }
8270#endif
8271 if (m_region_expand(MC_CL) > 0) {
8272 int n;
8273 mb_expand_cl_cnt++;
8274 /* Adjust to current number of cluster in use */
8275 n = m_region_expand(MC_CL) -
8276 (m_total(MC_CL) - m_infree(MC_CL));
8277 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) {
8278 n = m_maxlimit(MC_CL) - m_total(MC_CL);
8279 }
8280 if (n > 0) {
8281 mb_expand_cl_total += n;
8282 }
8283 m_region_expand(MC_CL) = 0;
8284
8285 if (n > 0) {
8286 mbwdog_logger("expanding MC_CL by %d", n);
8287 freelist_populate(MC_CL, n, M_WAIT);
8288 }
8289 }
8290 if (m_region_expand(MC_BIGCL) > 0) {
8291 int n;
8292 mb_expand_bigcl_cnt++;
8293 /* Adjust to current number of 4 KB cluster in use */
8294 n = m_region_expand(MC_BIGCL) -
8295 (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
8296 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) {
8297 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
8298 }
8299 if (n > 0) {
8300 mb_expand_bigcl_total += n;
8301 }
8302 m_region_expand(MC_BIGCL) = 0;
8303
8304 if (n > 0) {
8305 mbwdog_logger("expanding MC_BIGCL by %d", n);
8306 freelist_populate(MC_BIGCL, n, M_WAIT);
8307 }
8308 }
8309 if (m_region_expand(MC_16KCL) > 0) {
8310 int n;
8311 mb_expand_16kcl_cnt++;
8312 /* Adjust to current number of 16 KB cluster in use */
8313 n = m_region_expand(MC_16KCL) -
8314 (m_total(MC_16KCL) - m_infree(MC_16KCL));
8315 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) {
8316 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
8317 }
8318 if (n > 0) {
8319 mb_expand_16kcl_total += n;
8320 }
8321 m_region_expand(MC_16KCL) = 0;
8322
8323 if (n > 0) {
8324 mbwdog_logger("expanding MC_16KCL by %d", n);
8325 (void) freelist_populate(MC_16KCL, n, M_WAIT);
8326 }
8327 }
8328
8329 /*
8330 * Because we can run out of memory before filling the mbuf
8331 * map, we should not allocate more clusters than there are
8332 * mbufs -- otherwise we could have a large number of useless
8333 * clusters allocated.
8334 */
8335 mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
8336 m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
8337 m_total(MC_16KCL));
8338 uint32_t total_mbufs = m_total(MC_MBUF);
8339 uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
8340 m_total(MC_16KCL);
8341 if (total_mbufs < total_clusters) {
8342 mbwdog_logger("expanding MC_MBUF by %d",
8343 total_clusters - total_mbufs);
8344 }
8345 while (total_mbufs < total_clusters) {
8346 mb_expand_cnt++;
8347 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) {
8348 break;
8349 }
8350 total_mbufs = m_total(MC_MBUF);
8351 total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
8352 m_total(MC_16KCL);
8353 }
8354
8355 mbuf_worker_needs_wakeup = TRUE;
8356 /*
8357 * If there's a deadlock and we're not sending / receiving
8358 * packets, net_uptime() won't be updated. Update it here
8359 * so we are sure it's correct.
8360 */
8361 net_update_uptime();
8362 mbuf_worker_last_runtime = net_uptime();
8363 assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
8364 THREAD_UNINT);
8365 mbwdog_logger("worker thread sleeping");
8366 lck_mtx_unlock(mbuf_mlock);
8367 (void) thread_block((thread_continue_t)mbuf_worker_thread);
8368 }
8369}
8370
8371__attribute__((noreturn))
8372static void
8373mbuf_worker_thread_init(void)
8374{
8375 mbuf_worker_ready++;
8376 mbuf_worker_thread();
8377}
8378
8379static mcl_slab_t *
8380slab_get(void *buf)
8381{
8382 mcl_slabg_t *slg;
8383 unsigned int ix, k;
8384
8385 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
8386
8387 VERIFY(MBUF_IN_MAP(buf));
8388 ix = ((unsigned char *)buf - mbutl) >> MBSHIFT;
8389 VERIFY(ix < maxslabgrp);
8390
8391 if ((slg = slabstbl[ix]) == NULL) {
8392 /*
8393 * In the current implementation, we never shrink the slabs
8394 * table; if we attempt to reallocate a cluster group when
8395 * it's already allocated, panic since this is a sign of a
8396 * memory corruption (slabstbl[ix] got nullified).
8397 */
8398 ++slabgrp;
8399 VERIFY(ix < slabgrp);
8400 /*
8401 * Slabs expansion can only be done single threaded; when
8402 * we get here, it must be as a result of m_clalloc() which
8403 * is serialized and therefore mb_clalloc_busy must be set.
8404 */
8405 VERIFY(mb_clalloc_busy);
8406 lck_mtx_unlock(mbuf_mlock);
8407
8408 /* This is a new buffer; create the slabs group for it */
8409 slg = zalloc_permanent_type(mcl_slabg_t);
8410 slg->slg_slab = zalloc_permanent(sizeof(mcl_slab_t) * NSLABSPMB,
8411 ZALIGN(mcl_slab_t));
8412
8413 lck_mtx_lock(mbuf_mlock);
8414 /*
8415 * No other thread could have gone into m_clalloc() after
8416 * we dropped the lock above, so verify that it's true.
8417 */
8418 VERIFY(mb_clalloc_busy);
8419
8420 slabstbl[ix] = slg;
8421
8422 /* Chain each slab in the group to its forward neighbor */
8423 for (k = 1; k < NSLABSPMB; k++) {
8424 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
8425 }
8426 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
8427
8428 /* And chain the last slab in the previous group to this */
8429 if (ix > 0) {
8430 VERIFY(slabstbl[ix - 1]->
8431 slg_slab[NSLABSPMB - 1].sl_next == NULL);
8432 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
8433 &slg->slg_slab[0];
8434 }
8435 }
8436
8437 ix = MTOPG(buf) % NSLABSPMB;
8438 VERIFY(ix < NSLABSPMB);
8439
8440 return &slg->slg_slab[ix];
8441}
8442
8443static void
8444slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
8445 void *base, void *head, unsigned int len, int refcnt, int chunks)
8446{
8447 sp->sl_class = class;
8448 sp->sl_flags = flags;
8449 sp->sl_base = base;
8450 sp->sl_head = head;
8451 sp->sl_len = len;
8452 sp->sl_refcnt = refcnt;
8453 sp->sl_chunks = chunks;
8454 slab_detach(sp);
8455}
8456
8457static void
8458slab_insert(mcl_slab_t *sp, mbuf_class_t class)
8459{
8460 VERIFY(slab_is_detached(sp));
8461 m_slab_cnt(class)++;
8462 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
8463 sp->sl_flags &= ~SLF_DETACHED;
8464
8465 /*
8466 * If a buffer spans multiple contiguous pages, clear the detached
8467 * flag on the trailing slabs too
8468 */
8469 if (class == MC_16KCL) {
8470 int k;
8471 for (k = 1; k < NSLABSP16KB; k++) {
8472 sp = sp->sl_next;
8473 /* Next slab must already be present */
8474 VERIFY(sp != NULL && slab_is_detached(sp));
8475 sp->sl_flags &= ~SLF_DETACHED;
8476 }
8477 }
8478}
8479
8480static void
8481slab_remove(mcl_slab_t *sp, mbuf_class_t class)
8482{
8483 int k;
8484 VERIFY(!slab_is_detached(sp));
8485 VERIFY(m_slab_cnt(class) > 0);
8486 m_slab_cnt(class)--;
8487 TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
8488 slab_detach(sp);
8489 if (class == MC_16KCL) {
8490 for (k = 1; k < NSLABSP16KB; k++) {
8491 sp = sp->sl_next;
8492 /* Next slab must already be present */
8493 VERIFY(sp != NULL);
8494 VERIFY(!slab_is_detached(sp));
8495 slab_detach(sp);
8496 }
8497 }
8498}
8499
8500static boolean_t
8501slab_inrange(mcl_slab_t *sp, void *buf)
8502{
8503 return (uintptr_t)buf >= (uintptr_t)sp->sl_base &&
8504 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len);
8505}
8506
8507#undef panic
8508
8509static void
8510slab_nextptr_panic(mcl_slab_t *sp, void *addr)
8511{
8512 int i;
8513 unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
8514 uintptr_t buf = (uintptr_t)sp->sl_base;
8515
8516 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
8517 void *next = ((mcache_obj_t *)buf)->obj_next;
8518 if (next != addr) {
8519 continue;
8520 }
8521 if (!mclverify) {
8522 if (next != NULL && !MBUF_IN_MAP(next)) {
8523 mcache_t *cp = m_cache(sp->sl_class);
8524 panic("%s: %s buffer %p in slab %p modified "
8525 "after free at offset 0: %p out of range "
8526 "[%p-%p)\n", __func__, cp->mc_name,
8527 (void *)buf, sp, next, mbutl, embutl);
8528 /* NOTREACHED */
8529 }
8530 } else {
8531 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
8532 (mcache_obj_t *)buf);
8533 mcl_audit_verify_nextptr(next, mca);
8534 }
8535 }
8536}
8537
8538static void
8539slab_detach(mcl_slab_t *sp)
8540{
8541 sp->sl_link.tqe_next = (mcl_slab_t *)-1;
8542 sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
8543 sp->sl_flags |= SLF_DETACHED;
8544}
8545
8546static boolean_t
8547slab_is_detached(mcl_slab_t *sp)
8548{
8549 return (intptr_t)sp->sl_link.tqe_next == -1 &&
8550 (intptr_t)sp->sl_link.tqe_prev == -1 &&
8551 (sp->sl_flags & SLF_DETACHED);
8552}
8553
8554static void
8555mcl_audit_init(void *buf, mcache_audit_t **mca_list,
8556 mcache_obj_t **con_list, size_t con_size, unsigned int num)
8557{
8558 mcache_audit_t *mca, *mca_tail;
8559 mcache_obj_t *con = NULL;
8560 boolean_t save_contents = (con_list != NULL);
8561 unsigned int i, ix;
8562
8563 ASSERT(num <= NMBPG);
8564 ASSERT(con_list == NULL || con_size != 0);
8565
8566 ix = MTOPG(buf);
8567 VERIFY(ix < maxclaudit);
8568
8569 /* Make sure we haven't been here before */
8570 for (i = 0; i < num; i++) {
8571 VERIFY(mclaudit[ix].cl_audit[i] == NULL);
8572 }
8573
8574 mca = mca_tail = *mca_list;
8575 if (save_contents) {
8576 con = *con_list;
8577 }
8578
8579 for (i = 0; i < num; i++) {
8580 mcache_audit_t *next;
8581
8582 next = mca->mca_next;
8583 bzero(mca, sizeof(*mca));
8584 mca->mca_next = next;
8585 mclaudit[ix].cl_audit[i] = mca;
8586
8587 /* Attach the contents buffer if requested */
8588 if (save_contents) {
8589 mcl_saved_contents_t *msc =
8590 (mcl_saved_contents_t *)(void *)con;
8591
8592 VERIFY(msc != NULL);
8593 VERIFY(IS_P2ALIGNED(msc, sizeof(u_int64_t)));
8594 VERIFY(con_size == sizeof(*msc));
8595 mca->mca_contents_size = con_size;
8596 mca->mca_contents = msc;
8597 con = con->obj_next;
8598 bzero(mca->mca_contents, mca->mca_contents_size);
8599 }
8600
8601 mca_tail = mca;
8602 mca = mca->mca_next;
8603 }
8604
8605 if (save_contents) {
8606 *con_list = con;
8607 }
8608
8609 *mca_list = mca_tail->mca_next;
8610 mca_tail->mca_next = NULL;
8611}
8612
8613static void
8614mcl_audit_free(void *buf, unsigned int num)
8615{
8616 unsigned int i, ix;
8617 mcache_audit_t *mca, *mca_list;
8618
8619 ix = MTOPG(buf);
8620 VERIFY(ix < maxclaudit);
8621
8622 if (mclaudit[ix].cl_audit[0] != NULL) {
8623 mca_list = mclaudit[ix].cl_audit[0];
8624 for (i = 0; i < num; i++) {
8625 mca = mclaudit[ix].cl_audit[i];
8626 mclaudit[ix].cl_audit[i] = NULL;
8627 if (mca->mca_contents) {
8628 mcache_free(mcl_audit_con_cache,
8629 mca->mca_contents);
8630 }
8631 }
8632 mcache_free_ext(mcache_audit_cache,
8633 (mcache_obj_t *)mca_list);
8634 }
8635}
8636
8637/*
8638 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
8639 * the corresponding audit structure for that buffer.
8640 */
8641static mcache_audit_t *
8642mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *mobj)
8643{
8644 mcache_audit_t *mca = NULL;
8645 int ix = MTOPG(mobj), m_idx = 0;
8646 unsigned char *page_addr;
8647
8648 VERIFY(ix < maxclaudit);
8649 VERIFY(IS_P2ALIGNED(mobj, MIN(m_maxsize(class), PAGE_SIZE)));
8650
8651 page_addr = PGTOM(ix);
8652
8653 switch (class) {
8654 case MC_MBUF:
8655 /*
8656 * For the mbuf case, find the index of the page
8657 * used by the mbuf and use that index to locate the
8658 * base address of the page. Then find out the
8659 * mbuf index relative to the page base and use
8660 * it to locate the audit structure.
8661 */
8662 m_idx = MBPAGEIDX(page_addr, mobj);
8663 VERIFY(m_idx < (int)NMBPG);
8664 mca = mclaudit[ix].cl_audit[m_idx];
8665 break;
8666
8667 case MC_CL:
8668 /*
8669 * Same thing as above, but for 2KB clusters in a page.
8670 */
8671 m_idx = CLPAGEIDX(page_addr, mobj);
8672 VERIFY(m_idx < (int)NCLPG);
8673 mca = mclaudit[ix].cl_audit[m_idx];
8674 break;
8675
8676 case MC_BIGCL:
8677 m_idx = BCLPAGEIDX(page_addr, mobj);
8678 VERIFY(m_idx < (int)NBCLPG);
8679 mca = mclaudit[ix].cl_audit[m_idx];
8680 break;
8681 case MC_16KCL:
8682 /*
8683 * Same as above, but only return the first element.
8684 */
8685 mca = mclaudit[ix].cl_audit[0];
8686 break;
8687
8688 default:
8689 VERIFY(0);
8690 /* NOTREACHED */
8691 }
8692
8693 return mca;
8694}
8695
8696static void
8697mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
8698 boolean_t alloc)
8699{
8700 struct mbuf *m = addr;
8701 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
8702
8703 VERIFY(mca->mca_contents != NULL &&
8704 mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
8705
8706 if (mclverify) {
8707 mcl_audit_verify_nextptr(next, mca);
8708 }
8709
8710 if (!alloc) {
8711 /* Save constructed mbuf fields */
8712 mcl_audit_save_mbuf(m, mca);
8713 if (mclverify) {
8714 mcache_set_pattern(MCACHE_FREE_PATTERN, m,
8715 m_maxsize(MC_MBUF));
8716 }
8717 ((mcache_obj_t *)m)->obj_next = next;
8718 return;
8719 }
8720
8721 /* Check if the buffer has been corrupted while in freelist */
8722 if (mclverify) {
8723 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
8724 }
8725 /* Restore constructed mbuf fields */
8726 mcl_audit_restore_mbuf(m, mca, composite);
8727}
8728
8729static void
8730mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
8731{
8732 struct mbuf *ms = MCA_SAVED_MBUF_PTR(mca);
8733
8734 if (composite) {
8735 struct mbuf *next = m->m_next;
8736 VERIFY(ms->m_flags == M_EXT && m_get_rfa(ms) != NULL &&
8737 MBUF_IS_COMPOSITE(ms));
8738 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
8739 /*
8740 * We could have hand-picked the mbuf fields and restore
8741 * them individually, but that will be a maintenance
8742 * headache. Instead, restore everything that was saved;
8743 * the mbuf layer will recheck and reinitialize anyway.
8744 */
8745 bcopy(ms, m, MCA_SAVED_MBUF_SIZE);
8746 m->m_next = next;
8747 } else {
8748 /*
8749 * For a regular mbuf (no cluster attached) there's nothing
8750 * to restore other than the type field, which is expected
8751 * to be MT_FREE.
8752 */
8753 m->m_type = ms->m_type;
8754 }
8755 _MCHECK(m);
8756}
8757
8758static void
8759mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
8760{
8761 VERIFY(mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
8762 _MCHECK(m);
8763 bcopy(m, MCA_SAVED_MBUF_PTR(mca), MCA_SAVED_MBUF_SIZE);
8764}
8765
8766static void
8767mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
8768 boolean_t save_next)
8769{
8770 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
8771
8772 if (!alloc) {
8773 if (mclverify) {
8774 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
8775 }
8776 if (save_next) {
8777 mcl_audit_verify_nextptr(next, mca);
8778 ((mcache_obj_t *)addr)->obj_next = next;
8779 }
8780 } else if (mclverify) {
8781 /* Check if the buffer has been corrupted while in freelist */
8782 mcl_audit_verify_nextptr(next, mca);
8783 mcache_audit_free_verify_set(mca, addr, 0, size);
8784 }
8785}
8786
8787static void
8788mcl_audit_scratch(mcache_audit_t *mca)
8789{
8790 void *stack[MCACHE_STACK_DEPTH + 1];
8791 mcl_scratch_audit_t *msa;
8792 struct timeval now;
8793
8794 VERIFY(mca->mca_contents != NULL);
8795 msa = MCA_SAVED_SCRATCH_PTR(mca);
8796
8797 msa->msa_pthread = msa->msa_thread;
8798 msa->msa_thread = current_thread();
8799 bcopy(msa->msa_stack, msa->msa_pstack, sizeof(msa->msa_pstack));
8800 msa->msa_pdepth = msa->msa_depth;
8801 bzero(stack, sizeof(stack));
8802 msa->msa_depth = OSBacktrace(stack, MCACHE_STACK_DEPTH + 1) - 1;
8803 bcopy(&stack[1], msa->msa_stack, sizeof(msa->msa_stack));
8804
8805 msa->msa_ptstamp = msa->msa_tstamp;
8806 microuptime(&now);
8807 /* tstamp is in ms relative to base_ts */
8808 msa->msa_tstamp = ((now.tv_usec - mb_start.tv_usec) / 1000);
8809 if ((now.tv_sec - mb_start.tv_sec) > 0) {
8810 msa->msa_tstamp += ((now.tv_sec - mb_start.tv_sec) * 1000);
8811 }
8812}
8813
8814__abortlike
8815static void
8816mcl_audit_mcheck_panic(struct mbuf *m)
8817{
8818 char buf[DUMP_MCA_BUF_SIZE];
8819 mcache_audit_t *mca;
8820
8821 MRANGE(m);
8822 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
8823
8824 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s",
8825 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(buf, mca));
8826 /* NOTREACHED */
8827}
8828
8829__abortlike
8830static void
8831mcl_audit_verify_nextptr_panic(void *next, mcache_audit_t *mca)
8832{
8833 char buf[DUMP_MCA_BUF_SIZE];
8834 panic("mcl_audit: buffer %p modified after free at offset 0: "
8835 "%p out of range [%p-%p)\n%s\n",
8836 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(buf, mca));
8837 /* NOTREACHED */
8838}
8839
8840static void
8841mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
8842{
8843 if (next != NULL && !MBUF_IN_MAP(next) &&
8844 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
8845 mcl_audit_verify_nextptr_panic(next, mca);
8846 }
8847}
8848
8849static uintptr_t
8850hash_mix(uintptr_t x)
8851{
8852#ifndef __LP64__
8853 x += ~(x << 15);
8854 x ^= (x >> 10);
8855 x += (x << 3);
8856 x ^= (x >> 6);
8857 x += ~(x << 11);
8858 x ^= (x >> 16);
8859#else
8860 x += ~(x << 32);
8861 x ^= (x >> 22);
8862 x += ~(x << 13);
8863 x ^= (x >> 8);
8864 x += (x << 3);
8865 x ^= (x >> 15);
8866 x += ~(x << 27);
8867 x ^= (x >> 31);
8868#endif
8869 return x;
8870}
8871
8872static uint32_t
8873hashbacktrace(uintptr_t* bt, uint32_t depth, uint32_t max_size)
8874{
8875 uintptr_t hash = 0;
8876 uintptr_t mask = max_size - 1;
8877
8878 while (depth) {
8879 hash += bt[--depth];
8880 }
8881
8882 hash = hash_mix(hash) & mask;
8883
8884 assert(hash < max_size);
8885
8886 return (uint32_t) hash;
8887}
8888
8889static uint32_t
8890hashaddr(uintptr_t pt, uint32_t max_size)
8891{
8892 uintptr_t hash = 0;
8893 uintptr_t mask = max_size - 1;
8894
8895 hash = hash_mix(pt) & mask;
8896
8897 assert(hash < max_size);
8898
8899 return (uint32_t) hash;
8900}
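
/*
 * Both hashbacktrace() and hashaddr() reduce the mixed value with
 * `& (max_size - 1)`, which only acts as a modulus when max_size is a
 * power of two, so callers are expected to pass power-of-two table sizes.
 */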
8901
8902/* This function turns on mbuf leak detection */
8903static void
8904mleak_activate(void)
8905{
8906 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
8907 PE_parse_boot_argn("mleak_sample_factor",
8908 &mleak_table.mleak_sample_factor,
8909 sizeof(mleak_table.mleak_sample_factor));
8910
8911 if (mleak_table.mleak_sample_factor == 0) {
8912 mclfindleak = 0;
8913 }
8914
8915 if (mclfindleak == 0) {
8916 return;
8917 }
8918
8919 vm_size_t alloc_size =
8920 mleak_alloc_buckets * sizeof(struct mallocation);
8921 vm_size_t trace_size = mleak_trace_buckets * sizeof(struct mtrace);
8922
8923 mleak_allocations = zalloc_permanent(alloc_size, ZALIGN(struct mallocation));
8924 mleak_traces = zalloc_permanent(trace_size, ZALIGN(struct mtrace));
8925 mleak_stat = zalloc_permanent(MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
8926 ZALIGN(mleak_stat_t));
8927
8928 mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
8929#ifdef __LP64__
8930 mleak_stat->ml_isaddr64 = 1;
8931#endif /* __LP64__ */
8932}
8933
8934static void
8935mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
8936{
8937 int temp;
8938
8939 if (mclfindleak == 0) {
8940 return;
8941 }
8942
8943 if (!alloc) {
8944 return mleak_free(addr);
8945 }
8946
8947 temp = os_atomic_inc_orig(&mleak_table.mleak_capture, relaxed);
8948
8949 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
8950 uintptr_t bt[MLEAK_STACK_DEPTH];
8951 unsigned int logged = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
8952 mleak_log(bt, addr, logged, num);
8953 }
8954}
8955
8956/*
8957 * This function records the allocation in the mleak_allocations table
8958 * and the backtrace in the mleak_traces table.  If the allocation slot
8959 * is in use, the old record is replaced; if the trace slot is in use by
8960 * a different trace, we bail out (or bump the refcount if it matches).
8961 */
8962static boolean_t
8963mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
8964{
8965 struct mallocation *allocation;
8966 struct mtrace *trace;
8967 uint32_t trace_index;
8968
8969 /* Quit if someone else modifying the tables */
8970 if (!lck_mtx_try_lock_spin(mleak_lock)) {
8971 mleak_table.total_conflicts++;
8972 return FALSE;
8973 }
8974
8975 allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
8976 mleak_alloc_buckets)];
8977 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
8978 trace = &mleak_traces[trace_index];
8979
8980 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
8981 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
8982
8983 allocation->hitcount++;
8984 trace->hitcount++;
8985
8986 /*
8987 * If the allocation bucket we want is occupied
8988 * and the occupier has the same trace, just bail.
8989 */
8990 if (allocation->element != NULL &&
8991 trace_index == allocation->trace_index) {
8992 mleak_table.alloc_collisions++;
8993 lck_mtx_unlock(mleak_lock);
8994 return TRUE;
8995 }
8996
8997 /*
8998 * Store the backtrace in the traces array;
8999 * Size of zero = trace bucket is free.
9000 */
9001 if (trace->allocs > 0 &&
9002 bcmp(trace->addr, bt, (depth * sizeof(uintptr_t))) != 0) {
9003 /* Different, unique trace, but the same hash! Bail out. */
9004 trace->collisions++;
9005 mleak_table.trace_collisions++;
9006 lck_mtx_unlock(mleak_lock);
9007 return TRUE;
9008 } else if (trace->allocs > 0) {
9009 /* Same trace, already added, so increment refcount */
9010 trace->allocs++;
9011 } else {
9012 /* Found an unused trace bucket, so record the trace here */
9013 if (trace->depth != 0) {
9014 /* this slot previously used but not currently in use */
9015 mleak_table.trace_overwrites++;
9016 }
9017 mleak_table.trace_recorded++;
9018 trace->allocs = 1;
9019 memcpy(trace->addr, bt, (depth * sizeof(uintptr_t)));
9020 trace->depth = depth;
9021 trace->collisions = 0;
9022 }
9023
9024 /* Step 2: Store the allocation record in the allocations array */
9025 if (allocation->element != NULL) {
9026 /*
9027 * Replace an existing allocation. No need to preserve
9028 * because only a subset of the allocations are being
9029 * recorded anyway.
9030 */
9031 mleak_table.alloc_collisions++;
9032 } else if (allocation->trace_index != 0) {
9033 mleak_table.alloc_overwrites++;
9034 }
9035 allocation->element = addr;
9036 allocation->trace_index = trace_index;
9037 allocation->count = num;
9038 mleak_table.alloc_recorded++;
9039 mleak_table.outstanding_allocs++;
9040
9041 lck_mtx_unlock(mleak_lock);
9042 return TRUE;
9043}
9044
9045static void
9046mleak_free(mcache_obj_t *addr)
9047{
9048 while (addr != NULL) {
9049 struct mallocation *allocation = &mleak_allocations
9050 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
9051
9052 if (allocation->element == addr &&
9053 allocation->trace_index < mleak_trace_buckets) {
9054 lck_mtx_lock_spin(mleak_lock);
9055 if (allocation->element == addr &&
9056 allocation->trace_index < mleak_trace_buckets) {
9057 struct mtrace *trace;
9058 trace = &mleak_traces[allocation->trace_index];
9059 /* allocs = 0 means trace bucket is unused */
9060 if (trace->allocs > 0) {
9061 trace->allocs--;
9062 }
9063 if (trace->allocs == 0) {
9064 trace->depth = 0;
9065 }
9066 /* NULL element means alloc bucket is unused */
9067 allocation->element = NULL;
9068 mleak_table.outstanding_allocs--;
9069 }
9070 lck_mtx_unlock(mleak_lock);
9071 }
9072 addr = addr->obj_next;
9073 }
9074}
9075
9076static void
9077mleak_sort_traces()
9078{
9079 int i, j, k;
9080 struct mtrace *swap;
9081
9082 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
9083 mleak_top_trace[i] = NULL;
9084 }
9085
9086 for (i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) {
9087 if (mleak_traces[i].allocs <= 0) {
9088 continue;
9089 }
9090
9091 mleak_top_trace[j] = &mleak_traces[i];
9092 for (k = j; k > 0; k--) {
9093 if (mleak_top_trace[k]->allocs <=
9094 mleak_top_trace[k - 1]->allocs) {
9095 break;
9096 }
9097
9098 swap = mleak_top_trace[k - 1];
9099 mleak_top_trace[k - 1] = mleak_top_trace[k];
9100 mleak_top_trace[k] = swap;
9101 }
9102 j++;
9103 }
9104
9105 j--;
9106 for (; i < mleak_trace_buckets; i++) {
9107 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) {
9108 continue;
9109 }
9110
9111 mleak_top_trace[j] = &mleak_traces[i];
9112
9113 for (k = j; k > 0; k--) {
9114 if (mleak_top_trace[k]->allocs <=
9115 mleak_top_trace[k - 1]->allocs) {
9116 break;
9117 }
9118
9119 swap = mleak_top_trace[k - 1];
9120 mleak_top_trace[k - 1] = mleak_top_trace[k];
9121 mleak_top_trace[k] = swap;
9122 }
9123 }
9124}
9125
9126static void
9127mleak_update_stats()
9128{
9129 mleak_trace_stat_t *mltr;
9130 int i;
9131
9132 VERIFY(mleak_stat != NULL);
9133#ifdef __LP64__
9134 VERIFY(mleak_stat->ml_isaddr64);
9135#else
9136 VERIFY(!mleak_stat->ml_isaddr64);
9137#endif /* !__LP64__ */
9138 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
9139
9140 mleak_sort_traces();
9141
9142 mltr = &mleak_stat->ml_trace[0];
9143 bzero(mltr, sizeof(*mltr) * MLEAK_NUM_TRACES);
9144 for (i = 0; i < MLEAK_NUM_TRACES; i++) {
9145 int j;
9146
9147 if (mleak_top_trace[i] == NULL ||
9148 mleak_top_trace[i]->allocs == 0) {
9149 continue;
9150 }
9151
9152 mltr->mltr_collisions = mleak_top_trace[i]->collisions;
9153 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount;
9154 mltr->mltr_allocs = mleak_top_trace[i]->allocs;
9155 mltr->mltr_depth = mleak_top_trace[i]->depth;
9156
9157 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
9158 for (j = 0; j < mltr->mltr_depth; j++) {
9159 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
9160 }
9161
9162 mltr++;
9163 }
9164}
9165
9166static struct mbtypes {
9167 int mt_type;
9168 const char *mt_name;
9169} mbtypes[] = {
9170 { MT_DATA, "data" },
9171 { MT_OOBDATA, "oob data" },
9172 { MT_CONTROL, "ancillary data" },
9173 { MT_HEADER, "packet headers" },
9174 { MT_SOCKET, "socket structures" },
9175 { MT_PCB, "protocol control blocks" },
9176 { MT_RTABLE, "routing table entries" },
9177 { MT_HTABLE, "IMP host table entries" },
9178 { MT_ATABLE, "address resolution tables" },
9179 { MT_FTABLE, "fragment reassembly queue headers" },
9180 { MT_SONAME, "socket names and addresses" },
9181 { MT_SOOPTS, "socket options" },
9182 { MT_RIGHTS, "access rights" },
9183 { MT_IFADDR, "interface addresses" },
9184 { MT_TAG, "packet tags" },
9185 { 0, NULL }
9186};
9187
9188#define MBUF_DUMP_BUF_CHK() { \
9189 clen -= k; \
9190 if (clen < 1) \
9191 goto done; \
9192 c += k; \
9193}
9194
9195static char *
9196mbuf_dump(void)
9197{
9198 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct,
9199 totreturned = 0;
9200 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
9201 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
9202 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
9203 int nmbtypes = sizeof(mbstat.m_mtypes) / sizeof(short);
9204 uint8_t seen[256];
9205 struct mbtypes *mp;
9206 mb_class_stat_t *sp;
9207 mleak_trace_stat_t *mltr;
9208 char *c = mbuf_dump_buf;
9209 int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
9210 struct mbuf_watchdog_defunct_args args = {};
9211
9212 mbuf_dump_buf[0] = '\0';
9213
9214 /* synchronize all statistics in the mbuf table */
9215 mbuf_stat_sync();
9216 mbuf_mtypes_sync(TRUE);
9217
9218 sp = &mb_stat->mbs_class[0];
9219 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
9220 u_int32_t mem;
9221
9222 if (m_class(i) == MC_MBUF) {
9223 m_mbufs = sp->mbcl_active;
9224 } else if (m_class(i) == MC_CL) {
9225 m_clfree = sp->mbcl_total - sp->mbcl_active;
9226 } else if (m_class(i) == MC_BIGCL) {
9227 m_bigclfree = sp->mbcl_total - sp->mbcl_active;
9228 } else if (njcl > 0 && m_class(i) == MC_16KCL) {
9229 m_16kclfree = sp->mbcl_total - sp->mbcl_active;
9230 m_16kclusters = sp->mbcl_total;
9231 } else if (m_class(i) == MC_MBUF_CL) {
9232 m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
9233 } else if (m_class(i) == MC_MBUF_BIGCL) {
9234 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
9235 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
9236 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
9237 }
9238
9239 mem = sp->mbcl_ctotal * sp->mbcl_size;
9240 totmem += mem;
9241 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
9242 sp->mbcl_size;
9243 totreturned += sp->mbcl_release_cnt;
9244 }
9245
9246 /* adjust free counts to include composite caches */
9247 m_clfree += m_mbufclfree;
9248 m_bigclfree += m_mbufbigclfree;
9249 m_16kclfree += m_mbuf16kclfree;
9250
9251 totmbufs = 0;
9252 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9253 totmbufs += mbstat.m_mtypes[mp->mt_type];
9254 }
9255 if (totmbufs > m_mbufs) {
9256 totmbufs = m_mbufs;
9257 }
9258 k = scnprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
9259 MBUF_DUMP_BUF_CHK();
9260
9261 bzero(&seen, sizeof(seen));
9262 for (mp = mbtypes; mp->mt_name != NULL; mp++) {
9263 if (mbstat.m_mtypes[mp->mt_type] != 0) {
9264 seen[mp->mt_type] = 1;
9265 k = scnprintf(c, clen, "\t%u mbufs allocated to %s\n",
9266 mbstat.m_mtypes[mp->mt_type], mp->mt_name);
9267 MBUF_DUMP_BUF_CHK();
9268 }
9269 }
9270 seen[MT_FREE] = 1;
9271 for (i = 0; i < nmbtypes; i++) {
9272 if (!seen[i] && mbstat.m_mtypes[i] != 0) {
9273 k = scnprintf(c, clen, "\t%u mbufs allocated to "
9274 "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
9275 MBUF_DUMP_BUF_CHK();
9276 }
9277 }
9278 if ((m_mbufs - totmbufs) > 0) {
9279 k = scnprintf(c, clen, "\t%lu mbufs allocated to caches\n",
9280 m_mbufs - totmbufs);
9281 MBUF_DUMP_BUF_CHK();
9282 }
9283 k = scnprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
9284 "%u/%u mbuf 4KB clusters in use\n",
9285 (unsigned int)(mbstat.m_clusters - m_clfree),
9286 (unsigned int)mbstat.m_clusters,
9287 (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
9288 (unsigned int)mbstat.m_bigclusters);
9289 MBUF_DUMP_BUF_CHK();
9290
9291 if (njcl > 0) {
9292 k = scnprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
9293 m_16kclusters - m_16kclfree, m_16kclusters,
9294 njclbytes / 1024);
9295 MBUF_DUMP_BUF_CHK();
9296 }
9297 totused = totmem - totfree;
9298 if (totmem == 0) {
9299 totpct = 0;
9300 } else if (totused < (ULONG_MAX / 100)) {
9301 totpct = (totused * 100) / totmem;
9302 } else {
9303 u_long totmem1 = totmem / 100;
9304 u_long totused1 = totused / 100;
9305 totpct = (totused1 * 100) / totmem1;
9306 }
9307 k = scnprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
9308 "in use)\n", totmem / 1024, totpct);
9309 MBUF_DUMP_BUF_CHK();
9310 k = scnprintf(c, clen, "%lu KB returned to the system\n",
9311 totreturned / 1024);
9312 MBUF_DUMP_BUF_CHK();
9313
9314 net_update_uptime();
9315
9316 k = scnprintf(c, clen,
9317 "worker thread runs: %u, expansions: %llu, cl %llu/%llu, "
9318 "bigcl %llu/%llu, 16k %llu/%llu\n", mbuf_worker_run_cnt,
9319 mb_expand_cnt, mb_expand_cl_cnt, mb_expand_cl_total,
9320 mb_expand_bigcl_cnt, mb_expand_bigcl_total, mb_expand_16kcl_cnt,
9321 mb_expand_16kcl_total);
9322 MBUF_DUMP_BUF_CHK();
9323 if (mbuf_worker_last_runtime != 0) {
9324 k = scnprintf(c, clen, "worker thread last run time: "
9325 "%llu (%llu seconds ago)\n",
9326 mbuf_worker_last_runtime,
9327 net_uptime() - mbuf_worker_last_runtime);
9328 MBUF_DUMP_BUF_CHK();
9329 }
9330 if (mbuf_drain_last_runtime != 0) {
9331 k = scnprintf(c, clen, "drain routine last run time: "
9332 "%llu (%llu seconds ago)\n",
9333 mbuf_drain_last_runtime,
9334 net_uptime() - mbuf_drain_last_runtime);
9335 MBUF_DUMP_BUF_CHK();
9336 }
9337
9338 /*
9339 * Log where the most mbufs have accumulated:
9340 * - Process socket buffers
9341 * - TCP reassembly queue
9342 * - Interface AQM queue (output) and DLIL input queue
9343 */
9344 args.non_blocking = true;
9345 proc_iterate(PROC_ALLPROCLIST,
9346 mbuf_watchdog_defunct_iterate, &args, NULL, NULL);
9347 if (args.top_app != NULL) {
9348 k = scnprintf(c, clen, "\ntop proc mbuf space %u bytes by %s:%d\n",
9349 args.top_app_space_used,
9350 proc_name_address(args.top_app),
9351 proc_pid(args.top_app));
9352 proc_rele(args.top_app);
9353 }
9354 MBUF_DUMP_BUF_CHK();
9355
9356#if INET
9357 k = dump_tcp_reass_qlen(c, clen);
9358 MBUF_DUMP_BUF_CHK();
9359#endif /* INET */
9360
9361#if MPTCP
9362 k = dump_mptcp_reass_qlen(c, clen);
9363 MBUF_DUMP_BUF_CHK();
9364#endif /* MPTCP */
9365
9366#if NETWORKING
9367 k = dlil_dump_top_if_qlen(c, clen);
9368 MBUF_DUMP_BUF_CHK();
9369#endif /* NETWORKING */
9370
9371 /* mbuf leak detection statistics */
9372 mleak_update_stats();
9373
9374 k = scnprintf(c, clen, "\nmbuf leak detection table:\n");
9375 MBUF_DUMP_BUF_CHK();
9376 k = scnprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
9377 mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
9378 mleak_table.mleak_sample_factor);
9379 MBUF_DUMP_BUF_CHK();
9380 k = scnprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
9381 mleak_table.outstanding_allocs);
9382 MBUF_DUMP_BUF_CHK();
9383 k = scnprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
9384 mleak_table.alloc_recorded, mleak_table.trace_recorded);
9385 MBUF_DUMP_BUF_CHK();
9386 k = scnprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
9387 mleak_table.alloc_collisions, mleak_table.trace_collisions);
9388 MBUF_DUMP_BUF_CHK();
9389 k = scnprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
9390 mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
9391 MBUF_DUMP_BUF_CHK();
9392 k = scnprintf(c, clen, "\tlock conflicts: %llu\n\n",
9393 mleak_table.total_conflicts);
9394 MBUF_DUMP_BUF_CHK();
9395
9396 k = scnprintf(c, clen, "top %d outstanding traces:\n",
9397 mleak_stat->ml_cnt);
9398 MBUF_DUMP_BUF_CHK();
9399 for (i = 0; i < mleak_stat->ml_cnt; i++) {
9400 mltr = &mleak_stat->ml_trace[i];
9401 k = scnprintf(c, clen, "[%d] %llu outstanding alloc(s), "
9402 "%llu hit(s), %llu collision(s)\n", (i + 1),
9403 mltr->mltr_allocs, mltr->mltr_hitcount,
9404 mltr->mltr_collisions);
9405 MBUF_DUMP_BUF_CHK();
9406 }
9407
9408 if (mleak_stat->ml_isaddr64) {
9409 k = scnprintf(c, clen, MB_LEAK_HDR_64);
9410 } else {
9411 k = scnprintf(c, clen, MB_LEAK_HDR_32);
9412 }
9413 MBUF_DUMP_BUF_CHK();
9414
9415 for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
9416 k = scnprintf(c, clen, "%2d: ", (i + 1));
9417 MBUF_DUMP_BUF_CHK();
9418 for (j = 0; j < mleak_stat->ml_cnt; j++) {
9419 mltr = &mleak_stat->ml_trace[j];
9420 if (i < mltr->mltr_depth) {
9421 if (mleak_stat->ml_isaddr64) {
9422 k = scnprintf(c, clen, "0x%0llx ",
9423 (uint64_t)VM_KERNEL_UNSLIDE(
9424 mltr->mltr_addr[i]));
9425 } else {
9426 k = scnprintf(c, clen,
9427 "0x%08x ",
9428 (uint32_t)VM_KERNEL_UNSLIDE(
9429 mltr->mltr_addr[i]));
9430 }
9431 } else {
9432 if (mleak_stat->ml_isaddr64) {
9433 k = scnprintf(c, clen,
9434 MB_LEAK_SPACING_64);
9435 } else {
9436 k = scnprintf(c, clen,
9437 MB_LEAK_SPACING_32);
9438 }
9439 }
9440 MBUF_DUMP_BUF_CHK();
9441 }
9442 k = scnprintf(c, clen, "\n");
9443 MBUF_DUMP_BUF_CHK();
9444 }
9445
9446done:
9447 return mbuf_dump_buf;
9448}
9449
9450#undef MBUF_DUMP_BUF_CHK
9451#endif /* CONFIG_MBUF_MCACHE */
9452
9453/*
9454 * Convert between a regular and a packet header mbuf. Caller is responsible
9455 * for setting or clearing M_PKTHDR; this routine does the rest of the work.
9456 */
9457int
9458m_reinit(struct mbuf *m, int hdr)
9459{
9460 int ret = 0;
9461
9462 if (hdr) {
9463 VERIFY(!(m->m_flags & M_PKTHDR));
9464 if (!(m->m_flags & M_EXT) &&
9465 (m->m_data != (uintptr_t)m->m_dat || m->m_len > 0)) {
9466 /*
9467 * If there's no external cluster attached and the
9468 * mbuf appears to contain user data, we cannot
9469 * safely convert this to a packet header mbuf,
9470 * as the packet header structure might overlap
9471 * with the data.
9472 */
9473 printf("%s: cannot set M_PKTHDR on altered mbuf %llx, "
9474 "m_data %llx (expected %llx), "
9475 "m_len %d (expected 0)\n",
9476 __func__,
9477 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m),
9478 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)m->m_data),
9479 (uint64_t)VM_KERNEL_ADDRPERM((uintptr_t)(m->m_dat)), m->m_len);
9480 ret = EBUSY;
9481 } else {
9482 VERIFY((m->m_flags & M_EXT) || m->m_data == (uintptr_t)m->m_dat);
9483 m->m_flags |= M_PKTHDR;
9484 MBUF_INIT_PKTHDR(m);
9485 }
9486 } else {
9487 /* Check for scratch area overflow */
9488 m_redzone_verify(m);
9489 /* Free the aux data and tags if there is any */
9490 m_tag_delete_chain(m);
9491 m_do_tx_compl_callback(m, NULL);
9492 m->m_flags &= ~M_PKTHDR;
9493 }
9494
9495 return ret;
9496}
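
/*
 * Illustrative sketch (not part of the build): demoting a packet header
 * mbuf to a plain mbuf and promoting it back.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_toggle_pkthdr(struct mbuf *m)
{
	/* Drops tags and completion callbacks, then clears M_PKTHDR. */
	(void) m_reinit(m, 0);

	/*
	 * Converting back succeeds only if the mbuf has a cluster
	 * attached, or if m_data still points at m_dat with m_len == 0;
	 * otherwise m_reinit() returns EBUSY and leaves the mbuf alone.
	 */
	return m_reinit(m, 1);
}
#endif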
9497
9498int
9499m_ext_set_prop(struct mbuf *m, uint32_t o, uint32_t n)
9500{
9501 ASSERT(m->m_flags & M_EXT);
9502 return os_atomic_cmpxchg(&MEXT_PRIV(m), o, n, acq_rel);
9503}
9504
9505uint32_t
9506m_ext_get_prop(struct mbuf *m)
9507{
9508 ASSERT(m->m_flags & M_EXT);
9509 return MEXT_PRIV(m);
9510}
9511
9512int
9513m_ext_paired_is_active(struct mbuf *m)
9514{
9515 return MBUF_IS_PAIRED(m) ? (MEXT_PREF(m) > MEXT_MINREF(m)) : 1;
9516}
9517
9518void
9519m_ext_paired_activate(struct mbuf *m)
9520{
9521 struct ext_ref *rfa;
9522 int hdr, type;
9523 caddr_t extbuf;
9524 m_ext_free_func_t extfree;
9525 u_int extsize;
9526
9527 VERIFY(MBUF_IS_PAIRED(m));
9528 VERIFY(MEXT_REF(m) == MEXT_MINREF(m));
9529 VERIFY(MEXT_PREF(m) == MEXT_MINREF(m));
9530
9531 hdr = (m->m_flags & M_PKTHDR);
9532 type = m->m_type;
9533 extbuf = m->m_ext.ext_buf;
9534 extfree = m_get_ext_free(m);
9535 extsize = m->m_ext.ext_size;
9536 rfa = m_get_rfa(m);
9537
9538 VERIFY(extbuf != NULL && rfa != NULL);
9539
9540 /*
9541 * Safe to reinitialize packet header tags, since it's
9542 * already taken care of at m_free() time. Similar to
9543 * what's done in m_clattach() for the cluster. Bump
9544 * up MEXT_PREF to indicate activation.
9545 */
9546 MBUF_INIT(m, hdr, type);
9547 MEXT_INIT(m, extbuf, extsize, extfree, (caddr_t)m, rfa,
9548 1, 1, 2, EXTF_PAIRED, MEXT_PRIV(m), m);
9549}
9550
9551void
9552m_scratch_init(struct mbuf *m)
9553{
9554 struct pkthdr *pkt = &m->m_pkthdr;
9555
9556 VERIFY(m->m_flags & M_PKTHDR);
9557
9558 /* See comments in <rdar://problem/14040693> */
9559 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
9560 panic_plain("Invalid attempt to modify guarded module-private "
9561 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
9562 /* NOTREACHED */
9563 }
9564
9565 bzero(&pkt->pkt_mpriv, sizeof(pkt->pkt_mpriv));
9566}
9567
9568/*
9569 * This routine is reserved for mbuf_get_driver_scratch(); clients inside
9570 * xnu that intend on utilizing the module-private area should directly
9571 * refer to the pkt_mpriv structure in the pkthdr. They are also expected
9572 * to set and clear PKTF_PRIV_GUARDED, while owning the packet and prior
9573 * to handing it off to another module, respectively.
9574 */
9575u_int32_t
9576m_scratch_get(struct mbuf *m, u_int8_t **p)
9577{
9578 struct pkthdr *pkt = &m->m_pkthdr;
9579
9580 VERIFY(m->m_flags & M_PKTHDR);
9581
9582 /* See comments in <rdar://problem/14040693> */
9583 if (pkt->pkt_flags & PKTF_PRIV_GUARDED) {
9584 panic_plain("Invalid attempt to access guarded module-private "
9585 "area: mbuf %p, pkt_flags 0x%x\n", m, pkt->pkt_flags);
9586 /* NOTREACHED */
9587 }
9588
9589#if CONFIG_MBUF_MCACHE
9590 if (mcltrace) {
9591 mcache_audit_t *mca;
9592
9593 lck_mtx_lock(mbuf_mlock);
9594 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
9595 if (mca->mca_uflags & MB_SCVALID) {
9596 mcl_audit_scratch(mca);
9597 }
9598 lck_mtx_unlock(mbuf_mlock);
9599 }
9600#endif /* CONFIG_MBUF_MCACHE */
9601
9602 *p = (u_int8_t *)&pkt->pkt_mpriv;
9603 return sizeof(pkt->pkt_mpriv);
9604}
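
/*
 * Illustrative sketch (not part of the build) of the ownership discipline
 * described above: obtain the scratch area, then keep PKTF_PRIV_GUARDED set
 * for as long as this module owns the packet.  The helper name and cookie
 * are hypothetical.
 */
#if 0
static void
example_use_scratch(struct mbuf *m, uint32_t cookie)
{
	u_int8_t *scratch;
	u_int32_t scratch_len;

	VERIFY(m->m_flags & M_PKTHDR);

	/* Must be called before the area is guarded, or it will panic. */
	scratch_len = m_scratch_get(m, &scratch);
	VERIFY(scratch_len >= sizeof(cookie));
	bcopy(&cookie, scratch, sizeof(cookie));

	m->m_pkthdr.pkt_flags |= PKTF_PRIV_GUARDED;
	/* ... hand the packet around within this module ... */
	m->m_pkthdr.pkt_flags &= ~PKTF_PRIV_GUARDED;
}
#endif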
9605
9606void
9607m_add_crumb(struct mbuf *m, uint16_t crumb)
9608{
9609 VERIFY(m->m_flags & M_PKTHDR);
9610
9611 m->m_pkthdr.pkt_crumbs |= crumb;
9612}
9613
9614static void
9615m_redzone_init(struct mbuf *m)
9616{
9617 VERIFY(m->m_flags & M_PKTHDR);
9618 /*
9619	 * Each mbuf has a unique red zone pattern, which is an XOR
9620 * of the red zone cookie and the address of the mbuf.
9621 */
9622 m->m_pkthdr.redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
9623}
9624
9625static void
9626m_redzone_verify(struct mbuf *m)
9627{
9628 u_int32_t mb_redzone;
9629
9630 VERIFY(m->m_flags & M_PKTHDR);
9631
9632 mb_redzone = ((u_int32_t)(uintptr_t)m) ^ mb_redzone_cookie;
9633 if (m->m_pkthdr.redzone != mb_redzone) {
9634 panic("mbuf %p redzone violation with value 0x%x "
9635 "(instead of 0x%x, using cookie 0x%x)\n",
9636 m, m->m_pkthdr.redzone, mb_redzone, mb_redzone_cookie);
9637 /* NOTREACHED */
9638 }
9639}
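/*
 * Invariant (illustrative, not compiled): for any pkthdr mbuf initialized by
 * m_redzone_init(), the following holds until the header is torn down,
 * whatever the boot-time random cookie happens to be:
 *
 *	(m->m_pkthdr.redzone ^ mb_redzone_cookie) == (u_int32_t)(uintptr_t)m
 *
 * m_redzone_verify() recomputes the right-hand side and panics on a mismatch,
 * catching writes that ran past the scratch area into the red zone.
 */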
9640
9641__private_extern__ inline void
9642m_set_ext(struct mbuf *m, struct ext_ref *rfa, m_ext_free_func_t ext_free,
9643 caddr_t ext_arg)
9644{
9645 VERIFY(m->m_flags & M_EXT);
9646 if (rfa != NULL) {
9647 m_set_rfa(m, rfa);
9648 if (ext_free != NULL) {
9649 rfa->ext_token = ((uintptr_t)&rfa->ext_token) ^
9650 mb_obscure_extfree;
9651 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ rfa->ext_token;
9652 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9653 if (ext_arg != NULL) {
9654 m->m_ext.ext_arg =
9655 (caddr_t)(((uintptr_t)ext_arg) ^ rfa->ext_token);
9656 } else {
9657 m->m_ext.ext_arg = NULL;
9658 }
9659 } else {
9660 rfa->ext_token = 0;
9661 m->m_ext.ext_free = NULL;
9662 m->m_ext.ext_arg = NULL;
9663 }
9664 } else {
9665 /*
9666		 * If we are going to lose the cookie in ext_token by
9667 * resetting the rfa, we should use the global cookie
9668 * to obscure the ext_free and ext_arg pointers.
9669 */
9670 if (ext_free != NULL) {
9671 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, ext_free) ^ mb_obscure_extfree;
9672 m->m_ext.ext_free = ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9673 if (ext_arg != NULL) {
9674 m->m_ext.ext_arg =
9675 (caddr_t)((uintptr_t)ext_arg ^
9676 mb_obscure_extfree);
9677 } else {
9678 m->m_ext.ext_arg = NULL;
9679 }
9680 } else {
9681 m->m_ext.ext_free = NULL;
9682 m->m_ext.ext_arg = NULL;
9683 }
9684 m->m_ext.ext_refflags = NULL;
9685 }
9686}
9687
9688__private_extern__ inline struct ext_ref *
9689m_get_rfa(struct mbuf *m)
9690{
9691 if (m->m_ext.ext_refflags == NULL) {
9692 return NULL;
9693 } else {
9694 return (struct ext_ref *)(((uintptr_t)m->m_ext.ext_refflags) ^ mb_obscure_extref);
9695 }
9696}
9697
9698static inline void
9699m_set_rfa(struct mbuf *m, struct ext_ref *rfa)
9700{
9701 if (rfa != NULL) {
9702 m->m_ext.ext_refflags =
9703 (struct ext_ref *)(((uintptr_t)rfa) ^ mb_obscure_extref);
9704 } else {
9705 m->m_ext.ext_refflags = NULL;
9706 }
9707}
9708
9709__private_extern__ inline m_ext_free_func_t
9710m_get_ext_free(struct mbuf *m)
9711{
9712 struct ext_ref *rfa;
9713 if (m->m_ext.ext_free == NULL) {
9714 return NULL;
9715 }
9716
9717 rfa = m_get_rfa(m);
9718 if (rfa == NULL) {
9719 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ mb_obscure_extfree;
9720 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9721 } else {
9722 uintptr_t ext_free_val = ptrauth_nop_cast(uintptr_t, m->m_ext.ext_free) ^ rfa->ext_token;
9723 return ptrauth_nop_cast(m_ext_free_func_t, ext_free_val);
9724 }
9725}
9726
9727__private_extern__ inline caddr_t
9728m_get_ext_arg(struct mbuf *m)
9729{
9730 struct ext_ref *rfa;
9731 if (m->m_ext.ext_arg == NULL) {
9732 return NULL;
9733 }
9734
9735 rfa = m_get_rfa(m);
9736 if (rfa == NULL) {
9737 return (caddr_t)((uintptr_t)m->m_ext.ext_arg ^ mb_obscure_extfree);
9738 } else {
9739 return (caddr_t)(((uintptr_t)m->m_ext.ext_arg) ^
9740 rfa->ext_token);
9741 }
9742}
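/*
 * Illustrative round trip (not compiled; "my_free" and "my_arg" are
 * hypothetical). With an rfa attached, m_set_ext() stores ext_free/ext_arg
 * obscured with the per-rfa token, and the getters undo the obscuring:
 *
 *	m_set_ext(m, rfa, my_free, my_arg);
 *	VERIFY(m_get_ext_free(m) == my_free);
 *	VERIFY(m_get_ext_arg(m) == my_arg);
 *
 * The raw m->m_ext fields therefore never hold the plain pointers, which
 * makes them harder to forge or leak.
 */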
9743
9744#if CONFIG_MBUF_MCACHE
9745/*
9746 * Simple routine to avoid taking the lock when we can't run the
9747 * mbuf drain.
9748 */
9749static int
9750mbuf_drain_checks(boolean_t ignore_waiters)
9751{
9752 if (mb_drain_maxint == 0) {
9753 return 0;
9754 }
9755 if (!ignore_waiters && mb_waiters != 0) {
9756 return 0;
9757 }
9758
9759 return 1;
9760}
9761
9762/*
9763 * Called by the VM when there's memory pressure or when we've exhausted
9764 * the 4k/16k reserved space.
9765 */
9766static void
9767mbuf_drain_locked(boolean_t ignore_waiters)
9768{
9769 mbuf_class_t mc;
9770 mcl_slab_t *sp, *sp_tmp, *nsp;
9771 unsigned int num, k, interval, released = 0;
9772 unsigned long total_mem = 0, use_mem = 0;
9773 boolean_t ret, purge_caches = FALSE;
9774 ppnum_t offset;
9775 mcache_obj_t *obj;
9776 unsigned long per;
9777 static unsigned char scratch[32];
9778 static ppnum_t scratch_pa = 0;
9779
9780 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
9781 if (!mbuf_drain_checks(ignore_waiters)) {
9782 return;
9783 }
9784 if (scratch_pa == 0) {
9785 bzero(scratch, sizeof(scratch));
9786 scratch_pa = pmap_find_phys(kernel_pmap, (addr64_t)scratch);
9787 VERIFY(scratch_pa);
9788 } else if (mclverify) {
9789 /*
9790 * Panic if a driver wrote to our scratch memory.
9791 */
9792 for (k = 0; k < sizeof(scratch); k++) {
9793 if (scratch[k]) {
9794 panic("suspect DMA to freed address");
9795 }
9796 }
9797 }
9798 /*
9799 * Don't free memory too often as that could cause excessive
9800	 * waiting times for mbufs.  Also purge the caches if the previous
9801	 * drain ran within the last five drain intervals (mb_drain_maxint * 5).
9802 */
9803 if (mbuf_drain_last_runtime != 0) {
9804 interval = net_uptime() - mbuf_drain_last_runtime;
9805 if (interval <= mb_drain_maxint) {
9806 return;
9807 }
9808 if (interval <= mb_drain_maxint * 5) {
9809 purge_caches = TRUE;
9810 }
9811 }
9812 mbuf_drain_last_runtime = net_uptime();
9813 /*
9814 * Don't free any memory if we're using 60% or more.
9815 */
9816 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9817 total_mem += m_total(mc) * m_maxsize(mc);
9818 use_mem += m_active(mc) * m_maxsize(mc);
9819 }
9820 per = (use_mem * 100) / total_mem;
9821 if (per >= 60) {
9822 return;
9823 }
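	/*
	 * Example: with total_mem == 64 MB, use_mem == 32 MB gives per == 50
	 * and the drain continues, while use_mem == 40 MB gives per == 62 and
	 * the check above returns without freeing anything.
	 */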
9824 /*
9825	 * Purge all the caches.  This effectively disables caching
9826	 * for a few seconds, but the mbuf worker thread will re-enable
9827	 * them shortly afterwards.
9828 */
9829 if (purge_caches == TRUE) {
9830 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9831 if (m_total(mc) < m_avgtotal(mc)) {
9832 continue;
9833 }
9834 lck_mtx_unlock(mbuf_mlock);
9835 ret = mcache_purge_cache(m_cache(mc), FALSE);
9836 lck_mtx_lock(mbuf_mlock);
9837 if (ret == TRUE) {
9838 m_purge_cnt(mc)++;
9839 }
9840 }
9841 }
9842 /*
9843 * Move the objects from the composite class freelist to
9844 * the rudimentary slabs list, but keep at least 10% of the average
9845 * total in the freelist.
9846 */
9847 for (mc = 0; mc < NELEM(mbuf_table); mc++) {
9848 while (m_cobjlist(mc) &&
9849 m_total(mc) < m_avgtotal(mc) &&
9850 m_infree(mc) > 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9851 obj = m_cobjlist(mc);
9852 m_cobjlist(mc) = obj->obj_next;
9853 obj->obj_next = NULL;
9854 num = cslab_free(mc, obj, 1);
9855 VERIFY(num == 1);
9856 m_free_cnt(mc)++;
9857 m_infree(mc)--;
9858 /* cslab_free() handles m_total */
9859 }
9860 }
9861 /*
9862 * Free the buffers present in the slab list up to 10% of the total
9863 * average per class.
9864 *
9865 * We walk the list backwards in an attempt to reduce fragmentation.
9866 */
9867 for (mc = NELEM(mbuf_table) - 1; (int)mc >= 0; mc--) {
9868 TAILQ_FOREACH_SAFE(sp, &m_slablist(mc), sl_link, sp_tmp) {
9869 /*
9870 * Process only unused slabs occupying memory.
9871 */
9872 if (sp->sl_refcnt != 0 || sp->sl_len == 0 ||
9873 sp->sl_base == NULL) {
9874 continue;
9875 }
9876 if (m_total(mc) < m_avgtotal(mc) ||
9877 m_infree(mc) < 0.1 * m_avgtotal(mc) + m_minlimit(mc)) {
9878 break;
9879 }
9880 slab_remove(sp, mc);
9881 switch (mc) {
9882 case MC_MBUF:
9883 m_infree(mc) -= NMBPG;
9884 m_total(mc) -= NMBPG;
9885 if (mclaudit != NULL) {
9886 mcl_audit_free(sp->sl_base, NMBPG);
9887 }
9888 break;
9889 case MC_CL:
9890 m_infree(mc) -= NCLPG;
9891 m_total(mc) -= NCLPG;
9892 if (mclaudit != NULL) {
9893 mcl_audit_free(sp->sl_base, NMBPG);
9894 }
9895 break;
9896 case MC_BIGCL:
9897 {
9898 m_infree(mc) -= NBCLPG;
9899 m_total(mc) -= NBCLPG;
9900 if (mclaudit != NULL) {
9901 mcl_audit_free(sp->sl_base, NMBPG);
9902 }
9903 break;
9904 }
9905 case MC_16KCL:
9906 m_infree(mc)--;
9907 m_total(mc)--;
9908 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
9909 nsp = nsp->sl_next;
9910 VERIFY(nsp->sl_refcnt == 0 &&
9911 nsp->sl_base != NULL &&
9912 nsp->sl_len == 0);
9913 slab_init(nsp, 0, 0, NULL, NULL, 0, 0,
9914 0);
9915 nsp->sl_flags = 0;
9916 }
9917 if (mclaudit != NULL) {
9918 if (sp->sl_len == PAGE_SIZE) {
9919 mcl_audit_free(sp->sl_base,
9920 NMBPG);
9921 } else {
9922 mcl_audit_free(sp->sl_base, 1);
9923 }
9924 }
9925 break;
9926 default:
9927 /*
9928 * The composite classes have their own
9929 * freelist (m_cobjlist), so we only
9930 * process rudimentary classes here.
9931 */
9932 VERIFY(0);
9933 }
9934 m_release_cnt(mc) += m_size(mc);
9935 released += m_size(mc);
9936 VERIFY(sp->sl_base != NULL &&
9937 sp->sl_len >= PAGE_SIZE);
9938 offset = MTOPG(sp->sl_base);
9939 /*
9940 * Make sure the IOMapper points to a valid, but
9941 * bogus, address. This should prevent further DMA
9942 * accesses to freed memory.
9943 */
9944 IOMapperInsertPage(mcl_paddr_base, offset, scratch_pa);
9945 mcl_paddr[offset] = 0;
9946 kmem_free(mb_map, (vm_offset_t)sp->sl_base,
9947 sp->sl_len);
9948 slab_init(sp, 0, 0, NULL, NULL, 0, 0, 0);
9949 sp->sl_flags = 0;
9950 }
9951 }
9952 mbstat.m_drain++;
9953 mbstat.m_bigclusters = m_total(MC_BIGCL);
9954 mbstat.m_clusters = m_total(MC_CL);
9955 mbstat.m_mbufs = m_total(MC_MBUF);
9956 mbuf_stat_sync();
9957 mbuf_mtypes_sync(TRUE);
9958}
9959
9960__private_extern__ void
9961mbuf_drain(boolean_t ignore_waiters)
9962{
9963 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
9964 if (!mbuf_drain_checks(ignore_waiters)) {
9965 return;
9966 }
9967 lck_mtx_lock(mbuf_mlock);
9968 mbuf_drain_locked(ignore_waiters);
9969 lck_mtx_unlock(mbuf_mlock);
9970}
9971
9972
9973static int
9974m_drain_force_sysctl SYSCTL_HANDLER_ARGS
9975{
9976#pragma unused(arg1, arg2)
9977 int val = 0, err;
9978
9979 err = sysctl_handle_int(oidp, &val, 0, req);
9980 if (err != 0 || req->newptr == USER_ADDR_NULL) {
9981 return err;
9982 }
9983 if (val) {
9984 mbuf_drain(TRUE);
9985 }
9986
9987 return err;
9988}
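/*
 * This handler backs the kern.ipc.mb_drain_force sysctl declared below; any
 * non-zero write forces a drain even if there are threads waiting for mbufs.
 * For example, from userland (as root):
 *
 *	sysctl -w kern.ipc.mb_drain_force=1
 */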
9989
9990#if DEBUG || DEVELOPMENT
9991__printflike(3, 4)
9992static void
9993_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
9994{
9995 va_list ap;
9996 struct timeval now;
9997 char str[384], p[256];
9998 int len;
9999
10000 LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
10001 if (mbwdog_logging == NULL) {
10002 /*
10003 * This might block under a mutex, which isn't really great,
10004 * but this happens once, so we'll live.
10005 */
10006 mbwdog_logging = zalloc_permanent(mbwdog_logging_size,
10007 ZALIGN_NONE);
10008 }
10009 va_start(ap, fmt);
10010 vsnprintf(p, sizeof(p), fmt, ap);
10011 va_end(ap);
10012 microuptime(&now);
10013 len = scnprintf(str, sizeof(str),
10014 "\n%ld.%d (%d/%llx) %s:%d %s",
10015 now.tv_sec, now.tv_usec,
10016 proc_getpid(current_proc()),
10017 (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
10018 func, line, p);
10019 if (len < 0) {
10020 return;
10021 }
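	/*
	 * If appending would overflow the buffer, drop the oldest half of
	 * the log: keep the newer half, slide it to the front of the buffer
	 * and re-terminate it before appending the new entry.
	 */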
10022 if (mbwdog_logging_used + len > mbwdog_logging_size) {
10023 mbwdog_logging_used = mbwdog_logging_used / 2;
10024 memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
10025 mbwdog_logging_size - mbwdog_logging_used);
10026 mbwdog_logging[mbwdog_logging_used] = 0;
10027 }
10028 strlcat(mbwdog_logging, str, mbwdog_logging_size);
10029 mbwdog_logging_used += len;
10030}
10031
10032#endif // DEBUG || DEVELOPMENT
10033
10034static void
10035mtracelarge_register(size_t size)
10036{
10037 int i;
10038 struct mtracelarge *trace;
10039 uintptr_t bt[MLEAK_STACK_DEPTH];
10040 unsigned int depth;
10041
10042 depth = backtrace(bt, MLEAK_STACK_DEPTH, NULL, NULL);
10043 /* Check if this entry is already on the list. */
10044 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10045 trace = &mtracelarge_table[i];
10046 if (trace->size == size && trace->depth == depth &&
10047 memcmp(bt, trace->addr, depth * sizeof(uintptr_t)) == 0) {
10048 return;
10049 }
10050 }
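	/* Not seen before: replace the first entry recorded for a smaller size. */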
10051 for (i = 0; i < MTRACELARGE_NUM_TRACES; i++) {
10052 trace = &mtracelarge_table[i];
10053 if (size > trace->size) {
10054 trace->depth = depth;
10055 memcpy(trace->addr, bt, depth * sizeof(uintptr_t));
10056 trace->size = size;
10057 break;
10058 }
10059 }
10060}
10061
10062#if DEBUG || DEVELOPMENT
10063
10064static int
10065mbuf_wd_dump_sysctl SYSCTL_HANDLER_ARGS
10066{
10067 char *str;
10068
10069 ifnet_head_lock_shared();
10070 lck_mtx_lock(mbuf_mlock);
10071
10072 str = mbuf_dump();
10073
10074 lck_mtx_unlock(mbuf_mlock);
10075 ifnet_head_done();
10076
10077 return sysctl_io_string(req, str, 0, 0, NULL);
10078}
10079
10080#endif /* DEBUG || DEVELOPMENT */
10081#endif /* CONFIG_MBUF_MCACHE */
10082
10083SYSCTL_DECL(_kern_ipc);
10084#if DEBUG || DEVELOPMENT
10085#if SKYWALK && CONFIG_MBUF_MCACHE
10086SYSCTL_UINT(_kern_ipc, OID_AUTO, mc_threshold_scale_factor,
10087 CTLFLAG_RW | CTLFLAG_LOCKED, &mc_threshold_scale_down_factor,
10088 MC_THRESHOLD_SCALE_DOWN_FACTOR,
10089 "scale down factor for mbuf cache thresholds");
10090#endif /* SKYWALK && CONFIG_MBUF_MCACHE */
10091#if CONFIG_MBUF_MCACHE
10092SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_wd_dump,
10093 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
10094 0, 0, mbuf_wd_dump_sysctl, "A", "mbuf watchdog dump");
10095#endif /* CONFIG_MBUF_MCACHE */
10096#endif /* DEBUG || DEVELOPMENT */
10097SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
10098 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10099 0, 0, mbstat_sysctl, "S,mbstat", "");
10100SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
10101 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10102 0, 0, mb_stat_sysctl, "S,mb_stat", "");
10103#if CONFIG_MBUF_MCACHE
10104SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
10105 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10106 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
10107SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
10108 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
10109 0, 0, mleak_table_sysctl, "S,mleak_table", "");
10110SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
10111 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
10112SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
10113 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
10114SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
10115 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
10116SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_drain_force,
10117 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, NULL, 0,
10118 m_drain_force_sysctl, "I",
10119 "Forces the mbuf garbage collection to run");
10120SYSCTL_INT(_kern_ipc, OID_AUTO, mb_drain_maxint,
10121 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_drain_maxint, 0,
10122    "Minimum time interval (seconds) between mbuf garbage collection runs");
10123#endif /* CONFIG_MBUF_MCACHE */
10124SYSCTL_INT(_kern_ipc, OID_AUTO, mb_memory_pressure_percentage,
10125 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_memory_pressure_percentage, 0,
10126    "Usage percentage at which memory pressure is triggered for an mbuf class");
10127#if CONFIG_MBUF_MCACHE
10128static int mb_uses_mcache = 1;
10129#else
10130static int mb_uses_mcache = 0;
10131#endif /* CONFIG_MBUF_MCACHE */
10132SYSCTL_INT(_kern_ipc, OID_AUTO, mb_uses_mcache,
10133 CTLFLAG_LOCKED, &mb_uses_mcache, 0,
10134 "Whether mbufs use mcache");
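/*
 * Illustrative userland queries (not compiled): the knobs and statistics
 * declared above live under kern.ipc, e.g.
 *
 *	sysctl kern.ipc.mb_uses_mcache
 *	sysctl kern.ipc.mb_memory_pressure_percentage
 *
 * Read-write entries (CTLFLAG_RW) can also be set with "sysctl -w".
 */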
10135