1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1992,7 NeXT Computer, Inc.
30 *
31 * Unix data structure initialization.
32 *
33 */
34
35#include <mach/mach_types.h>
36
37#include <kern/startup.h>
38#include <vm/vm_kern.h>
39#include <mach/vm_prot.h>
40
41#include <sys/param.h>
42#include <sys/buf_internal.h>
43#include <sys/file_internal.h>
44#include <sys/proc_internal.h>
45#include <sys/mcache.h>
46#include <sys/mbuf.h>
47#include <sys/systm.h>
48#include <sys/tty.h>
49#include <sys/vnode.h>
50#include <sys/sysctl.h>
51#include <machine/cons.h>
52#include <pexpert/pexpert.h>
53#include <sys/socketvar.h>
54#include <pexpert/pexpert.h>
55#include <netinet/tcp_var.h>
56
57extern uint32_t kern_maxvnodes;
58#if CONFIG_MBUF_MCACHE
59extern vm_map_t mb_map;
60#endif /* CONFIG_MBUF_MCACHE */
61
62#if INET
63extern uint32_t tcp_sendspace;
64extern uint32_t tcp_recvspace;
65#endif
66
67void bsd_bufferinit(void);
68
69unsigned int bsd_mbuf_cluster_reserve(boolean_t *);
70void bsd_scale_setup(int);
71void bsd_exec_setup(int);
72
73/*
74 * Declare these as initialized data so we can patch them.
75 */
76
77#ifdef NBUF
78int max_nbuf_headers = NBUF;
79int niobuf_headers = (NBUF / 2) + 2048;
80int nbuf_hashelements = NBUF;
81int nbuf_headers = NBUF;
82#else
83int max_nbuf_headers = 0;
84int niobuf_headers = 0;
85int nbuf_hashelements = 0;
86int nbuf_headers = 0;
87#endif
88
89SYSCTL_INT(_kern, OID_AUTO, nbuf, CTLFLAG_RD | CTLFLAG_LOCKED, &nbuf_headers, 0, "");
90SYSCTL_INT(_kern, OID_AUTO, maxnbuf, CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN, &max_nbuf_headers, 0, "");
91
92__private_extern__ int customnbuf = 0;
93
94/* Indicates a server boot when set */
95TUNABLE(int, serverperfmode, "serverperfmode", 0);
96
97#if SOCKETS
98static unsigned int mbuf_poolsz;
99#endif
100
101vm_map_t buffer_map;
102vm_map_t bufferhdr_map;
103static int vnodes_sized = 0;
104
105extern void bsd_startupearly(void);
106
107static vm_map_size_t bufferhdr_map_size;
108SECURITY_READ_ONLY_LATE(struct mach_vm_range) bufferhdr_range = {};
109
110static vm_map_size_t
111bsd_get_bufferhdr_map_size(void)
112{
113 vm_size_t size;
114
115 /* clip the number of buf headers upto 16k */
116 if (max_nbuf_headers == 0) {
117 max_nbuf_headers = (int)atop_kernel(sane_size / 50); /* Get 2% of ram, but no more than we can map */
118 }
119 if ((customnbuf == 0) && ((unsigned int)max_nbuf_headers > 16384)) {
120 max_nbuf_headers = 16384;
121 }
122 if (max_nbuf_headers < CONFIG_MIN_NBUF) {
123 max_nbuf_headers = CONFIG_MIN_NBUF;
124 }
125
126 if (niobuf_headers == 0) {
127 if (max_nbuf_headers < 4096) {
128 niobuf_headers = max_nbuf_headers;
129 } else {
130 niobuf_headers = (max_nbuf_headers / 2) + 2048;
131 }
132 }
133 if (niobuf_headers < CONFIG_MIN_NIOBUF) {
134 niobuf_headers = CONFIG_MIN_NIOBUF;
135 }
136
137 size = (max_nbuf_headers + niobuf_headers) * sizeof(struct buf);
138 size = round_page(x: size);
139
140 return size;
141}
142
143KMEM_RANGE_REGISTER_DYNAMIC(bufferhdr, &bufferhdr_range, ^() {
144 return bufferhdr_map_size = bsd_get_bufferhdr_map_size();
145});
146
147void
148bsd_startupearly(void)
149{
150 vm_size_t size = bufferhdr_map_size;
151
152 assert(size);
153
154 /* clip the number of hash elements to 200000 */
155 if ((customnbuf == 0) && nbuf_hashelements == 0) {
156 nbuf_hashelements = (int)atop_kernel(sane_size / 50);
157 if ((unsigned int)nbuf_hashelements > 200000) {
158 nbuf_hashelements = 200000;
159 }
160 } else {
161 nbuf_hashelements = max_nbuf_headers;
162 }
163
164 bufferhdr_map = kmem_suballoc(parent: kernel_map,
165 addr: &bufferhdr_range.min_address,
166 size,
167 vmc_options: VM_MAP_CREATE_NEVER_FAULTS,
168 VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
169 flags: KMS_PERMANENT | KMS_NOFAIL,
170 VM_KERN_MEMORY_FILE).kmr_submap;
171
172 kmem_alloc(map: bufferhdr_map,
173 addrp: &(vm_offset_t){ bufferhdr_range.min_address },
174 size,
175 flags: KMA_NOFAIL | KMA_PERMANENT | KMA_ZERO | KMA_KOBJECT,
176 VM_KERN_MEMORY_FILE);
177
178 buf_headers = (struct buf *)bufferhdr_range.min_address;
179
180#if SOCKETS
181 {
182 static const unsigned int maxspace = 128 * 1024;
183 int scale;
184
185#if INET
186 if ((scale = nmbclusters / NMBCLUSTERS) > 1) {
187 tcp_sendspace *= scale;
188 tcp_recvspace *= scale;
189
190 if (tcp_sendspace > maxspace) {
191 tcp_sendspace = maxspace;
192 }
193 if (tcp_recvspace > maxspace) {
194 tcp_recvspace = maxspace;
195 }
196 }
197#endif /* INET */
198 }
199#endif /* SOCKETS */
200
201 if (vnodes_sized == 0) {
202 if (!PE_get_default(property_name: "kern.maxvnodes", property_ptr: &desiredvnodes, max_property: sizeof(desiredvnodes))) {
203 /*
204 * Size vnodes based on memory
205 * Number vnodes is (memsize/64k) + 1024
206 * This is the calculation that is used by launchd in tiger
207 * we are clipping the max based on 16G
208 * ie ((16*1024*1024*1024)/(64 *1024)) + 1024 = 263168;
209 * CONFIG_VNODES is set to 263168 for "medium" configurations (the default)
210 * but can be smaller or larger.
211 */
212 desiredvnodes = (int)(sane_size / 65536) + 1024;
213#ifdef CONFIG_VNODES
214 if (desiredvnodes > CONFIG_VNODES) {
215 desiredvnodes = CONFIG_VNODES;
216 }
217#endif
218 }
219 vnodes_sized = 1;
220 }
221}
222
#if SOCKETS
/*
 * Address range reserved for the mbuf cluster pool.  The registration
 * callback sets nmbclusters from the reserved pool size and reports the
 * range size as a whole number of clusters.
 */
SECURITY_READ_ONLY_LATE(struct mach_vm_range) mb_range = {};
KMEM_RANGE_REGISTER_DYNAMIC(mb, &mb_range, ^() {
	unsigned int pool_bytes = bsd_mbuf_cluster_reserve(NULL);

	nmbclusters = pool_bytes / MCLBYTES;
	return (vm_map_size_t)(nmbclusters * MCLBYTES);
});
#endif /* SOCKETS */
230
/*
 * Set up the buffer headers, the mbuf submap (when CONFIG_MBUF_MCACHE
 * is enabled) and the buffer cache.
 *
 * Per the original note, the console device has already been
 * initialized in kminit() from bsd_autoconf() by the time bsd_init()
 * calls this routine.
 */
void
bsd_bufferinit(void)
{
	bsd_startupearly();

#if CONFIG_MBUF_MCACHE
	{
		const vm_size_t mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);

		/*
		 * NOTE(review): FALSE is passed in the argument position where
		 * bsd_startupearly() passes a VM_MAP_CREATE_* option; presumably
		 * it stands for "no options" -- confirm against kmem_suballoc().
		 */
		mb_map = kmem_suballoc(kernel_map,
		    &mb_range.min_address,
		    mb_map_size,
		    FALSE,
		    VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE,
		    KMS_PERMANENT | KMS_NOFAIL,
		    VM_KERN_MEMORY_MBUF).kmr_submap;
		mbutl = (unsigned char *)mb_range.min_address;
	}
#endif /* CONFIG_MBUF_MCACHE */

	/* Buffers must be ready so they can be used to read disk labels. */
	bufinit();
}
257
/* 512 MB (K32) or 2 GB (K64) hard limit on size of the mbuf pool */
#if !defined(__LP64__)
#define MAX_MBUF_POOL   (512 << MBSHIFT)
#else
#define MAX_MBUF_POOL   (2ULL << GBSHIFT)
#endif /* !__LP64__ */
/* The same limit expressed as a number of clusters. */
#define MAX_NCL         (MAX_MBUF_POOL >> MCLSHIFT)

#if SOCKETS
/*
 * Compute (once) the size of the mbuf cluster pool and set nmbclusters.
 *
 * This has been broken out into a separate routine that can be called
 * from the x86 early vm initialization to determine how much lo memory
 * to reserve on systems with DMA hardware that can't fully address all
 * of the physical memory that is present.
 *
 * The "ncl" (cluster count) and "mbuf_pool" (size in MB) boot-args
 * override the automatic sizing; "mbuf_pool" wins when both are given.
 *
 * overridden: optional out-parameter; when non-NULL it receives TRUE if
 *             the size came from a boot-arg or serverperfmode was set
 *             when the size was computed, FALSE otherwise.
 *
 * Returns the pool size in bytes (nmbclusters << MCLSHIFT).  Subsequent
 * calls return the cached value without recomputing it.
 */
unsigned int
bsd_mbuf_cluster_reserve(boolean_t *overridden)
{
	int mbuf_pool = 0, ncl = 0;
	static boolean_t was_overridden = FALSE;

	/* If called more than once, return the previously calculated size */
	if (mbuf_poolsz != 0) {
		goto done;
	}

	/*
	 * Some of these are parsed in parse_bsd_args(), but for x86 we get
	 * here early from i386_vm_init() and so we parse them now, in order
	 * to correctly compute the size of the low-memory VM pool.  It is
	 * redundant but rather harmless.
	 */
	(void) PE_parse_boot_argn("ncl", &ncl, sizeof(ncl));
	(void) PE_parse_boot_argn("mbuf_pool", &mbuf_pool, sizeof(mbuf_pool));

	/*
	 * Convert "mbuf_pool" from MB to # of 2KB clusters; it is
	 * equivalent to "ncl", except that it uses different unit.
	 *
	 * The conversion is done in 64-bit unsigned arithmetic and clamped
	 * to MAX_NCL: shifting the int directly overflows for values of
	 * 2048 MB and above, and e.g. mbuf_pool=4096 would wrap to 0 and be
	 * silently ignored instead of being limited.
	 */
	if (mbuf_pool != 0) {
		uint64_t pool_ncl =
		    ((uint64_t)(unsigned int)mbuf_pool << MBSHIFT) >> MCLSHIFT;

		ncl = (pool_ncl > MAX_NCL) ? (int)MAX_NCL : (int)pool_ncl;
	}

	if (sane_size > (64 * 1024 * 1024) || ncl != 0) {
		if (ncl || serverperfmode) {
			was_overridden = TRUE;
		}

		if ((nmbclusters = ncl) == 0) {
			/* Auto-configure the mbuf pool size */
			nmbclusters = mbuf_default_ncl(mem_actual);
		} else {
			/* Make sure it's not odd in case ncl is manually set */
			if (nmbclusters & 0x1) {
				--nmbclusters;
			}

			/* And obey the upper limit */
			if (nmbclusters > MAX_NCL) {
				nmbclusters = MAX_NCL;
			}
		}

		/* Round it down to nearest multiple of PAGE_SIZE */
		nmbclusters = (unsigned int)P2ROUNDDOWN(nmbclusters, NCLPG);
	}

	/* Shift as unsigned so that a 2 GB pool does not overflow a signed int. */
	mbuf_poolsz = (unsigned int)nmbclusters << MCLSHIFT;
done:
	if (overridden) {
		*overridden = was_overridden;
	}

	return mbuf_poolsz;
}
#endif /* SOCKETS */
334
335#if defined(__LP64__)
336extern int tcp_tcbhashsize;
337extern int max_cached_sock_count;
338#endif
339
340#define SERVER_PERF_MODE_VALIDATION_DISABLES 0x5dee
341extern unsigned int kern_feature_overrides;
342void
343bsd_scale_setup(int scale)
344{
345#if defined(__LP64__)
346 if ((scale > 0) && (serverperfmode == 0)) {
347 maxproc *= scale;
348 maxprocperuid = (maxproc * 2) / 3;
349 if (scale > 2) {
350 maxfiles *= scale;
351 maxfilesperproc = maxfiles / 2;
352 }
353 }
354 /* Apply server scaling rules */
355 if ((scale > 0) && (serverperfmode != 0)) {
356 maxproc = 2500 * scale;
357 hard_maxproc = maxproc;
358 /* no fp usage */
359 maxprocperuid = (maxproc * 3) / 4;
360 maxfiles = (150000 * scale);
361 maxfilesperproc = maxfiles / 2;
362 desiredvnodes = maxfiles;
363 vnodes_sized = 1;
364 tcp_tfo_backlog = 100 * scale;
365 if (scale > 4) {
366 /* clip somaxconn at 32G level */
367 somaxconn = 2048;
368 /*
369 * For scale > 4 (> 32G), clip
370 * tcp_tcbhashsize to 32K
371 */
372 tcp_tcbhashsize = 32 * 1024;
373
374 if (scale > 7) {
375 /* clip at 64G level */
376 max_cached_sock_count = 165000;
377 } else {
378 max_cached_sock_count = 60000 + ((scale - 1) * 15000);
379 }
380 } else {
381 somaxconn = 512 * scale;
382 tcp_tcbhashsize = 4 * 1024 * scale;
383 max_cached_sock_count = 60000 + ((scale - 1) * 15000);
384 }
385 }
386
387 if (maxproc > hard_maxproc) {
388 hard_maxproc = maxproc;
389 }
390#endif
391 if (serverperfmode) {
392 /* If running in serverperfmode disable some internal only diagnostics. */
393 kern_feature_overrides |= SERVER_PERF_MODE_VALIDATION_DISABLES;
394 }
395 bsd_exec_setup(scale);
396}
397