1/*
2 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30#include <sys/kern_event.h>
31#include <kern/sched_prim.h>
32#include <kern/assert.h>
33#include <kern/debug.h>
34#include <kern/locks.h>
35#include <kern/task.h>
36#include <kern/thread.h>
37#include <kern/thread_call.h>
38#include <kern/host.h>
39#include <kern/policy_internal.h>
40#include <kern/thread_group.h>
41
42#include <IOKit/IOBSD.h>
43
44#include <libkern/libkern.h>
45#include <libkern/coreanalytics/coreanalytics.h>
46#include <mach/coalition.h>
47#include <mach/clock_types.h>
48#include <mach/mach_time.h>
49#include <mach/task.h>
50#include <mach/host_priv.h>
51#include <mach/mach_host.h>
52#include <os/log.h>
53#include <pexpert/pexpert.h>
54#include <sys/coalition.h>
55#include <sys/kern_event.h>
56#include <sys/proc.h>
57#include <sys/proc_info.h>
58#include <sys/reason.h>
59#include <sys/signal.h>
60#include <sys/signalvar.h>
61#include <sys/sysctl.h>
62#include <sys/sysproto.h>
63#include <sys/time.h>
64#include <sys/wait.h>
65#include <sys/tree.h>
66#include <sys/priv.h>
67#include <vm/vm_pageout.h>
68#include <vm/vm_protos.h>
69#include <mach/machine/sdt.h>
70#include <libkern/section_keywords.h>
71#include <stdatomic.h>
72
73#if CONFIG_FREEZE
74#include <vm/vm_map.h>
75#endif /* CONFIG_FREEZE */
76
77#include <kern/kern_memorystatus_internal.h>
78#include <sys/kern_memorystatus.h>
79#include <sys/kern_memorystatus_notify.h>
80
81/*
82 * Memorystatus klist structures
83 */
84struct klist memorystatus_klist;
85static lck_mtx_t memorystatus_klist_mutex;
86static void memorystatus_klist_lock(void);
87static void memorystatus_klist_unlock(void);
88
89/*
90 * Memorystatus kevent filter routines
91 */
92static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
93static void filt_memorystatusdetach(struct knote *kn);
94static int filt_memorystatus(struct knote *kn, long hint);
95static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
96static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
97
98SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
99 .f_attach = filt_memorystatusattach,
100 .f_detach = filt_memorystatusdetach,
101 .f_event = filt_memorystatus,
102 .f_touch = filt_memorystatustouch,
103 .f_process = filt_memorystatusprocess,
104};
105
106/*
107 * Memorystatus notification events
108 */
109enum {
110 kMemorystatusNoPressure = 0x1,
111 kMemorystatusPressure = 0x2,
112 kMemorystatusLowSwap = 0x4,
113 kMemorystatusProcLimitWarn = 0x8,
114 kMemorystatusProcLimitCritical = 0x10
115};
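
/*
 * Illustrative sketch (not part of this file): a userspace client would
 * typically attach to this filter with something along these lines.
 * EVFILT_MEMORYSTATUS is private API; most applications use the libdispatch
 * memory-pressure source instead, and the exact flags accepted depend on
 * the platform.
 *
 *	int kq = kqueue();
 *	struct kevent ke;
 *	EV_SET(&ke, 0, EVFILT_MEMORYSTATUS, EV_ADD,
 *	    NOTE_MEMORYSTATUS_PRESSURE_NORMAL |
 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN |
 *	    NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, 0, NULL);
 *	kevent(kq, &ke, 1, NULL, 0, NULL);	// register
 *	kevent(kq, NULL, 0, &ke, 1, NULL);	// block until a notification fires
 *	// ke.fflags then carries the delivered NOTE_MEMORYSTATUS_* bits.
 */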
116
117#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
118#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
119#define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
120#define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
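
/*
 * Units: INTER_NOTIFICATION_DELAY is in microseconds and is passed directly
 * to delay(); the two resting periods are in seconds and are converted to
 * absolute time before being added to the next_*_notification_sent_at_ts
 * deadlines in memorystatus_update_vm_pressure().
 */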
121
122/*
123 * Memorystatus notification helper routines
124 */
125static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
126static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
127static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
128static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
129static void vm_dispatch_memory_pressure(void);
130kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
131
132#if VM_PRESSURE_EVENTS
133
134/*
 * This value is the minimum resident footprint, in MB, that a process must have to be considered for scavenging (i.e. to receive a pressure notification).
136 */
137#if XNU_TARGET_OS_OSX
138#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */
139#else /* XNU_TARGET_OS_OSX */
140#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */
141#endif /* XNU_TARGET_OS_OSX */
142
143static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
144
145#if DEVELOPMENT || DEBUG
146SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
147#endif /* DEVELOPMENT || DEBUG */
148
149vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
150
151/*
152 * We use this flag to signal if we have any HWM offenders
153 * on the system. This way we can reduce the number of wakeups
154 * of the memorystatus_thread when the system is between the
155 * "pressure" and "critical" threshold.
156 *
157 * The (re-)setting of this variable is done without any locks
158 * or synchronization simply because it is not possible (currently)
159 * to keep track of HWM offenders that drop down below their memory
160 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
161 * by allowing the unguarded modification of this variable.
162 */
163boolean_t memorystatus_hwm_candidates = 0;
164
165#endif /* VM_PRESSURE_EVENTS */
166
167#if CONFIG_JETSAM
168
169extern unsigned int memorystatus_available_pages;
170extern unsigned int memorystatus_available_pages_pressure;
171extern unsigned int memorystatus_available_pages_critical;
172extern unsigned int memorystatus_available_pages_critical_base;
173extern unsigned int memorystatus_available_pages_critical_idle_offset;
174
175#else /* CONFIG_JETSAM */
176
177extern uint64_t memorystatus_available_pages;
178extern uint64_t memorystatus_available_pages_pressure;
179extern uint64_t memorystatus_available_pages_critical;
180
181#endif /* CONFIG_JETSAM */
182
183extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
184uint32_t memorystatus_jetsam_fg_band_waiters = 0;
185static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
186static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
187
188extern boolean_t(*volatile consider_buffer_cache_collect)(int);
189
190#if DEVELOPMENT || DEBUG
191SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
192 &memorystatus_jetsam_fg_band_delay_ns, "");
193#endif
194
195static int
196filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
197{
198 int error;
199
200 kn->kn_flags |= EV_CLEAR; /* automatically set */
201 kn->kn_sdata = 0; /* incoming data is ignored */
	memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
203
204 error = memorystatus_knote_register(kn);
205 if (error) {
206 knote_set_error(kn, error);
207 }
208 return 0;
209}
210
211static void
212filt_memorystatusdetach(struct knote *kn)
213{
214 memorystatus_knote_unregister(kn);
215}
216
217static int
218filt_memorystatus(struct knote *kn __unused, long hint)
219{
220 if (hint) {
221 switch (hint) {
222 case kMemorystatusNoPressure:
223 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
224 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
225 }
226 break;
227 case kMemorystatusPressure:
228 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
229 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
230 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
231 }
232 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
233 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
234 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
235 }
236 }
237 break;
238 case kMemorystatusLowSwap:
239 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
240 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
241 }
242 break;
243
244 case kMemorystatusProcLimitWarn:
245 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
246 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
247 }
248 break;
249
250 case kMemorystatusProcLimitCritical:
251 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
252 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
253 }
254 break;
255
256 default:
257 break;
258 }
259 }
260
261#if 0
262 if (kn->kn_fflags != 0) {
263 proc_t knote_proc = knote_get_kq(kn)->kq_p;
264 pid_t knote_pid = proc_getpid(knote_proc);
265
266 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
267 (unsigned long)kn, kn->kn_fflags, knote_pid);
268 }
269#endif
270
271 return kn->kn_fflags != 0;
272}
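
/*
 * Note: EV_CLEAR is forced on at attach time (see filt_memorystatusattach),
 * so the fflags accumulated above are cleared by the kqueue machinery once a
 * pending event has been delivered, and the knote re-arms for the next hint.
 */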
273
274static int
275filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
276{
277 int res;
278 int prev_kn_sfflags = 0;
279
280 memorystatus_klist_lock();
281
282 /*
283 * copy in new kevent settings
284 * (saving the "desired" data and fflags).
285 */
286
287 prev_kn_sfflags = kn->kn_sfflags;
288 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
289
290#if XNU_TARGET_OS_OSX
291 /*
292 * Only on desktop do we restrict notifications to
293 * one per active/inactive state (soft limits only).
294 */
295 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
296 /*
297 * Is there previous state to preserve?
298 */
299 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
300 /*
301 * This knote was previously interested in proc_limit_warn,
302 * so yes, preserve previous state.
303 */
304 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
305 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
306 }
307 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
308 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
309 }
310 } else {
311 /*
312 * This knote was not previously interested in proc_limit_warn,
313 * but it is now. Set both states.
314 */
315 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
316 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
317 }
318 }
319
320 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
321 /*
322 * Is there previous state to preserve?
323 */
324 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
325 /*
326 * This knote was previously interested in proc_limit_critical,
327 * so yes, preserve previous state.
328 */
329 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
330 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
331 }
332 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
333 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
334 }
335 } else {
336 /*
337 * This knote was not previously interested in proc_limit_critical,
338 * but it is now. Set both states.
339 */
340 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
341 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
342 }
343 }
344#endif /* XNU_TARGET_OS_OSX */
345
346 /*
347 * reset the output flags based on a
348 * combination of the old events and
349 * the new desired event list.
350 */
351 //kn->kn_fflags &= kn->kn_sfflags;
352
353 res = (kn->kn_fflags != 0);
354
355 memorystatus_klist_unlock();
356
357 return res;
358}
359
360static int
361filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
362{
363 int res = 0;
364
365 memorystatus_klist_lock();
366 if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, 0);
368 res = 1;
369 }
370 memorystatus_klist_unlock();
371
372 return res;
373}
374
375static void
376memorystatus_klist_lock(void)
377{
	lck_mtx_lock(&memorystatus_klist_mutex);
379}
380
381static void
382memorystatus_klist_unlock(void)
383{
	lck_mtx_unlock(&memorystatus_klist_mutex);
385}
386
387void
388memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
389{
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
392}
393
394int
395memorystatus_knote_register(struct knote *kn)
396{
397 int error = 0;
398
399 memorystatus_klist_lock();
400
401 /*
402 * Support only userspace visible flags.
403 */
404 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
405#if XNU_TARGET_OS_OSX
406 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
407 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
408 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
409 }
410
411 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
412 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
413 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
414 }
415#endif /* XNU_TARGET_OS_OSX */
416
417 KNOTE_ATTACH(&memorystatus_klist, kn);
418 } else {
419 error = ENOTSUP;
420 }
421
422 memorystatus_klist_unlock();
423
424 return error;
425}
426
427void
428memorystatus_knote_unregister(struct knote *kn __unused)
429{
430 memorystatus_klist_lock();
431 KNOTE_DETACH(&memorystatus_klist, kn);
432 memorystatus_klist_unlock();
433}
434
435#if VM_PRESSURE_EVENTS
436
437#if CONFIG_JETSAM
438
439static thread_call_t sustained_pressure_handler_thread_call;
440int memorystatus_should_kill_on_sustained_pressure = 1;
441/* Count the number of sustained pressure kills we've done since boot. */
442uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
443uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
444uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
445
446#if DEVELOPMENT || DEBUG
447SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
448#endif /* DEVELOPMENT || DEBUG */
449SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
450SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
451SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
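
/*
 * The tunables above surface as kern.* sysctls; an illustrative way to widen
 * the sustained-pressure window from user space (root required) would be:
 *
 *	sysctl -w kern.memorystatus_kill_on_sustained_pressure_window_s=1200
 *	sysctl -w kern.memorystatus_kill_on_sustained_pressure_delay_ms=1000
 */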
452
453static void sustained_pressure_handler(void*, void*);
454#endif /* CONFIG_JETSAM */
455static thread_call_t memorystatus_notify_update_telemetry_thread_call;
456static void update_footprints_for_telemetry(void*, void*);
457
458
459void
460memorystatus_notify_init()
461{
462#if CONFIG_JETSAM
463 sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
464#endif /* CONFIG_JETSAM */
	memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
466}
467
468#if CONFIG_MEMORYSTATUS
469
470inline int
471memorystatus_send_note(int event_code, void *data, uint32_t data_length)
472{
473 int ret;
474 struct kev_msg ev_msg;
475
476 ev_msg.vendor_code = KEV_VENDOR_APPLE;
477 ev_msg.kev_class = KEV_SYSTEM_CLASS;
478 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
479
480 ev_msg.event_code = event_code;
481
482 ev_msg.dv[0].data_length = data_length;
483 ev_msg.dv[0].data_ptr = data;
484 ev_msg.dv[1].data_length = 0;
485
	ret = kev_post_msg(&ev_msg);
487 if (ret) {
488 memorystatus_log_error("%s: kev_post_msg() failed, err %d\n", __func__, ret);
489 }
490
491 return ret;
492}
493
494boolean_t
495memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
496{
497 /*
498 * This function doesn't take a reference to p or lock it. So it better be the current process.
499 */
500 assert(p == current_proc());
501 pid_t pid = proc_getpid(p);
502 boolean_t ret = FALSE;
503 boolean_t found_knote = FALSE;
504 struct knote *kn = NULL;
505 int send_knote_count = 0;
506 uint32_t platform;
507 platform = proc_platform(p);
508
509 /*
510 * See comment in sysctl_memorystatus_vm_pressure_send.
511 */
512
513 memorystatus_klist_lock();
514
515 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
516 proc_t knote_proc = knote_get_kq(kn)->kq_p;
517 pid_t knote_pid = proc_getpid(knote_proc);
518
519 if (knote_pid == pid) {
520 /*
521 * By setting the "fflags" here, we are forcing
522 * a process to deal with the case where it's
523 * bumping up into its memory limits. If we don't
524 * do this here, we will end up depending on the
525 * system pressure snapshot evaluation in
526 * filt_memorystatus().
527 */
528
529 /*
530 * The type of notification and the frequency are different between
531 * embedded and desktop.
532 *
533 * Embedded processes register for global pressure notifications
534 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
535 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
			 * they are near their memory limit. filt_memorystatus() will warn them based
537 * on the system pressure level.
538 *
539 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
			 * are only expected to fire for system level warnings. Desktop processes
541 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
542 * if they want to be warned when they approach their limit
543 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
544 * exceed their limit.
545 *
546 * On embedded we continuously warn processes that are approaching their
			 * memory limit. However, on desktop we only send one warning per
			 * active/inactive state if the limit is soft.
549 *
550 */
551 if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
552 if (!limit_exceeded) {
553 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
554 found_knote = TRUE;
555 if (!is_fatal) {
556 /*
557 * Restrict proc_limit_warn notifications when
558 * non-fatal (soft) limit is at play.
559 */
560 if (is_active) {
561 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
562 /*
563 * Mark this knote for delivery.
564 */
565 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
566 /*
567 * And suppress it from future notifications.
568 */
569 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
570 send_knote_count++;
571 }
572 } else {
573 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
574 /*
575 * Mark this knote for delivery.
576 */
577 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
578 /*
579 * And suppress it from future notifications.
580 */
581 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
582 send_knote_count++;
583 }
584 }
585 } else {
586 /*
587 * No restriction on proc_limit_warn notifications when
588 * fatal (hard) limit is at play.
589 */
590 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
591 send_knote_count++;
592 }
593 }
594 } else {
595 /*
					 * Send this notification when a process has exceeded a soft limit.
597 */
598
599 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
600 found_knote = TRUE;
601 if (!is_fatal) {
602 /*
603 * Restrict critical notifications for soft limits.
604 */
605
606 if (is_active) {
607 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
608 /*
609 * Suppress future proc_limit_critical notifications
610 * for the active soft limit.
611 */
612 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
613 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
614 send_knote_count++;
615 }
616 } else {
617 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
618 /*
619 * Suppress future proc_limit_critical_notifications
620 * for the inactive soft limit.
621 */
622 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
623 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
624 send_knote_count++;
625 }
626 }
627 } else {
628 /*
629 * We should never be trying to send a critical notification for
630 * a hard limit... the process would be killed before it could be
631 * received.
632 */
633 panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
634 }
635 }
636 }
637 } else {
638 if (!limit_exceeded) {
639 /*
640 * Intentionally set either the unambiguous limit warning,
641 * the system-wide critical or the system-wide warning
642 * notification bit.
643 */
644
645 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
646 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
647 found_knote = TRUE;
648 send_knote_count++;
649 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
650 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
651 found_knote = TRUE;
652 send_knote_count++;
653 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
654 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
655 found_knote = TRUE;
656 send_knote_count++;
657 }
658 } else {
659 /*
660 * Send this notification when a process has exceeded a soft limit.
661 */
662 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
663 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
664 found_knote = TRUE;
665 send_knote_count++;
666 }
667 }
668 }
669 }
670 }
671
672 if (found_knote) {
673 if (send_knote_count > 0) {
674 KNOTE(&memorystatus_klist, 0);
675 }
676 ret = TRUE;
677 }
678
679 memorystatus_klist_unlock();
680
681 return ret;
682}
683
684/*
685 * Can only be set by the current task on itself.
686 */
687int
688memorystatus_low_mem_privileged_listener(uint32_t op_flags)
689{
690 boolean_t set_privilege = FALSE;
691 /*
692 * Need an entitlement check here?
693 */
694 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
695 set_privilege = TRUE;
696 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
697 set_privilege = FALSE;
698 } else {
699 return EINVAL;
700 }
701
	return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
703}
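
/*
 * Illustrative sketch: this is reached from userspace through the private
 * memorystatus_control() syscall, e.g. (current process only):
 *
 *	memorystatus_control(MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE, 0, 0, NULL, 0);
 */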
704
705int
706memorystatus_send_pressure_note(pid_t pid)
707{
708 memorystatus_log_debug("memorystatus_send_pressure_note(): pid %d\n", pid);
	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
710}
711
712boolean_t
713memorystatus_is_foreground_locked(proc_t p)
714{
715 return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
716 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
717}
718
719/*
720 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
721 * to access the p_memstat_dirty field.
722 */
723void
724memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
725{
726 if (!v) {
727 *is_dirty = FALSE;
728 *is_dirty_tracked = FALSE;
729 *allow_idle_exit = FALSE;
730 } else {
731 proc_t p = (proc_t)v;
732 *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
733 *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
734 *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
735 }
736}
737
738boolean_t
739memorystatus_bg_pressure_eligible(proc_t p)
740{
741 boolean_t eligible = FALSE;
742
743 proc_list_lock();
744
745 memorystatus_log_debug("memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
746
747 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
748 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
749 eligible = TRUE;
750 }
751
752 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
753 /*
754 * IDLE and IDLE_DEFERRED bands contain processes
755 * that have dropped memory to be under their inactive
756 * memory limits. And so they can't really give back
757 * anything.
758 */
759 eligible = FALSE;
760 }
761
762 proc_list_unlock();
763
764 return eligible;
765}
766
767void
768memorystatus_send_low_swap_note(void)
769{
770 struct knote *kn = NULL;
771
772 memorystatus_klist_lock();
773 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
774 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
775 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
776 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
		 * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
778 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
779 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
780 break;
781 }
782 }
783
784 memorystatus_klist_unlock();
785}
786
787#endif /* CONFIG_MEMORYSTATUS */
788
789/*
790 * Notification telemetry
791 */
792CA_EVENT(memorystatus_pressure_interval,
793 CA_INT, num_processes_registered,
794 CA_INT, num_notifications_sent,
795 CA_INT, max_level,
796 CA_INT, num_transitions,
797 CA_INT, num_kills,
798 CA_INT, duration);
799static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
800
801CA_EVENT(memorystatus_proc_notification,
802 CA_INT, footprint_before_notification,
803 CA_INT, footprint_1_min_after_first_warning,
804 CA_INT, footprint_5_min_after_first_warning,
805 CA_INT, footprint_20_min_after_first_warning,
806 CA_INT, footprint_1_min_after_first_critical,
807 CA_INT, footprint_5_min_after_first_critical,
808 CA_INT, footprint_20_min_after_first_critical,
809 CA_INT, order_within_list,
810 CA_INT, num_notifications_sent,
811 CA_INT, time_between_warning_and_critical,
812 CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
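
/*
 * memorystatus_pressure_interval describes one whole elevated-pressure episode
 * and is finalized/sent when the level returns to normal; a
 * memorystatus_proc_notification event is sent for each registered process
 * that actually received a warning or critical notification during that
 * episode (see memorystatus_pressure_telemetry_send()).
 */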
813
814/* The send timestamps for the first notifications are stored in the knote's kn_sdata field */
815#define KNOTE_SEND_TIMESTAMP_WARNING_INDEX 0
816#define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX 1
817
818/* The footprint history for this task is stored in the knote's kn_ext array. */
819struct knote_footprint_history {
820 uint32_t kfh_starting_footprint;
821 uint32_t kfh_footprint_after_warn_1; /* 1 minute after first warning notification */
822 uint32_t kfh_footprint_after_warn_5; /* 5 minutes after first warning notification */
823 uint32_t kfh_footprint_after_warn_20; /* 20 minutes after first warning notification */
824 uint32_t kfh_footprint_after_critical_1; /* 1 minute after first critical notification */
825 uint32_t kfh_footprint_after_critical_5; /* 5 minutes after first critical notification */
826 uint32_t kfh_footprint_after_critical_20; /* 20 minutes after first critical notification */
827 uint16_t kfh_num_notifications;
828 uint16_t kfh_notification_order;
829} __attribute__((packed));
830
831
832static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
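
/*
 * Layout recap: the two 32-bit send timestamps (seconds) are packed into the
 * single 64-bit kn_sdata field (warning in slot 0, critical in slot 1), and
 * the footprint history above occupies the four 64-bit kn_ext slots:
 * 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t) = 28 + 4 = 32 bytes = 4 * 8.
 */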
833
834static void
835mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
836{
837 uint32_t *timestamps;
838 uint32_t index;
839 uint64_t curr_ts, curr_ts_seconds;
840 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
841 if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
842 timestamps = (uint32_t *)&(kn->kn_sdata);
843 index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
844 KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
845 if (timestamps[index] == 0) {
846 /* First notification for this level since pressure elevated from normal. */
847 curr_ts = mach_absolute_time();
848 curr_ts_seconds = 0;
			absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
850 curr_ts_seconds /= NSEC_PER_SEC;
851
852 timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);
853
854 /* Record task initial footprint */
855 if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
856 /*
857 * First notification at any level since pressure elevated from normal.
858 * Record the footprint and our order in the notification list.
859 */
				footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (1UL << 20));
861 footprint_history->kfh_notification_order = order_within_list;
862 }
863 }
864 }
865 footprint_history->kfh_num_notifications++;
866}
867
868/*
869 * Records the current footprint for this task in the knote telemetry.
870 *
871 * Returns the soonest absolutetime when this footprint history should be updated again.
872 */
873static uint64_t
874update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
875{
876 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
877 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
878 uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
879 warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
880 critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
	uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (1UL << 20));
882 uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
	absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
	nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
885 curr_ts_s /= NSEC_PER_SEC;
886
887 if (warning_send_time != 0) {
888 /* This task received a warning notification. */
889 minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
890 if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
891 footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
892 }
893 if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
894 footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
895 }
896 if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
897 footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
898 }
899 }
900 if (critical_send_time != 0) {
901 /* This task received a critical notification. */
902 minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
903 if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
904 footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
905 }
906 if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
907 footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
908 }
909 if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
910 footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
911 }
912 }
913
914 minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
915 if (minutes_since_last_notification < 20) {
916 if (minutes_since_last_notification < 5) {
917 if (minutes_since_last_notification < 1) {
918 next_run = curr_ts + absolutetime_in_minute;
919 } else {
920 next_run = curr_ts + (absolutetime_in_minute * 5);
921 }
922 } else {
923 next_run = curr_ts + (absolutetime_in_minute * 20);
924 }
925 }
926
927 return next_run;
928}
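
/*
 * Worked example of the schedule above: if the first warning notification
 * went out 3 minutes ago, the 1-minute sample has already been recorded, so
 * the next update is requested 5 minutes from now; once the 20-minute samples
 * have been taken, next_run stays at UINT64_MAX and no further update is
 * scheduled.
 */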
929
930extern char *proc_name_address(void *p);
931/*
932 * Attempt to send the given level telemetry event.
933 * Finalizes the duration.
934 * Clears the src_event struct.
935 */
936static void
937memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
938{
939 uint64_t duration_nanoseconds = 0;
940 uint64_t curr_ts = mach_absolute_time();
941 src_event->duration = curr_ts - src_event->duration;
	absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
943 src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
944
945 /*
946 * Drop the event rather than block for memory. We should be in a normal pressure level now,
947 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
948 */
949 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
950 if (event_wrapper) {
		memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
952 CA_EVENT_SEND(event_wrapper);
953 }
954 src_event->num_processes_registered = 0;
955 src_event->num_notifications_sent = 0;
956 src_event->max_level = 0;
957 src_event->num_transitions = 0;
958 src_event->num_kills = 0;
959 src_event->duration = 0;
960}
961
962
963/*
964 * Attempt to send the per-proc telemetry events.
965 * Clears the footprint histories on the knotes.
966 */
967static void
968memorystatus_pressure_proc_telemetry_send(void)
969{
970 struct knote *kn = NULL;
971 memorystatus_klist_lock();
972 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
973 proc_t p = PROC_NULL;
974 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
975 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
976 uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
977 uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
978 CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
979 if (warning_send_time != 0 || critical_send_time != 0) {
980 /*
981 * Drop the event rather than block for memory. We should be in a normal pressure level now,
982 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
983 */
984 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
985 if (event_wrapper) {
986 event = event_wrapper->data;
987
988 event->footprint_before_notification = footprint_history->kfh_starting_footprint;
989 event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
990 event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
991 event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
992 event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
993 event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
994 event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
995 event->num_notifications_sent = footprint_history->kfh_num_notifications;
996 if (warning_send_time != 0 && critical_send_time != 0) {
997 event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
998 }
999 event->order_within_list = footprint_history->kfh_notification_order;
1000
				p = proc_ref(knote_get_kq(kn)->kq_p, false);
1002 if (p == NULL) {
1003 CA_EVENT_DEALLOCATE(event_wrapper);
1004 continue;
1005 }
				strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));
1007
1008 proc_rele(p);
1009 CA_EVENT_SEND(event_wrapper);
1010 }
1011 }
		memset(footprint_history, 0, sizeof(*footprint_history));
1013 timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
1014 timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
1015 }
1016 memorystatus_klist_unlock();
1017}
1018
1019/*
1020 * Send all telemetry associated with the increased pressure interval.
1021 */
1022static void
1023memorystatus_pressure_telemetry_send(void)
1024{
1025 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
	memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
1027 memorystatus_pressure_proc_telemetry_send();
1028}
1029
1030
1031/*
1032 * kn_max - knote
1033 *
1034 * knote_pressure_level - to check if the knote is registered for this notification level.
1035 *
1036 * task - task whose bits we'll be modifying
1037 *
1038 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1039 *
1040 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1041 *
1042 */
1043
1044static boolean_t
1045is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1046{
1047 if (kn_max->kn_sfflags & knote_pressure_level) {
		if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
			task_clear_has_been_notified(task, pressure_level_to_clear);
		}

		task_mark_has_been_notified(task, pressure_level_to_set);
1053 return TRUE;
1054 }
1055
1056 return FALSE;
1057}
1058
1059static void
1060memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1061{
1062 struct knote *kn = NULL;
1063
1064 memorystatus_klist_lock();
1065
1066 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1067 proc_t p = knote_get_kq(kn)->kq_p;
1068
1069 if (p == proc_ref(p, false)) {
			task_clear_has_been_notified(proc_task(p), pressure_level_to_clear);
1071 proc_rele(p);
1072 }
1073 }
1074
1075 memorystatus_klist_unlock();
1076}
1077
1078/*
1079 * Used by the vm_pressure_thread which is
1080 * signalled from within vm_pageout_scan().
1081 */
1082
1083void
1084consider_vm_pressure_events(void)
1085{
1086 vm_dispatch_memory_pressure();
1087}
1088
1089static void
1090vm_dispatch_memory_pressure(void)
1091{
1092 memorystatus_update_vm_pressure(FALSE);
1093}
1094
1095static struct knote *
1096vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
1097{
1098 struct knote *kn = NULL, *kn_max = NULL;
1099 uint64_t resident_max = 0;/* MB */
1100 int selected_task_importance = 0;
1101 static int pressure_snapshot = -1;
1102 boolean_t pressure_increase = FALSE;
1103 uint64_t curr_ts = mach_absolute_time();
1104 *next_telemetry_update = UINT64_MAX;
1105
1106 if (pressure_snapshot == -1) {
1107 /*
1108 * Initial snapshot.
1109 */
1110 pressure_snapshot = level;
1111 pressure_increase = TRUE;
1112 } else {
1113 if (level && (level >= pressure_snapshot)) {
1114 pressure_increase = TRUE;
1115 } else {
1116 pressure_increase = FALSE;
1117 }
1118
1119 pressure_snapshot = level;
1120 }
1121
1122 if (pressure_increase == TRUE) {
1123 /*
1124 * We'll start by considering the largest
1125 * unimportant task in our list.
1126 */
1127 selected_task_importance = INT_MAX;
1128 } else {
1129 /*
1130 * We'll start by considering the largest
1131 * important task in our list.
1132 */
1133 selected_task_importance = 0;
1134 }
1135
1136 SLIST_FOREACH(kn, candidate_list, kn_selnext) {
1137 uint64_t resident_size = 0;/* MB */
1138 proc_t p = PROC_NULL;
1139 struct task* t = TASK_NULL;
1140 int curr_task_importance = 0;
1141 uint64_t telemetry_update = 0;
1142 boolean_t consider_knote = FALSE;
1143 boolean_t privileged_listener = FALSE;
1144
		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1146 if (p == PROC_NULL) {
1147 continue;
1148 }
1149
1150#if CONFIG_MEMORYSTATUS
1151 if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
1152 /*
1153 * Skip process not marked foreground.
1154 */
1155 proc_rele(p);
1156 continue;
1157 }
1158#endif /* CONFIG_MEMORYSTATUS */
1159
1160 t = (struct task *)(proc_task(p));
		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1162 *next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);
1163
1164 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
1165
1166 if ((kn->kn_sfflags & dispatch_level) == 0) {
1167 proc_rele(p);
1168 continue;
1169 }
1170
1171#if CONFIG_MEMORYSTATUS
1172 if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
1173 VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
1174 proc_rele(p);
1175 continue;
1176 }
1177#endif /* CONFIG_MEMORYSTATUS */
1178
1179#if XNU_TARGET_OS_OSX
		curr_task_importance = task_importance_estimate(t);
1181#else /* XNU_TARGET_OS_OSX */
1182 curr_task_importance = p->p_memstat_effectivepriority;
1183#endif /* XNU_TARGET_OS_OSX */
1184
1185 /*
1186 * Privileged listeners are only considered in the multi-level pressure scheme
1187 * AND only if the pressure is increasing.
1188 */
1189 if (level > 0) {
			if (task_has_been_notified(t, level) == FALSE) {
1191 /*
1192 * Is this a privileged listener?
1193 */
				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
1195 if (privileged_listener) {
1196 kn_max = kn;
1197 proc_rele(p);
1198 goto done_scanning;
1199 }
1200 }
1201 } else {
1202 proc_rele(p);
1203 continue;
1204 }
1205 } else if (level == 0) {
1206 /*
1207 * Task wasn't notified when the pressure was increasing and so
1208 * no need to notify it that the pressure is decreasing.
1209 */
			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
1211 proc_rele(p);
1212 continue;
1213 }
1214 }
1215
1216 /*
1217 * We don't want a small process to block large processes from
1218 * being notified again. <rdar://problem/7955532>
1219 */
1220 resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
1221
1222 if (resident_size >= vm_pressure_task_footprint_min) {
1223 if (level > 0) {
1224 /*
1225 * Warning or Critical Pressure.
1226 */
1227 if (pressure_increase) {
1228 if ((curr_task_importance < selected_task_importance) ||
1229 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1230 /*
1231 * We have found a candidate process which is:
1232 * a) at a lower importance than the current selected process
1233 * OR
1234 * b) has importance equal to that of the current selected process but is larger
1235 */
1236
1237 consider_knote = TRUE;
1238 }
1239 } else {
1240 if ((curr_task_importance > selected_task_importance) ||
1241 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1242 /*
1243 * We have found a candidate process which is:
1244 * a) at a higher importance than the current selected process
1245 * OR
1246 * b) has importance equal to that of the current selected process but is larger
1247 */
1248
1249 consider_knote = TRUE;
1250 }
1251 }
1252 } else if (level == 0) {
1253 /*
1254 * Pressure back to normal.
1255 */
1256 if ((curr_task_importance > selected_task_importance) ||
1257 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1258 consider_knote = TRUE;
1259 }
1260 }
1261
1262 if (consider_knote) {
1263 resident_max = resident_size;
1264 kn_max = kn;
1265 selected_task_importance = curr_task_importance;
1266 consider_knote = FALSE; /* reset for the next candidate */
1267 }
1268 } else {
1269 /* There was no candidate with enough resident memory to scavenge */
1270 VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
1271 }
1272 proc_rele(p);
1273 }
1274
1275done_scanning:
1276 if (kn_max) {
1277 VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
1278 VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
1279 }
1280
1281 return kn_max;
1282}
1283
1284/*
 * To avoid notification storms in a system with sawtooth behavior of pressure levels, e.g.:
 * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
 *
 * We have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD
 *
 * So it would look like:
 * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
 *
 * That's what the two timestamps below signify.
1294 */
1295
1296uint64_t next_warning_notification_sent_at_ts = 0;
1297uint64_t next_critical_notification_sent_at_ts = 0;
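
/*
 * Worked example with the 25-second resting periods defined above: if the
 * last round of warning-level notifications finished at time T, another
 * warning round will not start before roughly T + 25s, even if the level
 * bounces between warning and critical in the meantime; critical
 * notifications keep their own, independent T + 25s window.
 */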
1298
1299boolean_t memorystatus_manual_testing_on = FALSE;
1300vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
1301
1302unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
1303#if DEVELOPMENT || DEBUG
1304SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
1305#endif /* DEVELOPMENT || DEBUG */
1306
1307#if CONFIG_JETSAM
1308
1309/*
1310 * TODO(jason): The memorystatus thread should be responsible for this
1311 * It can just check how long the pressure level has been at warning and the timestamp
1312 * of the last sustained pressure kill.
1313 */
1314static void
1315sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
1316{
1317 int max_kills = 0, kill_count = 0;
1318 /*
1319 * Pressure has been elevated for too long.
1320 * We don't want to leave the system in this state as it can delay background
1321 * work indefinitely & drain battery.
1322 *
1323 * Try to return the system to normal via jetsam.
1324 * We'll run through the idle band up to 2 times.
1325 * If the pressure hasn't been relieved by then, the problem is memory
1326 * consumption in a higher band and this churn is probably doing more harm than good.
1327 */
1328 max_kills = memorystatus_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
1329 memorystatus_log("memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes\n", max_kills);
1330 while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
1331 boolean_t killed = memorystatus_kill_on_sustained_pressure();
1332 if (killed) {
1333 /*
1334 * Pause before our next kill & see if pressure reduces.
1335 */
1336 delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
1337 kill_count++;
1338 memorystatus_kill_on_sustained_pressure_count++;
1339 /* TODO(jason): Should use os_atomic but requires rdar://76310894. */
1340 memorystatus_pressure_interval_telemetry.num_kills++;
1341 } else {
1342 /* Nothing left to kill */
1343 break;
1344 }
1345 }
1346 if (memorystatus_vm_pressure_level != kVMPressureNormal) {
1347 memorystatus_log("memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.\n", kill_count);
1348 }
1349}
1350
1351#endif /* CONFIG_JETSAM */
1352
1353/*
1354 * Returns the number of processes registered for notifications at this level.
1355 */
1356static size_t
1357memorystatus_klist_length(int level)
1358{
1359 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1360 struct knote *kn;
1361 size_t count = 0;
1362 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1363 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1364 if (kn->kn_sfflags & knote_pressure_level) {
1365 count++;
1366 }
1367 }
1368 return count;
1369}
1370
1371/*
1372 * Updates the footprint telemetry for procs that have received notifications.
1373 */
1374static void
1375update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1376{
1377 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1378 struct knote *kn;
1379
1380 memorystatus_klist_lock();
1381 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1382 proc_t p = PROC_NULL;
1383 struct task* t = TASK_NULL;
1384 uint64_t telemetry_update;
1385
		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1387 if (p == PROC_NULL) {
1388 continue;
1389 }
1390 t = (struct task *)(proc_task(p));
1391 proc_rele(p);
1392 p = PROC_NULL;
		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1394 next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1395 }
1396 memorystatus_klist_unlock();
1397 if (next_telemetry_update != UINT64_MAX) {
1398 uint64_t next_update_seconds;
		absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
		next_update_seconds /= NSEC_PER_SEC;
		thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1402 }
1403}
1404
1405kern_return_t
1406memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1407{
1408 struct knote *kn_max = NULL;
1409 struct knote *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1410 pid_t target_pid = -1;
1411 struct klist dispatch_klist = { NULL };
1412 proc_t target_proc = PROC_NULL;
1413 struct task *task = NULL;
1414 boolean_t found_candidate = FALSE;
1415
1416 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
1417 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
1418 boolean_t smoothing_window_started = FALSE;
1419 struct timeval smoothing_window_start_tstamp = {0, 0};
1420 struct timeval curr_tstamp = {0, 0};
1421 int64_t elapsed_msecs = 0;
1422 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1423
1424
1425 uint64_t logging_now;
	absolutetime_to_nanoseconds(curr_ts, &logging_now);
1427#if !CONFIG_JETSAM
1428#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
1429
1430 int idle_kill_counter = 0;
1431
1432 /*
	 * On desktop we take this opportunity to relieve memory pressure
1434 * by immediately killing idle exitable processes. We use a delay
1435 * to avoid overkill. And we impose a max counter as a fail safe
1436 * in case daemons re-launch too fast.
1437 */
1438 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1439 if (memorystatus_idle_exit_from_VM() == FALSE) {
1440 /* No idle exitable processes left to kill */
1441 break;
1442 }
1443 idle_kill_counter++;
1444
1445 if (memorystatus_manual_testing_on == TRUE) {
1446 /*
1447 * Skip the delay when testing
1448 * the pressure notification scheme.
1449 */
1450 } else {
			delay(1000000); /* 1 second */
1452 }
1453 }
1454#endif /* !CONFIG_JETSAM */
1455
1456 if (level_snapshot != kVMPressureNormal) {
1457 /*
1458 * Check to see if we are still in the 'resting' period
1459 * after having notified all clients interested in
1460 * a particular pressure level.
1461 */
1462
1463 level_snapshot = memorystatus_vm_pressure_level;
1464
1465 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1466 if (next_warning_notification_sent_at_ts) {
1467 if (curr_ts < next_warning_notification_sent_at_ts) {
1468 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1469 return KERN_SUCCESS;
1470 }
1471
1472 next_warning_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1474 }
1475 } else if (level_snapshot == kVMPressureCritical) {
1476 if (next_critical_notification_sent_at_ts) {
1477 if (curr_ts < next_critical_notification_sent_at_ts) {
1478 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1479 return KERN_SUCCESS;
1480 }
1481 next_critical_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1483 }
1484 }
1485 }
1486
1487#if CONFIG_JETSAM
1488 if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
1489 if (memorystatus_should_kill_on_sustained_pressure) {
1490 memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level);
1491 thread_call_cancel(sustained_pressure_handler_thread_call);
1492 }
1493 } else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
1494 /*
1495 * Pressure has increased from normal.
1496 * Hopefully the notifications will relieve it,
1497 * but as a fail-safe we'll trigger jetsam
1498 * after a configurable amount of time.
1499 */
1500 memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
1501 uint64_t kill_time;
1502 nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
1503 kill_time += mach_absolute_time();
1504 thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
1505 }
1506#endif /* CONFIG_JETSAM */
1507
1508 while (1) {
1509 /*
1510 * There is a race window here. But it's not clear
1511 * how much we benefit from having extra synchronization.
1512 */
1513 level_snapshot = memorystatus_vm_pressure_level;
1514
1515 if (prev_level_snapshot > level_snapshot) {
1516 /*
1517 * Pressure decreased? Let's take a little breather
1518 * and see if this condition stays.
1519 */
1520 if (smoothing_window_started == FALSE) {
1521 smoothing_window_started = TRUE;
				microuptime(&smoothing_window_start_tstamp);
1523 }
1524
			microuptime(&curr_tstamp);
			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1527 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1528
1529 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1530 delay(INTER_NOTIFICATION_DELAY);
1531 continue;
1532 }
1533 }
1534 if (level_snapshot == kVMPressureNormal) {
1535 memorystatus_pressure_telemetry_send();
1536 }
1537 prev_level_snapshot = level_snapshot;
1538 smoothing_window_started = FALSE;
1539 memorystatus_klist_lock();
1540
1541 if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
			memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
1543 memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
1544 memorystatus_pressure_interval_telemetry.num_transitions++;
1545 if (memorystatus_pressure_interval_telemetry.duration == 0) {
1546 /* Set the start timestamp. Duration will be finalized when we send the event. */
1547 memorystatus_pressure_interval_telemetry.duration = curr_ts;
1548 }
1549 }
1550
		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
1552
1553 if (kn_max == NULL) {
1554 memorystatus_klist_unlock();
1555
1556 /*
1557 * No more level-based clients to notify.
1558 *
1559 * Start the 'resting' window within which clients will not be re-notified.
1560 */
1561
1562 if (level_snapshot != kVMPressureNormal) {
1563 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1565
1566 /* Next warning notification (if nothing changes) won't be sent before...*/
1567 next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1568 }
1569
1570 if (level_snapshot == kVMPressureCritical) {
					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1572
1573 /* Next critical notification (if nothing changes) won't be sent before...*/
1574 next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1575 }
1576 }
1577 absolutetime_to_nanoseconds(abstime: mach_absolute_time(), result: &logging_now);
1578 if (next_telemetry_update != UINT64_MAX) {
1579 thread_call_enter_delayed(call: memorystatus_notify_update_telemetry_thread_call, deadline: next_telemetry_update);
1580 } else {
1581 thread_call_cancel(call: memorystatus_notify_update_telemetry_thread_call);
1582 }
1583 return KERN_FAILURE;
1584 }
1585
		target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
		if (target_proc == PROC_NULL) {
			memorystatus_klist_unlock();
			continue;
		}

		target_pid = proc_getpid(target_proc);

		task = (struct task *)(proc_task(target_proc));

		if (level_snapshot != kVMPressureNormal) {
			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
					found_candidate = TRUE;
				}
			} else {
				if (level_snapshot == kVMPressureCritical) {
					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
						found_candidate = TRUE;
					}
				}
			}
		} else {
			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
				task_clear_has_been_notified(task, kVMPressureWarning);
				task_clear_has_been_notified(task, kVMPressureCritical);

				found_candidate = TRUE;
			}
		}

		if (found_candidate == FALSE) {
			proc_rele(target_proc);
			memorystatus_klist_unlock();
			continue;
		}

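		/*
		 * Move every knote registered by the chosen pid over to
		 * dispatch_klist so that the KNOTE() below is delivered only to
		 * that process; the knotes are moved back to memorystatus_klist
		 * right after the event fires.
		 */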
		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);

			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
				pid_t knote_pid = proc_getpid(knote_proc);
				if (knote_pid == target_pid) {
					KNOTE_DETACH(&memorystatus_klist, kn_cur);
					KNOTE_ATTACH(&dispatch_klist, kn_cur);
				}
			}
		}
		if (level_snapshot != kVMPressureNormal) {
			mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
			    (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
			memorystatus_pressure_interval_telemetry.num_notifications_sent++;
		}

		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
			KNOTE_DETACH(&dispatch_klist, kn_cur);
			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
		}

		memorystatus_klist_unlock();

		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
		proc_rele(target_proc);

		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
			break;
		}

		if (memorystatus_manual_testing_on == TRUE) {
			/*
			 * Testing out the pressure notification scheme.
			 * No need for delays etc.
			 */
		} else {
			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
#if CONFIG_JETSAM
			unsigned int page_delta = 0;
			unsigned int skip_delay_page_threshold = 0;

			assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);

			page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
			skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;

			if (memorystatus_available_pages <= skip_delay_page_threshold) {
				/*
				 * We are nearing the critical mark fast and can't afford to wait between
				 * notifications.
				 */
				sleep_interval = 0;
			}
#endif /* CONFIG_JETSAM */

			if (sleep_interval) {
				delay(sleep_interval);
			}
		}
	}

	return KERN_SUCCESS;
}

static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
{
	uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;

	switch (internal_pressure_level) {
	case kVMPressureNormal:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		break;
	}

	case kVMPressureCritical:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
		break;
	}

	default:
		break;
	}

	return dispatch_level;
}

/*
 * Notify any kexts that are waiting for notification that jetsam
 * is approaching the foreground bands. They should use this notification
 * to free cached memory.
 */
void
memorystatus_issue_fg_band_notify(void)
{
	uint64_t now;

	lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
	absolutetime_to_nanoseconds(mach_absolute_time(), &now);
	if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
		lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
		return;
	}

	if (memorystatus_jetsam_fg_band_waiters > 0) {
		thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
		memorystatus_jetsam_fg_band_waiters = 0;
		memorystatus_jetsam_fg_band_timestamp_ns = now;
	}
	lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);

	/* Notify the buffer cache, file systems, etc. to jettison everything they can. */
	if (consider_buffer_cache_collect != NULL) {
		(void)(*consider_buffer_cache_collect)(1);
	}
}


/*
 * Memorystatus notification debugging support
 */

static int
sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
#if !XNU_TARGET_OS_OSX
	int error = 0;

	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
	if (error) {
		return error;
	}

#endif /* !XNU_TARGET_OS_OSX */
	uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);

	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
}

#if DEBUG || DEVELOPMENT

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */
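
/*
 * Reading this sysctl (e.g. `sysctl kern.memorystatus_vm_pressure_level`)
 * reports the current level using the NOTE_MEMORYSTATUS_PRESSURE_* encoding
 * from sys/event.h rather than the kernel-internal kVMPressure* values.
 */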

/*
 * Trigger levels to test the mechanism.
 * Can be used via a sysctl; see the encoding example below the defines.
 */
#define TEST_LOW_MEMORY_TRIGGER_ONE             1
#define TEST_LOW_MEMORY_TRIGGER_ALL             2
#define TEST_PURGEABLE_TRIGGER_ONE              3
#define TEST_PURGEABLE_TRIGGER_ALL              4
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE   5
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL   6
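
/*
 * The value written to the manual-trigger sysctl packs a TEST_* request into
 * its upper 16 bits and a NOTE_MEMORYSTATUS_PRESSURE_* level into its lower
 * 16 bits (see the decode in the handler below). Illustrative example only,
 * not a canonical invocation:
 *
 *   value = (TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;
 *   sysctl -w kern.memorypressure_manual_trigger=<value>
 */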

static int
sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int level = 0;
	int error = 0;
	int pressure_level = 0;
	int trigger_request = 0;
	int force_purge;

	error = sysctl_handle_int(oidp, &level, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	memorystatus_manual_testing_on = TRUE;

	trigger_request = (level >> 16) & 0xFFFF;
	pressure_level = (level & 0xFFFF);

	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
		return EINVAL;
	}
	switch (pressure_level) {
	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
		break;
	default:
		return EINVAL;
	}

	/*
	 * The pressure level is being set from user-space, which uses the
	 * constants in sys/event.h, so translate those values to our
	 * internal pressure levels here.
	 */
	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		memorystatus_manual_testing_level = kVMPressureNormal;
		force_purge = 0;
	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
		memorystatus_manual_testing_level = kVMPressureWarning;
		force_purge = vm_pageout_state.memorystatus_purge_on_warning;
	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
		memorystatus_manual_testing_level = kVMPressureCritical;
		force_purge = vm_pageout_state.memorystatus_purge_on_critical;
	}

	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;

	/* purge according to the new pressure level */
	switch (trigger_request) {
	case TEST_PURGEABLE_TRIGGER_ONE:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		vm_purgeable_object_purge_one_unlocked(force_purge);
		break;
	case TEST_PURGEABLE_TRIGGER_ALL:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
			;
		}
		break;
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
		memorystatus_update_vm_pressure(TRUE);
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
			continue;
		}
	}

	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		memorystatus_manual_testing_on = FALSE;
	}

	return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");


SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");

extern int vm_pressure_level_transition_threshold;
SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");

#if DEBUG || DEVELOPMENT
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");

#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
	memorystatus_klist_lock();
	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
	memorystatus_klist_unlock();
	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */

/*
 * This routine is used for targeted notifications regardless of system memory pressure,
 * and regardless of whether or not the process has already been notified.
 * It bypasses, and has no effect on, the only-one-notification-per-soft-limit policy.
 *
 * "memnote" is the current user.
 */
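
/*
 * The 64-bit value written here packs the target pid into the low 32 bits and
 * an EVFILT_MEMORYSTATUS fflags value into the high 32 bits (see the decode
 * below). Illustrative example only, with a hypothetical pid of 123:
 *
 *   value = ((uint64_t)NOTE_MEMORYSTATUS_PRESSURE_WARN << 32) | 123;
 *   sysctl -w kern.memorystatus_vm_pressure_send=<value>
 */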

static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	/* Need to be root or have memorystatus entitlement */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
		return EPERM;
	}

	int error = 0, pid = 0;
	struct knote *kn = NULL;
	boolean_t found_knote = FALSE;
	int fflags = 0;         /* filter flags for EVFILT_MEMORYSTATUS */
	uint64_t value = 0;

	error = sysctl_handle_quad(oidp, &value, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	/*
	 * Find the pid in the low 32 bits of the value passed in.
	 */
	pid = (int)(value & 0xFFFFFFFF);

	/*
	 * Find the notification in the high 32 bits of the value passed in.
	 */
	fflags = (int)((value >> 32) & 0xFFFFFFFF);

	/*
	 * For backwards compatibility, when no notification is
	 * passed in, default to NOTE_MEMORYSTATUS_PRESSURE_WARN.
	 */
	if (fflags == 0) {
		fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		// printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
	}

	/* wake up everybody waiting for kVMPressureJetsam */
	if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
		memorystatus_issue_fg_band_notify();
		return error;
	}

	/*
	 * See event.h ... fflags for EVFILT_MEMORYSTATUS
	 */
	if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
	    (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
	    (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
	    ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
		memorystatus_log_error("memorystatus_vm_pressure_send: notification [0x%x] not supported\n", fflags);
		error = 1;
		return error;
	}

	/*
	 * Forcibly send pid a memorystatus notification.
	 */

	memorystatus_klist_lock();

	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = knote_get_kq(kn)->kq_p;
		pid_t knote_pid = proc_getpid(knote_proc);

		if (knote_pid == pid) {
			/*
			 * Forcibly send this pid a memorystatus notification.
			 */
			kn->kn_fflags = fflags;
			found_knote = TRUE;
		}
	}

	if (found_knote) {
		KNOTE(&memorystatus_klist, 0);
		memorystatus_log_debug("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d]\n", value, fflags, pid);
		error = 0;
	} else {
		memorystatus_log_error("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
		error = 1;
	}

	memorystatus_klist_unlock();

	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");

#endif /* DEBUG || DEVELOPMENT */

#endif /* VM_PRESSURE_EVENTS */