1/*
2 * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30#include <sys/kern_event.h>
31#include <kern/sched_prim.h>
32#include <kern/assert.h>
33#include <kern/debug.h>
34#include <kern/locks.h>
35#include <kern/task.h>
36#include <kern/thread.h>
37#include <kern/thread_call.h>
38#include <kern/host.h>
39#include <kern/policy_internal.h>
40#include <kern/thread_group.h>
41
42#include <IOKit/IOBSD.h>
43
44#include <libkern/libkern.h>
45#include <libkern/coreanalytics/coreanalytics.h>
46#include <mach/coalition.h>
47#include <mach/clock_types.h>
48#include <mach/mach_time.h>
49#include <mach/task.h>
50#include <mach/host_priv.h>
51#include <mach/mach_host.h>
52#include <os/log.h>
53#include <pexpert/pexpert.h>
54#include <sys/coalition.h>
55#include <sys/kern_event.h>
56#include <sys/proc.h>
57#include <sys/proc_info.h>
58#include <sys/reason.h>
59#include <sys/signal.h>
60#include <sys/signalvar.h>
61#include <sys/sysctl.h>
62#include <sys/sysproto.h>
63#include <sys/time.h>
64#include <sys/wait.h>
65#include <sys/tree.h>
66#include <sys/priv.h>
67#include <vm/vm_pageout.h>
68#include <vm/vm_protos.h>
69#include <mach/machine/sdt.h>
70#include <libkern/section_keywords.h>
71#include <stdatomic.h>
72
73#if CONFIG_FREEZE
74#include <vm/vm_map.h>
75#endif /* CONFIG_FREEZE */
76
77#include <kern/kern_memorystatus_internal.h>
78#include <sys/kern_memorystatus.h>
79#include <sys/kern_memorystatus_notify.h>
80
81/*
82 * Memorystatus klist structures
83 */
84struct klist memorystatus_klist;
85static lck_mtx_t memorystatus_klist_mutex;
86static void memorystatus_klist_lock(void);
87static void memorystatus_klist_unlock(void);
88
89/*
90 * Memorystatus kevent filter routines
91 */
92static int filt_memorystatusattach(struct knote *kn, struct kevent_qos_s *kev);
93static void filt_memorystatusdetach(struct knote *kn);
94static int filt_memorystatus(struct knote *kn, long hint);
95static int filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev);
96static int filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev);
97
98SECURITY_READ_ONLY_EARLY(struct filterops) memorystatus_filtops = {
99 .f_attach = filt_memorystatusattach,
100 .f_detach = filt_memorystatusdetach,
101 .f_event = filt_memorystatus,
102 .f_touch = filt_memorystatustouch,
103 .f_process = filt_memorystatusprocess,
104};
105
106/*
107 * Memorystatus notification events
108 */
109enum {
110 kMemorystatusNoPressure = 0x1,
111 kMemorystatusPressure = 0x2,
112 kMemorystatusLowSwap = 0x4,
113 kMemorystatusProcLimitWarn = 0x8,
114 kMemorystatusProcLimitCritical = 0x10
115};
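
/*
 * Illustrative sketch (not part of this file): a userspace client would
 * typically attach to this filter with something along these lines.
 * EVFILT_MEMORYSTATUS is private API; most applications use the libdispatch
 * memory-pressure source instead, and the exact flags accepted depend on
 * the platform.
 *
 *	int kq = kqueue();
 *	struct kevent ke;
 *	EV_SET(&ke, 0, EVFILT_MEMORYSTATUS, EV_ADD,
 *	    NOTE_MEMORYSTATUS_PRESSURE_NORMAL |
 *	    NOTE_MEMORYSTATUS_PRESSURE_WARN |
 *	    NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, 0, NULL);
 *	kevent(kq, &ke, 1, NULL, 0, NULL);	// register
 *	kevent(kq, NULL, 0, &ke, 1, NULL);	// block until a notification fires
 *	// ke.fflags then carries the delivered NOTE_MEMORYSTATUS_* bits.
 */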
116
117#define INTER_NOTIFICATION_DELAY (250000) /* .25 second */
118#define VM_PRESSURE_DECREASED_SMOOTHING_PERIOD 5000 /* milliseconds */
119#define WARNING_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
120#define CRITICAL_NOTIFICATION_RESTING_PERIOD 25 /* seconds */
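
/*
 * Units: INTER_NOTIFICATION_DELAY is in microseconds and is passed directly
 * to delay(); the two resting periods are in seconds and are converted to
 * absolute time before being added to the next_*_notification_sent_at_ts
 * deadlines in memorystatus_update_vm_pressure().
 */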
121
122/*
123 * Memorystatus notification helper routines
124 */
125static vm_pressure_level_t convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
126static boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
127static void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
128static struct knote *vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update);
129static void vm_dispatch_memory_pressure(void);
130kern_return_t memorystatus_update_vm_pressure(boolean_t target_foreground_process);
131
132#if VM_PRESSURE_EVENTS
133
134/*
 * This value is the minimum resident footprint, in MB, that a process must have to be considered for scavenging (i.e. to receive a pressure notification).
136 */
137#if XNU_TARGET_OS_OSX
138#define VM_PRESSURE_MINIMUM_RSIZE 10 /* MB */
139#else /* XNU_TARGET_OS_OSX */
140#define VM_PRESSURE_MINIMUM_RSIZE 6 /* MB */
141#endif /* XNU_TARGET_OS_OSX */
142
143static uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
144
145#if DEVELOPMENT || DEBUG
146SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
147#endif /* DEVELOPMENT || DEBUG */
148
149vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
150
151/*
152 * We use this flag to signal if we have any HWM offenders
153 * on the system. This way we can reduce the number of wakeups
154 * of the memorystatus_thread when the system is between the
155 * "pressure" and "critical" threshold.
156 *
157 * The (re-)setting of this variable is done without any locks
158 * or synchronization simply because it is not possible (currently)
159 * to keep track of HWM offenders that drop down below their memory
160 * limit and/or exit. So, we choose to burn a couple of wasted wakeups
161 * by allowing the unguarded modification of this variable.
162 */
163boolean_t memorystatus_hwm_candidates = 0;
164
165#endif /* VM_PRESSURE_EVENTS */
166
167#if CONFIG_JETSAM
168
169extern unsigned int memorystatus_available_pages;
170extern unsigned int memorystatus_available_pages_pressure;
171extern unsigned int memorystatus_available_pages_critical;
172extern unsigned int memorystatus_available_pages_critical_base;
173extern unsigned int memorystatus_available_pages_critical_idle_offset;
174
175#else /* CONFIG_JETSAM */
176
177extern uint64_t memorystatus_available_pages;
178extern uint64_t memorystatus_available_pages_pressure;
179extern uint64_t memorystatus_available_pages_critical;
180
181#endif /* CONFIG_JETSAM */
182
183extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
184uint32_t memorystatus_jetsam_fg_band_waiters = 0;
185static uint64_t memorystatus_jetsam_fg_band_timestamp_ns = 0; /* nanosec */
186static uint64_t memorystatus_jetsam_fg_band_delay_ns = 5ull * 1000 * 1000 * 1000; /* nanosec */
187
188extern boolean_t(*volatile consider_buffer_cache_collect)(int);
189
190#if DEVELOPMENT || DEBUG
191SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_jetsam_fg_band_delay_ns, CTLFLAG_RW | CTLFLAG_LOCKED,
192 &memorystatus_jetsam_fg_band_delay_ns, "");
193#endif
194
195static int
196filt_memorystatusattach(struct knote *kn, __unused struct kevent_qos_s *kev)
197{
198 int error;
199
200 kn->kn_flags |= EV_CLEAR; /* automatically set */
201 kn->kn_sdata = 0; /* incoming data is ignored */
	memset(&kn->kn_ext, 0, sizeof(kn->kn_ext));
203
204 error = memorystatus_knote_register(kn);
205 if (error) {
206 knote_set_error(kn, error);
207 }
208 return 0;
209}
210
211static void
212filt_memorystatusdetach(struct knote *kn)
213{
214 memorystatus_knote_unregister(kn);
215}
216
217static int
218filt_memorystatus(struct knote *kn __unused, long hint)
219{
220 if (hint) {
221 switch (hint) {
222 case kMemorystatusNoPressure:
223 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
224 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
225 }
226 break;
227 case kMemorystatusPressure:
228 if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
229 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
230 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
231 }
232 } else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
233 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
234 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
235 }
236 }
237 break;
238 case kMemorystatusLowSwap:
239 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_LOW_SWAP) {
240 kn->kn_fflags = NOTE_MEMORYSTATUS_LOW_SWAP;
241 }
242 break;
243
244 case kMemorystatusProcLimitWarn:
245 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
246 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
247 }
248 break;
249
250 case kMemorystatusProcLimitCritical:
251 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
252 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
253 }
254 break;
255
256 default:
257 break;
258 }
259 }
260
261#if 0
262 if (kn->kn_fflags != 0) {
263 proc_t knote_proc = knote_get_kq(kn)->kq_p;
264 pid_t knote_pid = proc_getpid(knote_proc);
265
266 printf("filt_memorystatus: sending kn 0x%lx (event 0x%x) for pid (%d)\n",
267 (unsigned long)kn, kn->kn_fflags, knote_pid);
268 }
269#endif
270
271 return kn->kn_fflags != 0;
272}
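
/*
 * Note: EV_CLEAR is forced on at attach time (see filt_memorystatusattach),
 * so the fflags accumulated above are cleared by the kqueue machinery once a
 * pending event has been delivered, and the knote re-arms for the next hint.
 */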
273
274static int
275filt_memorystatustouch(struct knote *kn, struct kevent_qos_s *kev)
276{
277 int res;
278 int prev_kn_sfflags = 0;
279
280 memorystatus_klist_lock();
281
282 /*
283 * copy in new kevent settings
284 * (saving the "desired" data and fflags).
285 */
286
287 prev_kn_sfflags = kn->kn_sfflags;
288 kn->kn_sfflags = (kev->fflags & EVFILT_MEMORYSTATUS_ALL_MASK);
289
290#if XNU_TARGET_OS_OSX
291 /*
292 * Only on desktop do we restrict notifications to
293 * one per active/inactive state (soft limits only).
294 */
295 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
296 /*
297 * Is there previous state to preserve?
298 */
299 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
300 /*
301 * This knote was previously interested in proc_limit_warn,
302 * so yes, preserve previous state.
303 */
304 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
305 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
306 }
307 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
308 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
309 }
310 } else {
311 /*
312 * This knote was not previously interested in proc_limit_warn,
313 * but it is now. Set both states.
314 */
315 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
316 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
317 }
318 }
319
320 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
321 /*
322 * Is there previous state to preserve?
323 */
324 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
325 /*
326 * This knote was previously interested in proc_limit_critical,
327 * so yes, preserve previous state.
328 */
329 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
330 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
331 }
332 if (prev_kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
333 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
334 }
335 } else {
336 /*
337 * This knote was not previously interested in proc_limit_critical,
338 * but it is now. Set both states.
339 */
340 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
341 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
342 }
343 }
344#endif /* XNU_TARGET_OS_OSX */
345
346 /*
347 * reset the output flags based on a
348 * combination of the old events and
349 * the new desired event list.
350 */
351 //kn->kn_fflags &= kn->kn_sfflags;
352
353 res = (kn->kn_fflags != 0);
354
355 memorystatus_klist_unlock();
356
357 return res;
358}
359
360static int
361filt_memorystatusprocess(struct knote *kn, struct kevent_qos_s *kev)
362{
363 int res = 0;
364
365 memorystatus_klist_lock();
366 if (kn->kn_fflags) {
		knote_fill_kevent(kn, kev, 0);
368 res = 1;
369 }
370 memorystatus_klist_unlock();
371
372 return res;
373}
374
375static void
376memorystatus_klist_lock(void)
377{
	lck_mtx_lock(&memorystatus_klist_mutex);
379}
380
381static void
382memorystatus_klist_unlock(void)
383{
	lck_mtx_unlock(&memorystatus_klist_mutex);
385}
386
387void
388memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr)
389{
	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
	klist_init(&memorystatus_klist);
392}
393
394int
395memorystatus_knote_register(struct knote *kn)
396{
397 int error = 0;
398
399 memorystatus_klist_lock();
400
401 /*
402 * Support only userspace visible flags.
403 */
404 if ((kn->kn_sfflags & EVFILT_MEMORYSTATUS_ALL_MASK) == (unsigned int) kn->kn_sfflags) {
405#if XNU_TARGET_OS_OSX
406 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
407 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
408 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
409 }
410
411 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
412 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
413 kn->kn_sfflags |= NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
414 }
415#endif /* XNU_TARGET_OS_OSX */
416
417 KNOTE_ATTACH(&memorystatus_klist, kn);
418 } else {
419 error = ENOTSUP;
420 }
421
422 memorystatus_klist_unlock();
423
424 return error;
425}
426
427void
428memorystatus_knote_unregister(struct knote *kn __unused)
429{
430 memorystatus_klist_lock();
431 KNOTE_DETACH(&memorystatus_klist, kn);
432 memorystatus_klist_unlock();
433}
434
435#if VM_PRESSURE_EVENTS
436
437#if CONFIG_JETSAM
438
439static thread_call_t sustained_pressure_handler_thread_call;
440int memorystatus_should_kill_on_sustained_pressure = 1;
441/* Count the number of sustained pressure kills we've done since boot. */
442uint64_t memorystatus_kill_on_sustained_pressure_count = 0;
443uint64_t memorystatus_kill_on_sustained_pressure_window_s = 60 * 10; /* 10 Minutes */
444uint64_t memorystatus_kill_on_sustained_pressure_delay_ms = 500; /* .5 seconds */
445
446#if DEVELOPMENT || DEBUG
447SYSCTL_INT(_kern, OID_AUTO, memorystatus_should_kill_on_sustained_pressure, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_should_kill_on_sustained_pressure, 0, "");
448#endif /* DEVELOPMENT || DEBUG */
449SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_count, CTLFLAG_RD | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_count, "");
450SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_window_s, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_window_s, "");
451SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_kill_on_sustained_pressure_delay_ms, CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_kill_on_sustained_pressure_delay_ms, "");
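
/*
 * The tunables above surface as kern.* sysctls; an illustrative way to widen
 * the sustained-pressure window from user space (root required) would be:
 *
 *	sysctl -w kern.memorystatus_kill_on_sustained_pressure_window_s=1200
 *	sysctl -w kern.memorystatus_kill_on_sustained_pressure_delay_ms=1000
 */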
452
453static void sustained_pressure_handler(void*, void*);
454#endif /* CONFIG_JETSAM */
455static thread_call_t memorystatus_notify_update_telemetry_thread_call;
456static void update_footprints_for_telemetry(void*, void*);
457
458
459void
460memorystatus_notify_init()
461{
462#if CONFIG_JETSAM
463 sustained_pressure_handler_thread_call = thread_call_allocate_with_options(sustained_pressure_handler, NULL, THREAD_CALL_PRIORITY_KERNEL_HIGH, THREAD_CALL_OPTIONS_ONCE);
464#endif /* CONFIG_JETSAM */
	memorystatus_notify_update_telemetry_thread_call = thread_call_allocate_with_options(update_footprints_for_telemetry, NULL, THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
466}
467
468#if CONFIG_MEMORYSTATUS
469
470inline int
471memorystatus_send_note(int event_code, void *data, uint32_t data_length)
472{
473 int ret;
474 struct kev_msg ev_msg;
475
476 ev_msg.vendor_code = KEV_VENDOR_APPLE;
477 ev_msg.kev_class = KEV_SYSTEM_CLASS;
478 ev_msg.kev_subclass = KEV_MEMORYSTATUS_SUBCLASS;
479
480 ev_msg.event_code = event_code;
481
482 ev_msg.dv[0].data_length = data_length;
483 ev_msg.dv[0].data_ptr = data;
484 ev_msg.dv[1].data_length = 0;
485
	ret = kev_post_msg(&ev_msg);
487 if (ret) {
488 memorystatus_log_error("%s: kev_post_msg() failed, err %d\n", __func__, ret);
489 }
490
491 return ret;
492}
493
494boolean_t
495memorystatus_warn_process(const proc_t p, __unused boolean_t is_active, __unused boolean_t is_fatal, boolean_t limit_exceeded)
496{
497 /*
498 * This function doesn't take a reference to p or lock it. So it better be the current process.
499 */
500 assert(p == current_proc());
501 pid_t pid = proc_getpid(p);
502 boolean_t ret = FALSE;
503 boolean_t found_knote = FALSE;
504 struct knote *kn = NULL;
505 int send_knote_count = 0;
506 uint32_t platform;
507 platform = proc_platform(p);
508
509 /*
510 * See comment in sysctl_memorystatus_vm_pressure_send.
511 */
512
513 memorystatus_klist_lock();
514
515 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
516 proc_t knote_proc = knote_get_kq(kn)->kq_p;
517 pid_t knote_pid = proc_getpid(knote_proc);
518
519 if (knote_pid == pid) {
520 /*
521 * By setting the "fflags" here, we are forcing
522 * a process to deal with the case where it's
523 * bumping up into its memory limits. If we don't
524 * do this here, we will end up depending on the
525 * system pressure snapshot evaluation in
526 * filt_memorystatus().
527 */
528
529 /*
530 * The type of notification and the frequency are different between
531 * embedded and desktop.
532 *
533 * Embedded processes register for global pressure notifications
534 * (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) via UIKit
535 * (see applicationDidReceiveMemoryWarning in UIKit). We'll warn them here if
			 * they are near their memory limit. filt_memorystatus() will warn them based
537 * on the system pressure level.
538 *
539 * On desktop, (NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)
			 * are only expected to fire for system level warnings. Desktop processes
541 * register for NOTE_MEMORYSTATUS_PROC_LIMIT_WARN
542 * if they want to be warned when they approach their limit
543 * and for NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL to be warned when they
544 * exceed their limit.
545 *
546 * On embedded we continuously warn processes that are approaching their
			 * memory limit. However, on desktop we only send one warning per
			 * active/inactive state if the limit is soft.
549 *
550 */
551 if (platform == PLATFORM_MACOS || platform == PLATFORM_MACCATALYST || platform == PLATFORM_DRIVERKIT) {
552 if (!limit_exceeded) {
553 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
554 found_knote = TRUE;
555 if (!is_fatal) {
556 /*
557 * Restrict proc_limit_warn notifications when
558 * non-fatal (soft) limit is at play.
559 */
560 if (is_active) {
561 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE) {
562 /*
563 * Mark this knote for delivery.
564 */
565 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
566 /*
567 * And suppress it from future notifications.
568 */
569 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_ACTIVE;
570 send_knote_count++;
571 }
572 } else {
573 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE) {
574 /*
575 * Mark this knote for delivery.
576 */
577 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
578 /*
579 * And suppress it from future notifications.
580 */
581 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_WARN_INACTIVE;
582 send_knote_count++;
583 }
584 }
585 } else {
586 /*
587 * No restriction on proc_limit_warn notifications when
588 * fatal (hard) limit is at play.
589 */
590 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
591 send_knote_count++;
592 }
593 }
594 } else {
595 /*
					 * Send this notification when a process has exceeded a soft limit.
597 */
598
599 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
600 found_knote = TRUE;
601 if (!is_fatal) {
602 /*
603 * Restrict critical notifications for soft limits.
604 */
605
606 if (is_active) {
607 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE) {
608 /*
609 * Suppress future proc_limit_critical notifications
610 * for the active soft limit.
611 */
612 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_ACTIVE;
613 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
614 send_knote_count++;
615 }
616 } else {
617 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE) {
618 /*
619 * Suppress future proc_limit_critical_notifications
620 * for the inactive soft limit.
621 */
622 kn->kn_sfflags &= ~NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL_INACTIVE;
623 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
624 send_knote_count++;
625 }
626 }
627 } else {
628 /*
629 * We should never be trying to send a critical notification for
630 * a hard limit... the process would be killed before it could be
631 * received.
632 */
633 panic("Caught sending pid %d a critical warning for a fatal limit.", pid);
634 }
635 }
636 }
637 } else {
638 if (!limit_exceeded) {
639 /*
640 * Intentionally set either the unambiguous limit warning,
641 * the system-wide critical or the system-wide warning
642 * notification bit.
643 */
644
645 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) {
646 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_WARN;
647 found_knote = TRUE;
648 send_knote_count++;
649 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
650 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
651 found_knote = TRUE;
652 send_knote_count++;
653 } else if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
654 kn->kn_fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
655 found_knote = TRUE;
656 send_knote_count++;
657 }
658 } else {
659 /*
660 * Send this notification when a process has exceeded a soft limit.
661 */
662 if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) {
663 kn->kn_fflags = NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL;
664 found_knote = TRUE;
665 send_knote_count++;
666 }
667 }
668 }
669 }
670 }
671
672 if (found_knote) {
673 if (send_knote_count > 0) {
674 KNOTE(&memorystatus_klist, 0);
675 }
676 ret = TRUE;
677 }
678
679 memorystatus_klist_unlock();
680
681 return ret;
682}
683
684/*
685 * Can only be set by the current task on itself.
686 */
687int
688memorystatus_low_mem_privileged_listener(uint32_t op_flags)
689{
690 boolean_t set_privilege = FALSE;
691 /*
692 * Need an entitlement check here?
693 */
694 if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE) {
695 set_privilege = TRUE;
696 } else if (op_flags == MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_DISABLE) {
697 set_privilege = FALSE;
698 } else {
699 return EINVAL;
700 }
701
	return task_low_mem_privileged_listener(current_task(), set_privilege, NULL);
703}
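
/*
 * Illustrative sketch: this is reached from userspace through the private
 * memorystatus_control() syscall, e.g. (current process only):
 *
 *	memorystatus_control(MEMORYSTATUS_CMD_PRIVILEGED_LISTENER_ENABLE, 0, 0, NULL, 0);
 */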
704
705int
706memorystatus_send_pressure_note(pid_t pid)
707{
708 memorystatus_log_debug("memorystatus_send_pressure_note(): pid %d\n", pid);
	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
710}
711
712boolean_t
713memorystatus_is_foreground_locked(proc_t p)
714{
715 return (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
716 (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT);
717}
718
719/*
720 * This is meant for stackshot and kperf -- it does not take the proc_list_lock
721 * to access the p_memstat_dirty field.
722 */
723void
724memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
725{
726 if (!v) {
727 *is_dirty = FALSE;
728 *is_dirty_tracked = FALSE;
729 *allow_idle_exit = FALSE;
730 } else {
731 proc_t p = (proc_t)v;
732 *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
733 *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
734 *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
735 }
736}
737
738boolean_t
739memorystatus_bg_pressure_eligible(proc_t p)
740{
741 boolean_t eligible = FALSE;
742
743 proc_list_lock();
744
745 memorystatus_log_debug("memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", proc_getpid(p), p->p_memstat_state);
746
747 /* Foreground processes have already been dealt with at this point, so just test for eligibility */
748 if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
749 eligible = TRUE;
750 }
751
752 if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
753 /*
754 * IDLE and IDLE_DEFERRED bands contain processes
755 * that have dropped memory to be under their inactive
756 * memory limits. And so they can't really give back
757 * anything.
758 */
759 eligible = FALSE;
760 }
761
762 proc_list_unlock();
763
764 return eligible;
765}
766
767void
768memorystatus_send_low_swap_note(void)
769{
770 struct knote *kn = NULL;
771
772 memorystatus_klist_lock();
773 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
774 /* We call is_knote_registered_modify_task_pressure_bits to check if the sfflags for the
775 * current note contain NOTE_MEMORYSTATUS_LOW_SWAP. Once we find one note in the memorystatus_klist
776 * that has the NOTE_MEMORYSTATUS_LOW_SWAP flags in its sfflags set, we call KNOTE with
		 * kMemorystatusLowSwap as the hint to process and update all knotes on the memorystatus_klist accordingly. */
778 if (is_knote_registered_modify_task_pressure_bits(kn, NOTE_MEMORYSTATUS_LOW_SWAP, NULL, 0, 0) == TRUE) {
779 KNOTE(&memorystatus_klist, kMemorystatusLowSwap);
780 break;
781 }
782 }
783
784 memorystatus_klist_unlock();
785}
786
787#endif /* CONFIG_MEMORYSTATUS */
788
789/*
790 * Notification telemetry
791 */
792CA_EVENT(memorystatus_pressure_interval,
793 CA_INT, num_processes_registered,
794 CA_INT, num_notifications_sent,
795 CA_INT, max_level,
796 CA_INT, num_transitions,
797 CA_INT, num_kills,
798 CA_INT, duration);
799static CA_EVENT_TYPE(memorystatus_pressure_interval) memorystatus_pressure_interval_telemetry;
800
801CA_EVENT(memorystatus_proc_notification,
802 CA_INT, footprint_before_notification,
803 CA_INT, footprint_1_min_after_first_warning,
804 CA_INT, footprint_5_min_after_first_warning,
805 CA_INT, footprint_20_min_after_first_warning,
806 CA_INT, footprint_1_min_after_first_critical,
807 CA_INT, footprint_5_min_after_first_critical,
808 CA_INT, footprint_20_min_after_first_critical,
809 CA_INT, order_within_list,
810 CA_INT, num_notifications_sent,
811 CA_INT, time_between_warning_and_critical,
812 CA_STATIC_STRING(CA_PROCNAME_LEN), proc_name);
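
/*
 * memorystatus_pressure_interval describes one whole elevated-pressure episode
 * and is finalized/sent when the level returns to normal; a
 * memorystatus_proc_notification event is sent for each registered process
 * that actually received a warning or critical notification during that
 * episode (see memorystatus_pressure_telemetry_send()).
 */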
813
814/* The send timestamps for the first notifications are stored in the knote's kn_sdata field */
815#define KNOTE_SEND_TIMESTAMP_WARNING_INDEX 0
816#define KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX 1
817
818/* The footprint history for this task is stored in the knote's kn_ext array. */
819struct knote_footprint_history {
820 uint32_t kfh_starting_footprint;
821 uint32_t kfh_footprint_after_warn_1; /* 1 minute after first warning notification */
822 uint32_t kfh_footprint_after_warn_5; /* 5 minutes after first warning notification */
823 uint32_t kfh_footprint_after_warn_20; /* 20 minutes after first warning notification */
824 uint32_t kfh_footprint_after_critical_1; /* 1 minute after first critical notification */
825 uint32_t kfh_footprint_after_critical_5; /* 5 minutes after first critical notification */
826 uint32_t kfh_footprint_after_critical_20; /* 20 minutes after first critical notification */
827 uint16_t kfh_num_notifications;
828 uint16_t kfh_notification_order;
829} __attribute__((packed));
830
831
832static_assert(sizeof(struct knote_footprint_history) <= sizeof(uint64_t) * 4, "footprint history fits in knote extensions");
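
/*
 * Layout recap: the two 32-bit send timestamps (seconds) are packed into the
 * single 64-bit kn_sdata field (warning in slot 0, critical in slot 1), and
 * the footprint history above occupies the four 64-bit kn_ext slots:
 * 7 * sizeof(uint32_t) + 2 * sizeof(uint16_t) = 28 + 4 = 32 bytes = 4 * 8.
 */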
833
834static void
835mark_knote_send_time(struct knote *kn, task_t task, int knote_pressure_level, uint16_t order_within_list)
836{
837 uint32_t *timestamps;
838 uint32_t index;
839 uint64_t curr_ts, curr_ts_seconds;
840 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
841 if (knote_pressure_level != NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
842 timestamps = (uint32_t *)&(kn->kn_sdata);
843 index = knote_pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN ?
844 KNOTE_SEND_TIMESTAMP_WARNING_INDEX : KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX;
845 if (timestamps[index] == 0) {
846 /* First notification for this level since pressure elevated from normal. */
847 curr_ts = mach_absolute_time();
848 curr_ts_seconds = 0;
			absolutetime_to_nanoseconds(curr_ts, &curr_ts_seconds);
850 curr_ts_seconds /= NSEC_PER_SEC;
851
852 timestamps[index] = (uint32_t)MIN(UINT32_MAX, curr_ts_seconds);
853
854 /* Record task initial footprint */
855 if (timestamps[index == KNOTE_SEND_TIMESTAMP_WARNING_INDEX ? KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX : KNOTE_SEND_TIMESTAMP_WARNING_INDEX] == 0) {
856 /*
857 * First notification at any level since pressure elevated from normal.
858 * Record the footprint and our order in the notification list.
859 */
				footprint_history->kfh_starting_footprint = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (1UL << 20));
861 footprint_history->kfh_notification_order = order_within_list;
862 }
863 }
864 }
865 footprint_history->kfh_num_notifications++;
866}
867
868/*
869 * Records the current footprint for this task in the knote telemetry.
870 *
871 * Returns the soonest absolutetime when this footprint history should be updated again.
872 */
873static uint64_t
874update_knote_footprint_history(struct knote *kn, task_t task, uint64_t curr_ts)
875{
876 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
877 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
878 uint64_t warning_send_time, critical_send_time, minutes_since_warning = UINT64_MAX, minutes_since_critical = UINT64_MAX;
879 warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
880 critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
	uint32_t task_phys_footprint_mb = (uint32_t) MIN(UINT32_MAX, get_task_phys_footprint(task) / (1UL << 20));
882 uint64_t next_run = UINT64_MAX, absolutetime_in_minute = 0, minutes_since_last_notification = 0, curr_ts_s;
	absolutetime_to_nanoseconds(curr_ts, &curr_ts_s);
	nanoseconds_to_absolutetime(60 * NSEC_PER_SEC, &absolutetime_in_minute);
885 curr_ts_s /= NSEC_PER_SEC;
886
887 if (warning_send_time != 0) {
888 /* This task received a warning notification. */
889 minutes_since_warning = (curr_ts_s - warning_send_time) / 60;
890 if (footprint_history->kfh_footprint_after_warn_1 == 0 && minutes_since_warning >= 1) {
891 footprint_history->kfh_footprint_after_warn_1 = task_phys_footprint_mb;
892 }
893 if (footprint_history->kfh_footprint_after_warn_5 == 0 && minutes_since_warning >= 5) {
894 footprint_history->kfh_footprint_after_warn_5 = task_phys_footprint_mb;
895 }
896 if (footprint_history->kfh_footprint_after_warn_20 == 0 && minutes_since_warning >= 20) {
897 footprint_history->kfh_footprint_after_warn_20 = task_phys_footprint_mb;
898 }
899 }
900 if (critical_send_time != 0) {
901 /* This task received a critical notification. */
902 minutes_since_critical = (curr_ts_s - critical_send_time) / 60;
903 if (footprint_history->kfh_footprint_after_critical_1 == 0 && minutes_since_critical >= 1) {
904 footprint_history->kfh_footprint_after_critical_1 = task_phys_footprint_mb;
905 }
906 if (footprint_history->kfh_footprint_after_critical_5 == 0 && minutes_since_critical >= 5) {
907 footprint_history->kfh_footprint_after_critical_5 = task_phys_footprint_mb;
908 }
909 if (footprint_history->kfh_footprint_after_critical_20 == 0 && minutes_since_critical >= 20) {
910 footprint_history->kfh_footprint_after_critical_20 = task_phys_footprint_mb;
911 }
912 }
913
914 minutes_since_last_notification = MIN(minutes_since_warning, minutes_since_critical);
915 if (minutes_since_last_notification < 20) {
916 if (minutes_since_last_notification < 5) {
917 if (minutes_since_last_notification < 1) {
918 next_run = curr_ts + absolutetime_in_minute;
919 } else {
920 next_run = curr_ts + (absolutetime_in_minute * 5);
921 }
922 } else {
923 next_run = curr_ts + (absolutetime_in_minute * 20);
924 }
925 }
926
927 return next_run;
928}
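
/*
 * Worked example of the schedule above: if the first warning notification
 * went out 3 minutes ago, the 1-minute sample has already been recorded, so
 * the next update is requested 5 minutes from now; once the 20-minute samples
 * have been taken, next_run stays at UINT64_MAX and no further update is
 * scheduled.
 */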
929
930extern char *proc_name_address(void *p);
931/*
932 * Attempt to send the given level telemetry event.
933 * Finalizes the duration.
934 * Clears the src_event struct.
935 */
936static void
937memorystatus_pressure_interval_send(CA_EVENT_TYPE(memorystatus_pressure_interval) *src_event)
938{
939 uint64_t duration_nanoseconds = 0;
940 uint64_t curr_ts = mach_absolute_time();
941 src_event->duration = curr_ts - src_event->duration;
	absolutetime_to_nanoseconds(src_event->duration, &duration_nanoseconds);
943 src_event->duration = (int64_t) (duration_nanoseconds / NSEC_PER_SEC);
944
945 /*
946 * Drop the event rather than block for memory. We should be in a normal pressure level now,
947 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
948 */
949 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_pressure_interval, Z_NOWAIT);
950 if (event_wrapper) {
		memcpy(event_wrapper->data, src_event, sizeof(CA_EVENT_TYPE(memorystatus_pressure_interval)));
952 CA_EVENT_SEND(event_wrapper);
953 }
954 src_event->num_processes_registered = 0;
955 src_event->num_notifications_sent = 0;
956 src_event->max_level = 0;
957 src_event->num_transitions = 0;
958 src_event->num_kills = 0;
959 src_event->duration = 0;
960}
961
962
963/*
964 * Attempt to send the per-proc telemetry events.
965 * Clears the footprint histories on the knotes.
966 */
967static void
968memorystatus_pressure_proc_telemetry_send(void)
969{
970 struct knote *kn = NULL;
971 memorystatus_klist_lock();
972 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
973 proc_t p = PROC_NULL;
974 struct knote_footprint_history *footprint_history = (struct knote_footprint_history *)kn->kn_ext;
975 uint32_t *timestamps = (uint32_t *)&(kn->kn_sdata);
976 uint32_t warning_send_time = timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX];
977 uint32_t critical_send_time = timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX];
978 CA_EVENT_TYPE(memorystatus_proc_notification) * event = NULL;
979 if (warning_send_time != 0 || critical_send_time != 0) {
980 /*
981 * Drop the event rather than block for memory. We should be in a normal pressure level now,
982 * but we don't want to end up blocked in page_wait if there's a sudden spike in pressure.
983 */
984 ca_event_t event_wrapper = CA_EVENT_ALLOCATE_FLAGS(memorystatus_proc_notification, Z_NOWAIT | Z_ZERO);
985 if (event_wrapper) {
986 event = event_wrapper->data;
987
988 event->footprint_before_notification = footprint_history->kfh_starting_footprint;
989 event->footprint_1_min_after_first_warning = footprint_history->kfh_footprint_after_warn_1;
990 event->footprint_5_min_after_first_warning = footprint_history->kfh_footprint_after_warn_5;
991 event->footprint_20_min_after_first_warning = footprint_history->kfh_footprint_after_warn_20;
992 event->footprint_1_min_after_first_critical = footprint_history->kfh_footprint_after_critical_1;
993 event->footprint_5_min_after_first_critical = footprint_history->kfh_footprint_after_critical_5;
994 event->footprint_20_min_after_first_critical = footprint_history->kfh_footprint_after_critical_20;
995 event->num_notifications_sent = footprint_history->kfh_num_notifications;
996 if (warning_send_time != 0 && critical_send_time != 0) {
997 event->time_between_warning_and_critical = (critical_send_time - warning_send_time) / 60; // Minutes
998 }
999 event->order_within_list = footprint_history->kfh_notification_order;
1000
				p = proc_ref(knote_get_kq(kn)->kq_p, false);
1002 if (p == NULL) {
1003 CA_EVENT_DEALLOCATE(event_wrapper);
1004 continue;
1005 }
				strlcpy(event->proc_name, proc_name_address(p), sizeof(event->proc_name));
1007
1008 proc_rele(p);
1009 CA_EVENT_SEND(event_wrapper);
1010 }
1011 }
		memset(footprint_history, 0, sizeof(*footprint_history));
1013 timestamps[KNOTE_SEND_TIMESTAMP_WARNING_INDEX] = 0;
1014 timestamps[KNOTE_SEND_TIMESTAMP_CRITICAL_INDEX] = 0;
1015 }
1016 memorystatus_klist_unlock();
1017}
1018
1019/*
1020 * Send all telemetry associated with the increased pressure interval.
1021 */
1022static void
1023memorystatus_pressure_telemetry_send(void)
1024{
1025 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_NOTOWNED);
	memorystatus_pressure_interval_send(&memorystatus_pressure_interval_telemetry);
1027 memorystatus_pressure_proc_telemetry_send();
1028}
1029
1030
1031/*
1032 * kn_max - knote
1033 *
1034 * knote_pressure_level - to check if the knote is registered for this notification level.
1035 *
1036 * task - task whose bits we'll be modifying
1037 *
1038 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
1039 *
1040 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
1041 *
1042 */
1043
1044static boolean_t
1045is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
1046{
1047 if (kn_max->kn_sfflags & knote_pressure_level) {
		if (pressure_level_to_clear && task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
			task_clear_has_been_notified(task, pressure_level_to_clear);
		}

		task_mark_has_been_notified(task, pressure_level_to_set);
1053 return TRUE;
1054 }
1055
1056 return FALSE;
1057}
1058
1059static void
1060memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear)
1061{
1062 struct knote *kn = NULL;
1063
1064 memorystatus_klist_lock();
1065
1066 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1067 proc_t p = knote_get_kq(kn)->kq_p;
1068
1069 if (p == proc_ref(p, false)) {
			task_clear_has_been_notified(proc_task(p), pressure_level_to_clear);
1071 proc_rele(p);
1072 }
1073 }
1074
1075 memorystatus_klist_unlock();
1076}
1077
1078/*
1079 * Used by the vm_pressure_thread which is
1080 * signalled from within vm_pageout_scan().
1081 */
1082
1083void
1084consider_vm_pressure_events(void)
1085{
1086 vm_dispatch_memory_pressure();
1087}
1088
1089static void
1090vm_dispatch_memory_pressure(void)
1091{
1092 memorystatus_update_vm_pressure(FALSE);
1093}
1094
1095static struct knote *
1096vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int level, boolean_t target_foreground_process, uint64_t *next_telemetry_update)
1097{
1098 struct knote *kn = NULL, *kn_max = NULL;
1099 uint64_t resident_max = 0;/* MB */
1100 int selected_task_importance = 0;
1101 static int pressure_snapshot = -1;
1102 boolean_t pressure_increase = FALSE;
1103 uint64_t curr_ts = mach_absolute_time();
1104 *next_telemetry_update = UINT64_MAX;
1105
1106 if (pressure_snapshot == -1) {
1107 /*
1108 * Initial snapshot.
1109 */
1110 pressure_snapshot = level;
1111 pressure_increase = TRUE;
1112 } else {
1113 if (level && (level >= pressure_snapshot)) {
1114 pressure_increase = TRUE;
1115 } else {
1116 pressure_increase = FALSE;
1117 }
1118
1119 pressure_snapshot = level;
1120 }
1121
1122 if (pressure_increase == TRUE) {
1123 /*
1124 * We'll start by considering the largest
1125 * unimportant task in our list.
1126 */
1127 selected_task_importance = INT_MAX;
1128 } else {
1129 /*
1130 * We'll start by considering the largest
1131 * important task in our list.
1132 */
1133 selected_task_importance = 0;
1134 }
1135
1136 SLIST_FOREACH(kn, candidate_list, kn_selnext) {
1137 uint64_t resident_size = 0;/* MB */
1138 proc_t p = PROC_NULL;
1139 struct task* t = TASK_NULL;
1140 int curr_task_importance = 0;
1141 uint64_t telemetry_update = 0;
1142 boolean_t consider_knote = FALSE;
1143 boolean_t privileged_listener = FALSE;
1144
		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1146 if (p == PROC_NULL) {
1147 continue;
1148 }
1149
1150#if CONFIG_MEMORYSTATUS
1151 if (target_foreground_process == TRUE && !memorystatus_is_foreground_locked(p)) {
1152 /*
1153 * Skip process not marked foreground.
1154 */
1155 proc_rele(p);
1156 continue;
1157 }
1158#endif /* CONFIG_MEMORYSTATUS */
1159
1160 t = (struct task *)(proc_task(p));
		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1162 *next_telemetry_update = MIN(*next_telemetry_update, telemetry_update);
1163
1164 vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(level);
1165
1166 if ((kn->kn_sfflags & dispatch_level) == 0) {
1167 proc_rele(p);
1168 continue;
1169 }
1170
1171#if CONFIG_MEMORYSTATUS
1172 if (target_foreground_process == FALSE && !memorystatus_bg_pressure_eligible(p)) {
1173 VM_PRESSURE_DEBUG(1, "[vm_pressure] skipping process %d\n", proc_getpid(p));
1174 proc_rele(p);
1175 continue;
1176 }
1177#endif /* CONFIG_MEMORYSTATUS */
1178
1179#if XNU_TARGET_OS_OSX
		curr_task_importance = task_importance_estimate(t);
1181#else /* XNU_TARGET_OS_OSX */
1182 curr_task_importance = p->p_memstat_effectivepriority;
1183#endif /* XNU_TARGET_OS_OSX */
1184
1185 /*
1186 * Privileged listeners are only considered in the multi-level pressure scheme
1187 * AND only if the pressure is increasing.
1188 */
1189 if (level > 0) {
			if (task_has_been_notified(t, level) == FALSE) {
1191 /*
1192 * Is this a privileged listener?
1193 */
				if (task_low_mem_privileged_listener(t, FALSE, &privileged_listener) == 0) {
1195 if (privileged_listener) {
1196 kn_max = kn;
1197 proc_rele(p);
1198 goto done_scanning;
1199 }
1200 }
1201 } else {
1202 proc_rele(p);
1203 continue;
1204 }
1205 } else if (level == 0) {
1206 /*
1207 * Task wasn't notified when the pressure was increasing and so
1208 * no need to notify it that the pressure is decreasing.
1209 */
			if ((task_has_been_notified(t, kVMPressureWarning) == FALSE) && (task_has_been_notified(t, kVMPressureCritical) == FALSE)) {
1211 proc_rele(p);
1212 continue;
1213 }
1214 }
1215
1216 /*
1217 * We don't want a small process to block large processes from
1218 * being notified again. <rdar://problem/7955532>
1219 */
1220 resident_size = (get_task_phys_footprint(t)) / (1024 * 1024ULL); /* MB */
1221
1222 if (resident_size >= vm_pressure_task_footprint_min) {
1223 if (level > 0) {
1224 /*
1225 * Warning or Critical Pressure.
1226 */
1227 if (pressure_increase) {
1228 if ((curr_task_importance < selected_task_importance) ||
1229 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1230 /*
1231 * We have found a candidate process which is:
1232 * a) at a lower importance than the current selected process
1233 * OR
1234 * b) has importance equal to that of the current selected process but is larger
1235 */
1236
1237 consider_knote = TRUE;
1238 }
1239 } else {
1240 if ((curr_task_importance > selected_task_importance) ||
1241 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1242 /*
1243 * We have found a candidate process which is:
1244 * a) at a higher importance than the current selected process
1245 * OR
1246 * b) has importance equal to that of the current selected process but is larger
1247 */
1248
1249 consider_knote = TRUE;
1250 }
1251 }
1252 } else if (level == 0) {
1253 /*
1254 * Pressure back to normal.
1255 */
1256 if ((curr_task_importance > selected_task_importance) ||
1257 ((curr_task_importance == selected_task_importance) && (resident_size > resident_max))) {
1258 consider_knote = TRUE;
1259 }
1260 }
1261
1262 if (consider_knote) {
1263 resident_max = resident_size;
1264 kn_max = kn;
1265 selected_task_importance = curr_task_importance;
1266 consider_knote = FALSE; /* reset for the next candidate */
1267 }
1268 } else {
1269 /* There was no candidate with enough resident memory to scavenge */
1270 VM_PRESSURE_DEBUG(0, "[vm_pressure] threshold failed for pid %d with %llu resident...\n", proc_getpid(p), resident_size);
1271 }
1272 proc_rele(p);
1273 }
1274
1275done_scanning:
1276 if (kn_max) {
1277 VM_DEBUG_CONSTANT_EVENT(vm_pressure_event, VM_PRESSURE_EVENT, DBG_FUNC_NONE, proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max, 0, 0);
1278 VM_PRESSURE_DEBUG(1, "[vm_pressure] sending event to pid %d with %llu resident\n", proc_getpid(knote_get_kq(kn_max)->kq_p), resident_max);
1279 }
1280
1281 return kn_max;
1282}
1283
1284/*
 * To avoid notification storms in a system with sawtooth behavior of pressure levels, e.g.:
 * Normal -> warning (notify clients) -> critical (notify) -> warning (notify) -> critical (notify) -> warning (notify)...
 *
 * We have 'resting' periods: WARNING_NOTIFICATION_RESTING_PERIOD and CRITICAL_NOTIFICATION_RESTING_PERIOD
 *
 * So it would look like:
 * Normal -> warning (notify) -> critical (notify) -> warning (notify if it has been RestPeriod since last warning) -> critical (notify if it has been RestPeriod since last critical) -> ...
 *
 * That's what the two timestamps below signify.
1294 */
1295
1296uint64_t next_warning_notification_sent_at_ts = 0;
1297uint64_t next_critical_notification_sent_at_ts = 0;
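
/*
 * Worked example with the 25-second resting periods defined above: if the
 * last round of warning-level notifications finished at time T, another
 * warning round will not start before roughly T + 25s, even if the level
 * bounces between warning and critical in the meantime; critical
 * notifications keep their own, independent T + 25s window.
 */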
1298
1299boolean_t memorystatus_manual_testing_on = FALSE;
1300vm_pressure_level_t memorystatus_manual_testing_level = kVMPressureNormal;
1301
1302unsigned int memorystatus_sustained_pressure_maximum_band = JETSAM_PRIORITY_IDLE;
1303#if DEVELOPMENT || DEBUG
1304SYSCTL_INT(_kern, OID_AUTO, memorystatus_sustained_pressure_maximum_band, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &memorystatus_sustained_pressure_maximum_band, 0, "");
1305#endif /* DEVELOPMENT || DEBUG */
1306
1307#if CONFIG_JETSAM
1308
1309/*
1310 * TODO(jason): The memorystatus thread should be responsible for this
1311 * It can just check how long the pressure level has been at warning and the timestamp
1312 * of the last sustained pressure kill.
1313 */
1314static void
1315sustained_pressure_handler(void* arg0 __unused, void* arg1 __unused)
1316{
1317 int max_kills = 0, kill_count = 0;
1318 /*
1319 * Pressure has been elevated for too long.
1320 * We don't want to leave the system in this state as it can delay background
1321 * work indefinitely & drain battery.
1322 *
1323 * Try to return the system to normal via jetsam.
1324 * We'll run through the idle band up to 2 times.
1325 * If the pressure hasn't been relieved by then, the problem is memory
1326 * consumption in a higher band and this churn is probably doing more harm than good.
1327 */
1328 max_kills = memorystatus_get_proccnt_upto_priority(memorystatus_sustained_pressure_maximum_band) * 2;
1329 memorystatus_log("memorystatus: Pressure level has been elevated for too long. killing up to %d idle processes\n", max_kills);
1330 while (memorystatus_vm_pressure_level != kVMPressureNormal && kill_count < max_kills) {
1331 boolean_t killed = memorystatus_kill_on_sustained_pressure();
1332 if (killed) {
1333 /*
1334 * Pause before our next kill & see if pressure reduces.
1335 */
1336 delay((int)(memorystatus_kill_on_sustained_pressure_delay_ms * NSEC_PER_MSEC / NSEC_PER_USEC));
1337 kill_count++;
1338 memorystatus_kill_on_sustained_pressure_count++;
1339 /* TODO(jason): Should use os_atomic but requires rdar://76310894. */
1340 memorystatus_pressure_interval_telemetry.num_kills++;
1341 } else {
1342 /* Nothing left to kill */
1343 break;
1344 }
1345 }
1346 if (memorystatus_vm_pressure_level != kVMPressureNormal) {
1347 memorystatus_log("memorystatus: Killed %d idle processes due to sustained pressure, but device didn't quiesce. Giving up.\n", kill_count);
1348 }
1349}
1350
1351#endif /* CONFIG_JETSAM */
1352
1353/*
1354 * Returns the number of processes registered for notifications at this level.
1355 */
1356static size_t
1357memorystatus_klist_length(int level)
1358{
1359 LCK_MTX_ASSERT(&memorystatus_klist_mutex, LCK_MTX_ASSERT_OWNED);
1360 struct knote *kn;
1361 size_t count = 0;
1362 int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level);
1363 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1364 if (kn->kn_sfflags & knote_pressure_level) {
1365 count++;
1366 }
1367 }
1368 return count;
1369}
1370
1371/*
1372 * Updates the footprint telemetry for procs that have received notifications.
1373 */
1374static void
1375update_footprints_for_telemetry(void* arg0 __unused, void* arg1 __unused)
1376{
1377 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1378 struct knote *kn;
1379
1380 memorystatus_klist_lock();
1381 SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
1382 proc_t p = PROC_NULL;
1383 struct task* t = TASK_NULL;
1384 uint64_t telemetry_update;
1385
		p = proc_ref(knote_get_kq(kn)->kq_p, false);
1387 if (p == PROC_NULL) {
1388 continue;
1389 }
1390 t = (struct task *)(proc_task(p));
1391 proc_rele(p);
1392 p = PROC_NULL;
		telemetry_update = update_knote_footprint_history(kn, t, curr_ts);
1394 next_telemetry_update = MIN(next_telemetry_update, telemetry_update);
1395 }
1396 memorystatus_klist_unlock();
1397 if (next_telemetry_update != UINT64_MAX) {
1398 uint64_t next_update_seconds;
		absolutetime_to_nanoseconds(next_telemetry_update, &next_update_seconds);
		next_update_seconds /= NSEC_PER_SEC;
		thread_call_enter_delayed(memorystatus_notify_update_telemetry_thread_call, next_telemetry_update);
1402 }
1403}
1404
1405kern_return_t
1406memorystatus_update_vm_pressure(boolean_t target_foreground_process)
1407{
1408 struct knote *kn_max = NULL;
1409 struct knote *kn_cur = NULL, *kn_temp = NULL;/* for safe list traversal */
1410 pid_t target_pid = -1;
1411 struct klist dispatch_klist = { NULL };
1412 proc_t target_proc = PROC_NULL;
1413 struct task *task = NULL;
1414 boolean_t found_candidate = FALSE;
1415
1416 static vm_pressure_level_t level_snapshot = kVMPressureNormal;
1417 static vm_pressure_level_t prev_level_snapshot = kVMPressureNormal;
1418 boolean_t smoothing_window_started = FALSE;
1419 struct timeval smoothing_window_start_tstamp = {0, 0};
1420 struct timeval curr_tstamp = {0, 0};
1421 int64_t elapsed_msecs = 0;
1422 uint64_t curr_ts = mach_absolute_time(), next_telemetry_update = UINT64_MAX;
1423
1424
1425 uint64_t logging_now;
	absolutetime_to_nanoseconds(curr_ts, &logging_now);
1427#if !CONFIG_JETSAM
1428#define MAX_IDLE_KILLS 100 /* limit the number of idle kills allowed */
1429
1430 int idle_kill_counter = 0;
1431
1432 /*
	 * On desktop we take this opportunity to relieve memory pressure
1434 * by immediately killing idle exitable processes. We use a delay
1435 * to avoid overkill. And we impose a max counter as a fail safe
1436 * in case daemons re-launch too fast.
1437 */
1438 while ((memorystatus_vm_pressure_level != kVMPressureNormal) && (idle_kill_counter < MAX_IDLE_KILLS)) {
1439 if (memorystatus_idle_exit_from_VM() == FALSE) {
1440 /* No idle exitable processes left to kill */
1441 break;
1442 }
1443 idle_kill_counter++;
1444
1445 if (memorystatus_manual_testing_on == TRUE) {
1446 /*
1447 * Skip the delay when testing
1448 * the pressure notification scheme.
1449 */
1450 } else {
			delay(1000000); /* 1 second */
1452 }
1453 }
1454#endif /* !CONFIG_JETSAM */
1455
1456 if (level_snapshot != kVMPressureNormal) {
1457 /*
1458 * Check to see if we are still in the 'resting' period
1459 * after having notified all clients interested in
1460 * a particular pressure level.
1461 */
1462
1463 level_snapshot = memorystatus_vm_pressure_level;
1464
1465 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
1466 if (next_warning_notification_sent_at_ts) {
1467 if (curr_ts < next_warning_notification_sent_at_ts) {
1468 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1469 return KERN_SUCCESS;
1470 }
1471
1472 next_warning_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureWarning);
1474 }
1475 } else if (level_snapshot == kVMPressureCritical) {
1476 if (next_critical_notification_sent_at_ts) {
1477 if (curr_ts < next_critical_notification_sent_at_ts) {
1478 delay(INTER_NOTIFICATION_DELAY * 4 /* 1 sec */);
1479 return KERN_SUCCESS;
1480 }
1481 next_critical_notification_sent_at_ts = 0;
				memorystatus_klist_reset_all_for_level(kVMPressureCritical);
1483 }
1484 }
1485 }
1486
1487#if CONFIG_JETSAM
1488 if (memorystatus_vm_pressure_level == kVMPressureNormal && prev_level_snapshot != kVMPressureNormal) {
1489 if (memorystatus_should_kill_on_sustained_pressure) {
1490 memorystatus_log("memorystatus: Pressure has returned to level %d. Cancelling scheduled jetsam\n", memorystatus_vm_pressure_level);
1491 thread_call_cancel(sustained_pressure_handler_thread_call);
1492 }
1493 } else if (memorystatus_should_kill_on_sustained_pressure && memorystatus_vm_pressure_level != kVMPressureNormal && prev_level_snapshot == kVMPressureNormal) {
1494 /*
1495 * Pressure has increased from normal.
1496 * Hopefully the notifications will relieve it,
1497 * but as a fail-safe we'll trigger jetsam
1498 * after a configurable amount of time.
1499 */
1500 memorystatus_log("memorystatus: Pressure level has increased from %d to %d. Scheduling jetsam.\n", prev_level_snapshot, memorystatus_vm_pressure_level);
1501 uint64_t kill_time;
1502 nanoseconds_to_absolutetime(memorystatus_kill_on_sustained_pressure_window_s * NSEC_PER_SEC, &kill_time);
1503 kill_time += mach_absolute_time();
1504 thread_call_enter_delayed(sustained_pressure_handler_thread_call, kill_time);
1505 }
1506#endif /* CONFIG_JETSAM */
1507
1508 while (1) {
1509 /*
1510 * There is a race window here. But it's not clear
1511 * how much we benefit from having extra synchronization.
1512 */
1513 level_snapshot = memorystatus_vm_pressure_level;
1514
1515 if (prev_level_snapshot > level_snapshot) {
1516 /*
1517 * Pressure decreased? Let's take a little breather
1518 * and see if this condition stays.
1519 */
1520 if (smoothing_window_started == FALSE) {
1521 smoothing_window_started = TRUE;
				microuptime(&smoothing_window_start_tstamp);
1523 }
1524
			microuptime(&curr_tstamp);
			timevalsub(&curr_tstamp, &smoothing_window_start_tstamp);
1527 elapsed_msecs = curr_tstamp.tv_sec * 1000 + curr_tstamp.tv_usec / 1000;
1528
1529 if (elapsed_msecs < VM_PRESSURE_DECREASED_SMOOTHING_PERIOD) {
1530 delay(INTER_NOTIFICATION_DELAY);
1531 continue;
1532 }
1533 }
1534 if (level_snapshot == kVMPressureNormal) {
1535 memorystatus_pressure_telemetry_send();
1536 }
1537 prev_level_snapshot = level_snapshot;
1538 smoothing_window_started = FALSE;
1539 memorystatus_klist_lock();
1540
1541 if (level_snapshot > memorystatus_pressure_interval_telemetry.max_level) {
			memorystatus_pressure_interval_telemetry.num_processes_registered = memorystatus_klist_length(level_snapshot);
1543 memorystatus_pressure_interval_telemetry.max_level = level_snapshot;
1544 memorystatus_pressure_interval_telemetry.num_transitions++;
1545 if (memorystatus_pressure_interval_telemetry.duration == 0) {
1546 /* Set the start timestamp. Duration will be finalized when we send the event. */
1547 memorystatus_pressure_interval_telemetry.duration = curr_ts;
1548 }
1549 }
1550
		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot, target_foreground_process, &next_telemetry_update);
1552
1553 if (kn_max == NULL) {
1554 memorystatus_klist_unlock();
1555
1556 /*
1557 * No more level-based clients to notify.
1558 *
1559 * Start the 'resting' window within which clients will not be re-notified.
1560 */
1561
1562 if (level_snapshot != kVMPressureNormal) {
1563 if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
					nanoseconds_to_absolutetime(WARNING_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1565
1566 /* Next warning notification (if nothing changes) won't be sent before...*/
1567 next_warning_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1568 }
1569
1570 if (level_snapshot == kVMPressureCritical) {
					nanoseconds_to_absolutetime(CRITICAL_NOTIFICATION_RESTING_PERIOD * NSEC_PER_SEC, &curr_ts);
1572
1573 /* Next critical notification (if nothing changes) won't be sent before...*/
1574 next_critical_notification_sent_at_ts = mach_absolute_time() + curr_ts;
1575 }
1576 }
1577 absolutetime_to_nanoseconds(abstime: mach_absolute_time(), result: &logging_now);
1578 if (next_telemetry_update != UINT64_MAX) {
1579 thread_call_enter_delayed(call: memorystatus_notify_update_telemetry_thread_call, deadline: next_telemetry_update);
1580 } else {
1581 thread_call_cancel(call: memorystatus_notify_update_telemetry_thread_call);
1582 }
1583 return KERN_FAILURE;
1584 }
1585
		target_proc = proc_ref(knote_get_kq(kn_max)->kq_p, false);
		if (target_proc == PROC_NULL) {
			memorystatus_klist_unlock();
			continue;
		}

		target_pid = proc_getpid(target_proc);

		task = (struct task *)(proc_task(target_proc));

		if (level_snapshot != kVMPressureNormal) {
			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, 0, kVMPressureWarning) == TRUE) {
					found_candidate = TRUE;
				}
			} else {
				if (level_snapshot == kVMPressureCritical) {
					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, 0, kVMPressureCritical) == TRUE) {
						found_candidate = TRUE;
					}
				}
			}
		} else {
			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
				task_clear_has_been_notified(task, kVMPressureWarning);
				task_clear_has_been_notified(task, kVMPressureCritical);

				found_candidate = TRUE;
			}
		}

		if (found_candidate == FALSE) {
			proc_rele(target_proc);
			memorystatus_klist_unlock();
			continue;
		}

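		/*
		 * Move every knote registered by the chosen pid over to
		 * dispatch_klist so that the KNOTE() below is delivered only to
		 * that process; the knotes are moved back to memorystatus_klist
		 * right after the event fires.
		 */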
		SLIST_FOREACH_SAFE(kn_cur, &memorystatus_klist, kn_selnext, kn_temp) {
			int knote_pressure_level = convert_internal_pressure_level_to_dispatch_level(level_snapshot);

			if (is_knote_registered_modify_task_pressure_bits(kn_cur, knote_pressure_level, task, 0, level_snapshot) == TRUE) {
				proc_t knote_proc = knote_get_kq(kn_cur)->kq_p;
				pid_t knote_pid = proc_getpid(knote_proc);
				if (knote_pid == target_pid) {
					KNOTE_DETACH(&memorystatus_klist, kn_cur);
					KNOTE_ATTACH(&dispatch_klist, kn_cur);
				}
			}
		}
		if (level_snapshot != kVMPressureNormal) {
			mark_knote_send_time(kn_max, task, convert_internal_pressure_level_to_dispatch_level(level_snapshot),
			    (uint16_t) MIN(UINT16_MAX, memorystatus_pressure_interval_telemetry.num_notifications_sent));
			memorystatus_pressure_interval_telemetry.num_notifications_sent++;
		}

		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);

		SLIST_FOREACH_SAFE(kn_cur, &dispatch_klist, kn_selnext, kn_temp) {
			KNOTE_DETACH(&dispatch_klist, kn_cur);
			KNOTE_ATTACH(&memorystatus_klist, kn_cur);
		}

		memorystatus_klist_unlock();

		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
		proc_rele(target_proc);

		if (memorystatus_manual_testing_on == TRUE && target_foreground_process == TRUE) {
			break;
		}

		if (memorystatus_manual_testing_on == TRUE) {
			/*
			 * Testing out the pressure notification scheme.
			 * No need for delays etc.
			 */
		} else {
			uint32_t sleep_interval = INTER_NOTIFICATION_DELAY;
#if CONFIG_JETSAM
			unsigned int page_delta = 0;
			unsigned int skip_delay_page_threshold = 0;

			assert(memorystatus_available_pages_pressure >= memorystatus_available_pages_critical_base);

			page_delta = (memorystatus_available_pages_pressure - memorystatus_available_pages_critical_base) / 2;
			skip_delay_page_threshold = memorystatus_available_pages_pressure - page_delta;

			if (memorystatus_available_pages <= skip_delay_page_threshold) {
				/*
				 * We are nearing the critical mark fast and can't afford to wait between
				 * notifications.
				 */
				sleep_interval = 0;
			}
#endif /* CONFIG_JETSAM */

			if (sleep_interval) {
				delay(sleep_interval);
			}
		}
	}

	return KERN_SUCCESS;
}

static uint32_t
convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
{
	uint32_t dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;

	switch (internal_pressure_level) {
	case kVMPressureNormal:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
		break;
	}

	case kVMPressureWarning:
	case kVMPressureUrgent:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		break;
	}

	case kVMPressureCritical:
	{
		dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
		break;
	}

	default:
		break;
	}

	return dispatch_level;
}

/*
 * Notify any kexts that are waiting for notification that jetsam
 * is approaching the foreground bands. They should use this notification
 * to free cached memory.
 */
void
memorystatus_issue_fg_band_notify(void)
{
	uint64_t now;

	lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
	absolutetime_to_nanoseconds(mach_absolute_time(), &now);
	if (now - memorystatus_jetsam_fg_band_timestamp_ns < memorystatus_jetsam_fg_band_delay_ns) {
		lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
		return;
	}

	if (memorystatus_jetsam_fg_band_waiters > 0) {
		thread_wakeup(&memorystatus_jetsam_fg_band_waiters);
		memorystatus_jetsam_fg_band_waiters = 0;
		memorystatus_jetsam_fg_band_timestamp_ns = now;
	}
	lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);

	/* Notify the buffer cache, file systems, etc. to jettison everything they can. */
	if (consider_buffer_cache_collect != NULL) {
		(void)(*consider_buffer_cache_collect)(1);
	}
}


/*
 * Memorystatus notification debugging support
 */

static int
sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2, oidp)
#if !XNU_TARGET_OS_OSX
	int error = 0;

	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
	if (error) {
		return error;
	}

#endif /* !XNU_TARGET_OS_OSX */
	uint32_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);

	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
}

#if DEBUG || DEVELOPMENT

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#else /* DEBUG || DEVELOPMENT */

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");

#endif /* DEBUG || DEVELOPMENT */
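
/*
 * Reading this sysctl (e.g. `sysctl kern.memorystatus_vm_pressure_level`)
 * reports the current level using the NOTE_MEMORYSTATUS_PRESSURE_* encoding
 * from sys/event.h rather than the kernel-internal kVMPressure* values.
 */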

/*
 * Trigger levels to test the mechanism.
 * Can be used via a sysctl; see the encoding example below the defines.
 */
#define TEST_LOW_MEMORY_TRIGGER_ONE             1
#define TEST_LOW_MEMORY_TRIGGER_ALL             2
#define TEST_PURGEABLE_TRIGGER_ONE              3
#define TEST_PURGEABLE_TRIGGER_ALL              4
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE   5
#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL   6
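
/*
 * The value written to the manual-trigger sysctl packs a TEST_* request into
 * its upper 16 bits and a NOTE_MEMORYSTATUS_PRESSURE_* level into its lower
 * 16 bits (see the decode in the handler below). Illustrative example only,
 * not a canonical invocation:
 *
 *   value = (TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE << 16) | NOTE_MEMORYSTATUS_PRESSURE_WARN;
 *   sysctl -w kern.memorypressure_manual_trigger=<value>
 */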

static int
sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int level = 0;
	int error = 0;
	int pressure_level = 0;
	int trigger_request = 0;
	int force_purge;

	error = sysctl_handle_int(oidp, &level, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	memorystatus_manual_testing_on = TRUE;

	trigger_request = (level >> 16) & 0xFFFF;
	pressure_level = (level & 0xFFFF);

	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
		return EINVAL;
	}
	switch (pressure_level) {
	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
		break;
	default:
		return EINVAL;
	}

	/*
	 * The pressure level is being set from user-space, which uses the
	 * constants in sys/event.h, so translate those values to our
	 * internal pressure levels here.
	 */
	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		memorystatus_manual_testing_level = kVMPressureNormal;
		force_purge = 0;
	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
		memorystatus_manual_testing_level = kVMPressureWarning;
		force_purge = vm_pageout_state.memorystatus_purge_on_warning;
	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
		memorystatus_manual_testing_level = kVMPressureCritical;
		force_purge = vm_pageout_state.memorystatus_purge_on_critical;
	}

	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;

	/* purge according to the new pressure level */
	switch (trigger_request) {
	case TEST_PURGEABLE_TRIGGER_ONE:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		vm_purgeable_object_purge_one_unlocked(force_purge);
		break;
	case TEST_PURGEABLE_TRIGGER_ALL:
	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
		if (force_purge == 0) {
			/* no purging requested */
			break;
		}
		while (vm_purgeable_object_purge_one_unlocked(force_purge)) {
			;
		}
		break;
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
		memorystatus_update_vm_pressure(TRUE);
	}

	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
			continue;
		}
	}

	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
		memorystatus_manual_testing_on = FALSE;
	}

	return 0;
}

SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED,
    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");


SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");

extern int vm_pressure_level_transition_threshold;
SYSCTL_INT(_kern, OID_AUTO, vm_pressure_level_transition_threshold, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_level_transition_threshold, 0, "");

#if DEBUG || DEVELOPMENT
SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");

#if 0
#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
static boolean_t
memorystatus_issue_pressure_kevent(boolean_t pressured)
{
	memorystatus_klist_lock();
	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
	memorystatus_klist_unlock();
	return TRUE;
}
#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
#endif /* 0 */

/*
 * This routine is used for targeted notifications regardless of system memory pressure,
 * and regardless of whether or not the process has already been notified.
 * It bypasses, and has no effect on, the only-one-notification-per-soft-limit policy.
 *
 * "memnote" is the current user.
 */
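
/*
 * The 64-bit value written here packs the target pid into the low 32 bits and
 * an EVFILT_MEMORYSTATUS fflags value into the high 32 bits (see the decode
 * below). Illustrative example only, with a hypothetical pid of 123:
 *
 *   value = ((uint64_t)NOTE_MEMORYSTATUS_PRESSURE_WARN << 32) | 123;
 *   sysctl -w kern.memorystatus_vm_pressure_send=<value>
 */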

static int
sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	/* Need to be root or have memorystatus entitlement */
	if (!kauth_cred_issuser(kauth_cred_get()) && !IOCurrentTaskHasEntitlement(MEMORYSTATUS_ENTITLEMENT)) {
		return EPERM;
	}

	int error = 0, pid = 0;
	struct knote *kn = NULL;
	boolean_t found_knote = FALSE;
	int fflags = 0;         /* filter flags for EVFILT_MEMORYSTATUS */
	uint64_t value = 0;

	error = sysctl_handle_quad(oidp, &value, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	/*
	 * Find the pid in the low 32 bits of the value passed in.
	 */
	pid = (int)(value & 0xFFFFFFFF);

	/*
	 * Find the notification in the high 32 bits of the value passed in.
	 */
	fflags = (int)((value >> 32) & 0xFFFFFFFF);

	/*
	 * For backwards compatibility, when no notification is
	 * passed in, default to NOTE_MEMORYSTATUS_PRESSURE_WARN.
	 */
	if (fflags == 0) {
		fflags = NOTE_MEMORYSTATUS_PRESSURE_WARN;
		// printf("memorystatus_vm_pressure_send: using default notification [0x%x]\n", fflags);
	}

	/* wake up everybody waiting for kVMPressureJetsam */
	if (fflags == NOTE_MEMORYSTATUS_JETSAM_FG_BAND) {
		memorystatus_issue_fg_band_notify();
		return error;
	}

	/*
	 * See event.h ... fflags for EVFILT_MEMORYSTATUS
	 */
	if (!((fflags == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) ||
	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_WARN) ||
	    (fflags == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) ||
	    (fflags == NOTE_MEMORYSTATUS_LOW_SWAP) ||
	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_WARN) ||
	    (fflags == NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL) ||
	    (((fflags & NOTE_MEMORYSTATUS_MSL_STATUS) != 0 &&
	    ((fflags & ~NOTE_MEMORYSTATUS_MSL_STATUS) == 0))))) {
		memorystatus_log_error("memorystatus_vm_pressure_send: notification [0x%x] not supported\n", fflags);
		error = 1;
		return error;
	}

	/*
	 * Forcibly send pid a memorystatus notification.
	 */

	memorystatus_klist_lock();

	SLIST_FOREACH(kn, &memorystatus_klist, kn_selnext) {
		proc_t knote_proc = knote_get_kq(kn)->kq_p;
		pid_t knote_pid = proc_getpid(knote_proc);

		if (knote_pid == pid) {
			/*
			 * Forcibly send this pid a memorystatus notification.
			 */
			kn->kn_fflags = fflags;
			found_knote = TRUE;
		}
	}

	if (found_knote) {
		KNOTE(&memorystatus_klist, 0);
		memorystatus_log_debug("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] sent to process [%d]\n", value, fflags, pid);
		error = 0;
	} else {
		memorystatus_log_error("memorystatus_vm_pressure_send: (value 0x%llx) notification [0x%x] not sent to process [%d] (none registered?)\n", value, fflags, pid);
		error = 1;
	}

	memorystatus_klist_unlock();

	return error;
}

SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_QUAD | CTLFLAG_WR | CTLFLAG_LOCKED | CTLFLAG_MASKED | CTLFLAG_ANYBODY,
    0, 0, &sysctl_memorystatus_vm_pressure_send, "Q", "");

#endif /* DEBUG || DEVELOPMENT */

#endif /* VM_PRESSURE_EVENTS */