sched_average.c source code [xnu/osfmk/kern/sched_average.c]

1	/*
2	* Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* @OSF_COPYRIGHT@
30	*/
31	/*
32	* Mach Operating System
33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34	* All Rights Reserved.
35	*
36	* Permission to use, copy, modify and distribute this software and its
37	* documentation is hereby granted, provided that both the copyright
38	* notice and this permission notice appear in all copies of the
39	* software, derivative works or modified versions, and any portions
40	* thereof, and that both notices appear in supporting documentation.
41	*
42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45	*
46	* Carnegie Mellon requests users of this software to return to
47	*
48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49	* School of Computer Science
50	* Carnegie Mellon University
51	* Pittsburgh PA 15213-3890
52	*
53	* any improvements or extensions that they make and grant Carnegie Mellon
54	* the rights to redistribute these changes.
55	*/
56	/*
57	*/
58	/*
59	* Author: Avadis Tevanian, Jr.
60	* Date: 1986
61	*
62	* Compute various averages.
63	*/
64
65	#include <mach/mach_types.h>
66
67	#include <kern/sched.h>
68	#include <kern/assert.h>
69	#include <kern/processor.h>
70	#include <kern/thread.h>
71	#if CONFIG_TELEMETRY
72	#include <kern/telemetry.h>
73	#endif
74	#include <kern/zalloc_internal.h>
75
76	#include <sys/kdebug.h>
77
78	uint32_t avenrun[`3`] = {`0`, `0`, `0`};
79	uint32_t mach_factor[`3`] = {`0`, `0`, `0`};
80
81	uint32_t sched_load_average, sched_mach_factor;
82
83	#if defined(CONFIG_SCHED_TIMESHARE_CORE)
84	/*
85	* Values are scaled by LOAD_SCALE, defined in processor_info.h
86	*/
87	#define base(n) ((n) << SCHED_TICK_SHIFT)
88	#define frac(n) (((base(n) - 1) * LOAD_SCALE) / base(n))
89
90	static uint32_t fract[`3`] = {
91	frac(`5`), / 5 second average /
92	frac(`30`), / 30 second average /
93	frac(`60`), / 1 minute average /
94	};
95
96	#undef base
97	#undef frac
98
99	#endif /* CONFIG_SCHED_TIMESHARE_CORE */
100
101	static unsigned int sched_nrun;
102
103	typedef void (*sched_avg_comp_t)(
104	void *param);
105
106	static struct sched_average {
107	sched_avg_comp_t comp;
108	void *param;
109	int period; / in seconds /
110	uint64_t deadline;
111	} sched_average[] = {
112	{ compute_averunnable, &sched_nrun, `5`, `0` },
113	{ .comp: compute_stack_target, NULL, .period: `5`, .deadline: `1` },
114	{ .comp: compute_pageout_gc_throttle, NULL, .period: `1`, .deadline: `0` },
115	{ .comp: compute_pmap_gc_throttle, NULL, .period: `60`, .deadline: `0` },
116	{ .comp: compute_zone_working_set_size, NULL, ZONE_WSS_UPDATE_PERIOD, .deadline: `0` },
117	#if CONFIG_TELEMETRY
118	{ .comp: compute_telemetry, NULL, .period: `1`, .deadline: `0` },
119	#endif
120	{ NULL, NULL, .period: `0`, .deadline: `0` }
121	};
122
123	typedef struct sched_average *sched_average_t;
124
125	/*
126	* Scheduler load calculation algorithm
127	*
128	* The scheduler load values provide an estimate of the number of runnable
129	* timeshare threads in the system at various priority bands. The load
130	* ultimately affects the priority shifts applied to all threads in a band
131	* causing them to timeshare with other threads in the system. The load is
132	* maintained in buckets, with each bucket corresponding to a priority band.
133	*
134	* Each runnable thread on the system contributes its load to its priority
135	* band and to the bands above it. The contribution of a thread to the bands
136	* above it is not strictly 1:1 and is weighted based on the priority band
137	* of the thread. The rules of thread load contribution to each of its higher
138	* bands are as follows:
139	*
140	* - DF threads: Upto (2 * NCPUs) threads
141	* - UT threads: Upto NCPUs threads
142	* - BG threads: Upto 1 thread
143	*
144	* To calculate the load values, the various run buckets are sampled (every
145	* sched_load_compute_interval_abs) and the weighted contributions of the the
146	* lower bucket threads are added. The resultant value is plugged into an
147	* exponentially weighted moving average formula:
148	* new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
149	* (where, alpha < 1)
150	* The calculations for the scheduler load are done using fixpoint math with
151	* a scale factor of 16 to avoid expensive divides and floating point
152	* operations. The final load values are a smooth curve representative of
153	* the actual number of runnable threads in a priority band.
154	*/
155
156	/ Maintains the current (scaled for fixpoint) load in various buckets /
157	uint32_t sched_load[TH_BUCKET_MAX];
158
159	/*
160	* Alpha factor for the EWMA alogrithm. The current values are chosen as
161	* 6:10 ("old load":"new samples") to make sure the scheduler reacts fast
162	* enough to changing system load but does not see too many spikes from bursty
163	* activity. The current values ensure that the scheduler would converge
164	* to the latest load in 2-3 sched_load_compute_interval_abs intervals
165	* (which amounts to ~30-45ms with current values).
166	*/
167	#define SCHED_LOAD_EWMA_ALPHA_OLD 6
168	#define SCHED_LOAD_EWMA_ALPHA_NEW 10
169	#define SCHED_LOAD_EWMA_ALPHA_SHIFT 4
170	static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (`1ul` << SCHED_LOAD_EWMA_ALPHA_SHIFT));
171
172	/ For fixpoint EWMA, roundup the load to make it converge /
173	#define SCHED_LOAD_EWMA_ROUNDUP(load) (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
174
175	/ Macro to convert scaled sched load to a real load value /
176	#define SCHED_LOAD_EWMA_UNSCALE(load) (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
177
178	/*
179	* Routine to capture the latest runnable counts and update sched_load (only used for non-clutch schedulers)
180	*/
181	void
182	compute_sched_load(void)
183	{
184	/*
185	* Retrieve a snapshot of the current run counts.
186	*
187	* Why not a bcopy()? Because we need atomic word-sized reads of sched_run_buckets,
188	* not byte-by-byte copy.
189	*/
190	uint32_t ncpus = processor_avail_count;
191	uint32_t load_now[TH_BUCKET_MAX];
192
193	load_now[TH_BUCKET_RUN] = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed);
194	load_now[TH_BUCKET_FIXPRI] = os_atomic_load(&sched_run_buckets[TH_BUCKET_FIXPRI], relaxed);
195	load_now[TH_BUCKET_SHARE_FG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_FG], relaxed);
196	load_now[TH_BUCKET_SHARE_DF] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_DF], relaxed);
197	load_now[TH_BUCKET_SHARE_UT] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_UT], relaxed);
198	load_now[TH_BUCKET_SHARE_BG] = os_atomic_load(&sched_run_buckets[TH_BUCKET_SHARE_BG], relaxed);
199
200	assert(load_now[TH_BUCKET_RUN] >= `0`);
201	assert(load_now[TH_BUCKET_FIXPRI] >= `0`);
202
203	uint32_t nthreads = load_now[TH_BUCKET_RUN];
204	uint32_t nfixpri = load_now[TH_BUCKET_FIXPRI];
205
206	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
207	MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) \| DBG_FUNC_NONE,
208	load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
209	load_now[TH_BUCKET_SHARE_BG], load_now[TH_BUCKET_SHARE_UT], `0`);
210
211	/*
212	* Compute the timeshare priority conversion factor based on loading.
213	* Because our counters may be incremented and accessed
214	* concurrently with respect to each other, we may have
215	* windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
216	* is broken, so truncate values in these cases.
217	*/
218	uint32_t timeshare_threads = (nthreads - nfixpri);
219	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
220	if (load_now[i] > timeshare_threads) {
221	load_now[i] = timeshare_threads;
222	}
223	}
224
225	/*
226	* Default threads contribute up to (NCPUS * 2) of load to FG threads
227	*/
228	if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * `2`)) {
229	load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
230	} else {
231	load_now[TH_BUCKET_SHARE_FG] += (ncpus * `2`);
232	}
233
234	/*
235	* Utility threads contribute up to NCPUS of load to FG & DF threads
236	*/
237	if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
238	load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
239	load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
240	} else {
241	load_now[TH_BUCKET_SHARE_FG] += ncpus;
242	load_now[TH_BUCKET_SHARE_DF] += ncpus;
243	}
244
245	/*
246	* BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
247	*/
248	if (load_now[TH_BUCKET_SHARE_BG] > `0`) {
249	load_now[TH_BUCKET_SHARE_FG] += `1`;
250	load_now[TH_BUCKET_SHARE_DF] += `1`;
251	load_now[TH_BUCKET_SHARE_UT] += `1`;
252	}
253
254	/*
255	* The conversion factor consists of two components:
256	* a fixed value based on the absolute time unit (sched_fixed_shift),
257	* and a dynamic portion based on load (sched_load_shifts).
258	*
259	* Zero load results in a out of range shift count.
260	*/
261
262	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
263	uint32_t bucket_load = `0`;
264
265	if (load_now[i] > ncpus) {
266	/ Normalize the load to number of CPUs /
267	if (ncpus > `1`) {
268	bucket_load = load_now[i] / ncpus;
269	} else {
270	bucket_load = load_now[i];
271	}
272
273	if (bucket_load > MAX_LOAD) {
274	bucket_load = MAX_LOAD;
275	}
276	}
277	/ Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load /
278	sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
279	sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
280	}
281
282	KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
283	MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) \| DBG_FUNC_NONE,
284	SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
285	SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), `0`);
286	}
287
288	void
289	compute_averages(uint64_t stdelta)
290	{
291	uint32_t nthreads = os_atomic_load(&sched_run_buckets[TH_BUCKET_RUN], relaxed) - `1`;
292	uint32_t ncpus = processor_avail_count;
293
294	/ Update the global pri_shifts based on the latest values /
295	for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG; i++) {
296	uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
297	uint32_t shift = sched_fixed_shift - sched_load_shifts[bucket_load];
298
299	if (shift > SCHED_PRI_SHIFT_MAX) {
300	sched_pri_shifts[i] = INT8_MAX;
301	} else {
302	sched_pri_shifts[i] = shift;
303	}
304	}
305
306	/*
307	* Sample total running threads for the load average calculation.
308	*/
309	sched_nrun = nthreads;
310
311	/*
312	* Load average and mach factor calculations for
313	* those which ask about these things.
314	*/
315	uint32_t average_now = nthreads * LOAD_SCALE;
316	uint32_t factor_now;
317
318	if (nthreads > ncpus) {
319	factor_now = (ncpus * LOAD_SCALE) / (nthreads + `1`);
320	} else {
321	factor_now = (ncpus - nthreads) * LOAD_SCALE;
322	}
323
324	/*
325	* For those statistics that formerly relied on being recomputed
326	* on timer ticks, advance by the approximate number of corresponding
327	* elapsed intervals, thus compensating for potential idle intervals.
328	*/
329	for (uint32_t index = `0`; index < stdelta; index++) {
330	sched_mach_factor = ((sched_mach_factor << `2`) + factor_now) / `5`;
331	sched_load_average = ((sched_load_average << `2`) + average_now) / `5`;
332	}
333
334	/*
335	* Compute old-style Mach load averages.
336	*/
337	for (uint32_t index = `0`; index < stdelta; index++) {
338	for (uint32_t i = `0`; i < `3`; i++) {
339	mach_factor[i] = ((mach_factor[i] * fract[i]) +
340	(factor_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
341
342	avenrun[i] = ((avenrun[i] * fract[i]) +
343	(average_now * (LOAD_SCALE - fract[i]))) / LOAD_SCALE;
344	}
345	}
346
347	/*
348	* Compute averages in other components.
349	*/
350	uint64_t abstime = mach_absolute_time();
351
352	for (sched_average_t avg = sched_average; avg->comp != NULL; ++avg) {
353	if (abstime >= avg->deadline) {
354	uint64_t period_abs = (avg->period * sched_one_second_interval);
355	uint64_t ninvokes = `1`;
356
357	ninvokes += (abstime - avg->deadline) / period_abs;
358	ninvokes = MIN(ninvokes, SCHED_TICK_MAX_DELTA);
359
360	for (uint32_t index = `0`; index < ninvokes; index++) {
361	(*avg->comp)(avg->param);
362	}
363	avg->deadline = abstime + period_abs;
364	}
365	}
366	}
367

Browse the source code of xnu/osfmk/kern/sched_average.c