/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <mach/mach_types.h>
#include <mach/machine.h>
#include <machine/machine_routines.h>
#include <machine/sched_param.h>
#include <machine/machine_cpu.h>
#include <kern/kern_types.h>
#include <kern/debug.h>
#include <kern/machine.h>
#include <kern/misc_protos.h>
#include <kern/processor.h>
#include <kern/queue.h>
#include <kern/sched.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <machine/atomic.h>
#include <sys/kdebug.h>
#include <kern/sched_amp_common.h>
#include <stdatomic.h>

#if __AMP__

/* Exported globals */
processor_set_t ecore_set = NULL;
processor_set_t pcore_set = NULL;

/*
 * sched_amp_init()
 *
 * Initialize the pcore_set and ecore_set globals which describe the
 * P/E processor sets.
 */
void
sched_amp_init(void)
{
	sched_timeshare_init();
}

/* Spill threshold load average is ncpus in pset + (sched_amp_spill_count / (1 << PSET_LOAD_FRACTIONAL_SHIFT)) */
int sched_amp_spill_count = 3;
int sched_amp_idle_steal = 1;
int sched_amp_spill_steal = 1;

/*
 * We see performance gains from sending immediate IPIs to P-cores to run
 * P-eligible threads, and fewer P-to-E migrations when deferred IPIs are
 * used for spill.
 */
int sched_amp_spill_deferred_ipi = 1;
int sched_amp_pcores_preempt_immediate_ipi = 1;

/*
 * sched_perfcontrol_inherit_recommendation_from_tg changes the AMP
 * scheduling policy away from the default and allows the policy to be
 * modified at run-time.
 *
 * Once modified from the default, the policy toggles between "follow
 * thread group" and "restrict to E-cores".
 */

_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_util = SCHED_PERFCTL_POLICY_DEFAULT;
_Atomic sched_perfctl_class_policy_t sched_perfctl_policy_bg = SCHED_PERFCTL_POLICY_DEFAULT;

/*
 * sched_amp_spill_threshold()
 *
 * Routine to calculate the spill threshold, which decides whether the cluster should spill.
 */
int
sched_amp_spill_threshold(processor_set_t pset)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

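	/*
	 * Both the pset load average and this threshold are fixed-point values
	 * with PSET_LOAD_FRACTIONAL_SHIFT fractional bits.  As an illustrative
	 * sketch (assuming, purely for the example, a shift of 4): a fully
	 * recommended 4-CPU cluster with the default sched_amp_spill_count of 3
	 * yields (4 << 4) + 3 = 67, i.e. a load average of just over 4 runnable
	 * threads before the cluster is considered for spill.
	 */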
	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + sched_amp_spill_count;
}

/*
 * pset_signal_spill()
 *
 * Routine to signal a running/idle CPU to cause a spill onto that CPU.
 * Called with pset locked, returns unlocked
 */
void
pset_signal_spill(processor_set_t pset, int spilled_thread_priority)
{
	processor_t processor;
	sched_ipi_type_t ipi_type = SCHED_IPI_NONE;

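	/*
	 * First pass: prefer an idle recommended CPU.  Mark it spill-pending and
	 * either dispatch it directly (if it is the current processor) or pick an
	 * IPI to wake it, then drop the pset lock and deliver the IPI.
	 */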
	uint64_t idle_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE];
	for (int cpuid = lsb_first(idle_map); cpuid >= 0; cpuid = lsb_next(idle_map, cpuid)) {
		processor = processor_array[cpuid];
		if (bit_set_if_clear(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 0, 0, 0);

			processor->deadline = UINT64_MAX;

			if (processor == current_processor()) {
				pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
				if (bit_set_if_clear(pset->pending_AST_URGENT_cpu_mask, processor->cpu_id)) {
					KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PENDING_AST_URGENT) | DBG_FUNC_START,
					    processor->cpu_id, pset->pending_AST_URGENT_cpu_mask, 0, 6);
				}
			} else {
				ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
			}
			pset_unlock(pset);
			sched_ipi_perform(processor, ipi_type);
			return;
		}
	}

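	/*
	 * Second pass: no idle CPU took the spill, so look for a running CPU that
	 * is not already spill-pending and is running a lower-priority thread
	 * that is not recommended for the P-cluster.
	 */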
	processor_t ast_processor = NULL;
	ast_t preempt = AST_NONE;
	uint64_t running_map = pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor = processor_array[cpuid];
		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* Already running a spilled P-core recommended thread */
			continue;
		}
		if (bit_test(pset->pending_spill_cpu_mask, processor->cpu_id)) {
			/* Already received a spill signal */
			continue;
		}
		if (processor->current_pri >= spilled_thread_priority) {
			/* Already running a higher or equal priority thread */
			continue;
		}

		/* Found a suitable processor */
		bit_set(pset->pending_spill_cpu_mask, processor->cpu_id);
		KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_SIGNAL_SPILL) | DBG_FUNC_NONE, processor->cpu_id, 1, 0, 0);
		if (processor == current_processor()) {
			preempt = AST_PREEMPT;
		}
		ipi_type = sched_ipi_action(processor, NULL, SCHED_IPI_EVENT_SPILL);
		if (ipi_type != SCHED_IPI_NONE) {
			ast_processor = processor;
		}
		break;
	}

	pset_unlock(pset);
	sched_ipi_perform(ast_processor, ipi_type);

	if (preempt != AST_NONE) {
		ast_t new_preempt = update_pending_nonurgent_preemption(processor, preempt);
		ast_on(new_preempt);
	}
}

/*
 * pset_should_accept_spilled_thread()
 *
 * Routine to decide if pset should accept spilled threads.
 * This function must be safe to call (to use as a hint) without holding the pset lock.
 */
bool
pset_should_accept_spilled_thread(processor_set_t pset, int spilled_thread_priority)
{
	if (!pset) {
		return false;
	}

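	/* Any idle recommended CPU can absorb the spilled thread immediately */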
	if ((pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		return true;
	}

	uint64_t cpu_map = (pset->recommended_bitmask & pset->cpu_state_map[PROCESSOR_RUNNING]);

	for (int cpuid = lsb_first(cpu_map); cpuid >= 0; cpuid = lsb_next(cpu_map, cpuid)) {
		processor_t processor = processor_array[cpuid];

		if (processor->current_recommended_pset_type == PSET_AMP_P) {
			/* This processor is already running a spilled thread */
			continue;
		}

		if (processor->current_pri < spilled_thread_priority) {
			return true;
		}
	}

	return false;
}

/*
 * should_spill_to_ecores()
 *
 * Spill policy is implemented here
 */
bool
should_spill_to_ecores(processor_set_t nset, thread_t thread)
{
	if (nset->pset_cluster_type == PSET_AMP_E) {
		/* Not relevant if ecores already preferred */
		return false;
	}

	if (!pset_is_recommended(ecore_set)) {
		/* E cores must be recommended */
		return false;
	}

	if (thread->th_bound_cluster_id == pcore_set->pset_id) {
		/* Thread bound to the P-cluster */
		return false;
	}

	if (thread->sched_pri >= BASEPRI_RTQUEUES) {
		/* Never spill realtime threads */
		return false;
	}

	if ((nset->recommended_bitmask & nset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
		/* Don't spill if the P-cluster still has idle cores */
		return false;
	}

	if ((sched_get_pset_load_average(nset, 0) >= sched_amp_spill_threshold(nset)) &&   /* P-cores are already loaded past the spill threshold */
	    pset_should_accept_spilled_thread(ecore_set, thread->sched_pri)) {             /* and an E-core is idle or running a lower priority thread */
		return true;
	}

	return false;
}

/*
 * sched_amp_check_spill()
 *
 * Routine to check if the thread should be spilled and signal the pset if needed.
 */
void
sched_amp_check_spill(processor_set_t pset, thread_t thread)
{
	/* pset is unlocked */

	/* Bound threads don't call this function */
	assert(thread->bound_processor == PROCESSOR_NULL);

	if (should_spill_to_ecores(pset, thread)) {
		pset_lock(ecore_set);

		pset_signal_spill(ecore_set, thread->sched_pri);
		/* returns with ecore_set unlocked */
	}
}

/*
 * sched_amp_steal_threshold()
 *
 * Routine to calculate the steal threshold
 */
int
sched_amp_steal_threshold(processor_set_t pset, bool spill_pending)
{
	int recommended_processor_count = bit_count(pset->recommended_bitmask & pset->cpu_bitmask);

	return (recommended_processor_count << PSET_LOAD_FRACTIONAL_SHIFT) + (spill_pending ? sched_amp_spill_steal : sched_amp_idle_steal);
}

/*
 * sched_amp_steal_thread_enabled()
 *
 * Stealing is only enabled for the E-pset, and only while a P-cluster
 * exists and has processors online.
 */
bool
sched_amp_steal_thread_enabled(processor_set_t pset)
{
	return (pset->pset_cluster_type == PSET_AMP_E) && (pcore_set != NULL) && (pcore_set->online_processor_count > 0);
}

/*
 * sched_amp_balance()
 *
 * Invoked with pset locked, returns with pset unlocked
 */
bool
sched_amp_balance(processor_t cprocessor, processor_set_t cpset)
{
	assert(cprocessor == current_processor());

	pset_unlock(cpset);

	if (!ecore_set || cpset->pset_cluster_type == PSET_AMP_E || !cprocessor->is_recommended) {
		return false;
	}

	/*
	 * cprocessor is an idle, recommended P core processor.
	 * Look for P-eligible threads that have spilled to an E core
	 * and coax them to come back.
	 */
	processor_set_t pset = ecore_set;

	pset_lock(pset);

	processor_t eprocessor;
	uint64_t ast_processor_map = 0;

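	/*
	 * Collect rebalance IPIs for E-cores that are currently running non-RT
	 * threads recommended for the P-cluster.
	 */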
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};
	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		eprocessor = processor_array[cpuid];
		if ((eprocessor->current_pri < BASEPRI_RTQUEUES) &&
		    (eprocessor->current_recommended_pset_type == PSET_AMP_P)) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
				assert(eprocessor != cprocessor);
			}
		}
	}

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	/* The core should do a lightweight idle (WFE) if it just sent out rebalance IPIs */
	return ast_processor_map != 0;
}

/*
 * Helper function for sched_amp_thread_group_recommendation_change().
 * Find all the cores in the pset running threads from the thread_group tg
 * and send them a rebalance interrupt.
 */
void
sched_amp_bounce_thread_group_from_ecores(processor_set_t pset, struct thread_group *tg)
{
	if (!pset) {
		return;
	}

	assert(pset->pset_cluster_type == PSET_AMP_E);
	uint64_t ast_processor_map = 0;
	sched_ipi_type_t ipi_type[MAX_CPUS] = {SCHED_IPI_NONE};

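	/* Raise to splsched and take the pset lock so per-CPU state stays stable while scanning */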
	spl_t s = splsched();
	pset_lock(pset);

	uint64_t running_map = pset->cpu_state_map[PROCESSOR_RUNNING];
	for (int cpuid = lsb_first(running_map); cpuid >= 0; cpuid = lsb_next(running_map, cpuid)) {
		processor_t eprocessor = processor_array[cpuid];
		if (eprocessor->current_thread_group == tg) {
			ipi_type[eprocessor->cpu_id] = sched_ipi_action(eprocessor, NULL, SCHED_IPI_EVENT_REBALANCE);
			if (ipi_type[eprocessor->cpu_id] != SCHED_IPI_NONE) {
				bit_set(ast_processor_map, eprocessor->cpu_id);
			} else if (eprocessor == current_processor()) {
				ast_on(AST_PREEMPT);
				bit_set(pset->pending_AST_PREEMPT_cpu_mask, eprocessor->cpu_id);
			}
		}
	}

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_AMP_RECOMMENDATION_CHANGE) | DBG_FUNC_NONE, tg, ast_processor_map, 0, 0);

	pset_unlock(pset);

	for (int cpuid = lsb_first(ast_processor_map); cpuid >= 0; cpuid = lsb_next(ast_processor_map, cpuid)) {
		processor_t ast_processor = processor_array[cpuid];
		sched_ipi_perform(ast_processor, ipi_type[cpuid]);
	}

	splx(s);
}

/*
 * sched_amp_ipi_policy()
 */
sched_ipi_type_t
sched_amp_ipi_policy(processor_t dst, thread_t thread, boolean_t dst_idle, sched_ipi_event_t event)
{
	processor_set_t pset = dst->processor_set;
	assert(dst != current_processor());

	boolean_t deferred_ipi_supported = false;
#if defined(CONFIG_SCHED_DEFERRED_AST)
	deferred_ipi_supported = true;
#endif /* CONFIG_SCHED_DEFERRED_AST */

	switch (event) {
	case SCHED_IPI_EVENT_SPILL:
		/* For spill events, use deferred IPIs if sched_amp_spill_deferred_ipi is set */
		if (deferred_ipi_supported && sched_amp_spill_deferred_ipi) {
			return sched_ipi_deferred_policy(pset, dst, thread, event);
		}
		break;
	case SCHED_IPI_EVENT_PREEMPT:
		/*
		 * For preemption, the default policy is to use deferred IPIs
		 * for non-RT P-core preemption. Override that behavior if
		 * sched_amp_pcores_preempt_immediate_ipi is set.
		 */
		if (thread && thread->sched_pri < BASEPRI_RTQUEUES) {
			if (sched_amp_pcores_preempt_immediate_ipi && (pset == pcore_set)) {
				return dst_idle ? SCHED_IPI_IDLE : SCHED_IPI_IMMEDIATE;
			}
		}
		break;
	default:
		break;
	}
	/* Default back to the global policy for all other scenarios */
	return sched_ipi_policy(dst, thread, dst_idle, event);
}

/*
 * sched_amp_qos_max_parallelism()
 */
uint32_t
sched_amp_qos_max_parallelism(int qos, uint64_t options)
{
	uint32_t ecount = ecore_set ? ecore_set->cpu_set_count : 0;
	uint32_t pcount = pcore_set ? pcore_set->cpu_set_count : 0;

	/*
	 * The AMP scheduler does not support more than 1 of each type of cluster
	 * but the P-cluster is optional (e.g. watchOS)
	 */
	uint32_t ecluster_count = ecount ? 1 : 0;
	uint32_t pcluster_count = pcount ? 1 : 0;

	if (options & QOS_PARALLELISM_REALTIME) {
		/*
		 * For realtime threads on AMP, limit the width to just the
		 * P-cores, since we do not spill/rebalance RT threads.
		 */
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? pcluster_count : pcount;
	}

	/*
	 * The default AMP scheduler policy is to run utility and bg
	 * threads on E-cores only. Run-time policy adjustment unlocks the
	 * ability of utility and bg threads to be scheduled based on
	 * run-time conditions.
	 */
	switch (qos) {
	case THREAD_QOS_UTILITY:
		if (os_atomic_load(&sched_perfctl_policy_util, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	case THREAD_QOS_BACKGROUND:
	case THREAD_QOS_MAINTENANCE:
		if (os_atomic_load(&sched_perfctl_policy_bg, relaxed) == SCHED_PERFCTL_POLICY_DEFAULT) {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? ecluster_count : ecount;
		} else {
			return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
		}
	default:
		return (options & QOS_PARALLELISM_CLUSTER_SHARED_RESOURCE) ? (ecluster_count + pcluster_count) : (ecount + pcount);
	}
}

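/*
 * sched_amp_choose_node()
 *
 * Return the pset node matching the thread's recommended cluster type,
 * falling back to pset_node0 if that node is absent or has no psets.
 */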
pset_node_t
sched_amp_choose_node(thread_t thread)
{
	pset_node_t node = (recommended_pset_type(thread) == PSET_AMP_P) ? pcore_node : ecore_node;
	return ((node != NULL) && (node->pset_map != 0)) ? node : &pset_node0;
}

#endif /* __AMP__ */