/*
 * Copyright (c) 2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/affinity.h>
#include <kern/task.h>
#include <kern/kalloc.h>
#include <machine/cpu_affinity.h>

/*
 * Affinity involves 2 objects:
 * - affinity namespace:
 *	shared by a task family, this controls affinity tag lookup and
 *	allocation; it anchors all affinity sets in one namespace
 * - affinity set:
 *	anchors all threads with membership of this affinity set
 *	and which share an affinity tag in the owning namespace.
 *
 * Locking:
 * - The task lock protects the creation of an affinity namespace.
 * - The affinity namespace mutex protects the inheritance of a namespace
 *   and its thread membership. This includes its destruction when the task
 *   reference count goes to zero.
 * - The thread mutex protects a thread's affinity set membership, but in
 *   addition, the thread_lock is taken to write thread->affinity_set since this
 *   field (representing the active affinity set) is read by the scheduler.
 *
 * The lock ordering is: task lock, thread mutex, namespace mutex, thread lock.
 */
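
/*
 * Illustrative usage sketch (not part of this file): affinity tags are
 * normally applied through the Mach thread-policy interface rather than by
 * calling thread_affinity_set() directly. Assuming the standard
 * THREAD_AFFINITY_POLICY declarations from <mach/thread_policy.h>, a user
 * thread could hint that it shares data with its peers like this:
 *
 *	#include <mach/mach.h>
 *	#include <mach/thread_policy.h>
 *
 *	thread_affinity_policy_data_t policy = { .affinity_tag = 1 };
 *	thread_policy_set(mach_thread_self(), THREAD_AFFINITY_POLICY,
 *	    (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
 *
 * Threads carrying the same non-null tag in one namespace end up in the
 * same affinity set and are steered toward a common processor set.
 */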

#if AFFINITY_DEBUG
#define DBG(x...)	kprintf("DBG: " x)
#else
#define DBG(x...)
#endif

struct affinity_space {
	lck_mtx_t	aspc_lock;
	uint32_t	aspc_task_count;
	queue_head_t	aspc_affinities;
};
typedef struct affinity_space *affinity_space_t;

static affinity_space_t affinity_space_alloc(void);
static void affinity_space_free(affinity_space_t aspc);
static affinity_set_t affinity_set_alloc(void);
static void affinity_set_free(affinity_set_t aset);
static affinity_set_t affinity_set_find(affinity_space_t aspc, uint32_t tag);
static void affinity_set_place(affinity_space_t aspc, affinity_set_t aset);
static void affinity_set_add(affinity_set_t aset, thread_t thread);
static affinity_set_t affinity_set_remove(affinity_set_t aset, thread_t thread);

/*
 * The following globals may be modified by the sysctls:
 *   kern.affinity_sets_enabled - disables hinting if cleared
 *   kern.affinity_sets_mapping - controls cache distribution policy
 * See bsd/kern_sysctl.c
 *
 * Affinity sets are not used on embedded, which typically has only a
 * single pset and where last-processor affinity is more important than
 * pset affinity.
 */
#if !defined(XNU_TARGET_OS_OSX)
boolean_t affinity_sets_enabled = FALSE;
int affinity_sets_mapping = 0;
#else /* !defined(XNU_TARGET_OS_OSX) */
boolean_t affinity_sets_enabled = TRUE;
int affinity_sets_mapping = 1;
#endif /* !defined(XNU_TARGET_OS_OSX) */
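
/*
 * Illustrative sketch (assumed, not verified against the current sysctl
 * table): from user space these defaults can be inspected or overridden
 * with the sysctl names listed above, e.g.
 *
 *	sysctl kern.affinity_sets_enabled
 *	sysctl -w kern.affinity_sets_mapping=0
 */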

boolean_t
thread_affinity_is_supported(void)
{
	return ml_get_max_affinity_sets() != 0;
}


/*
 * thread_affinity_get()
 * Return the affinity tag for a thread.
 * Called with the thread mutex held.
 */
uint32_t
thread_affinity_get(thread_t thread)
{
	uint32_t tag;

	if (thread->affinity_set != NULL) {
		tag = thread->affinity_set->aset_tag;
	} else {
		tag = THREAD_AFFINITY_TAG_NULL;
	}

	return tag;
}


/*
 * thread_affinity_set()
 * Place a thread in an affinity set identified by a tag.
 * Called with thread referenced but not locked.
 */
kern_return_t
thread_affinity_set(thread_t thread, uint32_t tag)
{
	task_t task = get_threadtask(thread);
	affinity_set_t aset;
	affinity_set_t empty_aset = NULL;
	affinity_space_t aspc;
	affinity_space_t new_aspc = NULL;

	DBG("thread_affinity_set(%p,%u)\n", thread, tag);

	task_lock(task);
	aspc = task->affinity_space;
	if (aspc == NULL) {
		task_unlock(task);
		new_aspc = affinity_space_alloc();
		if (new_aspc == NULL) {
			return KERN_RESOURCE_SHORTAGE;
		}
		task_lock(task);
		if (task->affinity_space == NULL) {
			task->affinity_space = new_aspc;
			new_aspc = NULL;
		}
		aspc = task->affinity_space;
	}
	task_unlock(task);
	if (new_aspc) {
		affinity_space_free(new_aspc);
	}

	thread_mtx_lock(thread);
	if (!thread->active) {
		/* Beaten to lock and the thread is dead */
		thread_mtx_unlock(thread);
		return KERN_TERMINATED;
	}

	lck_mtx_lock(&aspc->aspc_lock);
	aset = thread->affinity_set;
	if (aset != NULL) {
		/*
		 * Remove thread from current affinity set
		 */
		DBG("thread_affinity_set(%p,%u) removing from aset %p\n",
		    thread, tag, aset);
		empty_aset = affinity_set_remove(aset, thread);
	}

	if (tag != THREAD_AFFINITY_TAG_NULL) {
		aset = affinity_set_find(aspc, tag);
		if (aset != NULL) {
			/*
			 * Add thread to existing affinity set
			 */
			DBG("thread_affinity_set(%p,%u) found aset %p\n",
			    thread, tag, aset);
		} else {
			/*
			 * Use the new affinity set, add this thread
			 * and place it in a suitable processor set.
			 */
			if (empty_aset != NULL) {
				aset = empty_aset;
				empty_aset = NULL;
			} else {
				aset = affinity_set_alloc();
				if (aset == NULL) {
					lck_mtx_unlock(&aspc->aspc_lock);
					thread_mtx_unlock(thread);
					return KERN_RESOURCE_SHORTAGE;
				}
			}
			DBG("thread_affinity_set(%p,%u) (re-)using aset %p\n",
			    thread, tag, aset);
			aset->aset_tag = tag;
			affinity_set_place(aspc, aset);
		}
		affinity_set_add(aset, thread);
	}

	lck_mtx_unlock(&aspc->aspc_lock);
	thread_mtx_unlock(thread);

	/*
	 * If we wound up not using an empty aset we created,
	 * free it here.
	 */
	if (empty_aset != NULL) {
		affinity_set_free(empty_aset);
	}

	if (thread == current_thread()) {
		thread_block(THREAD_CONTINUE_NULL);
	}

	return KERN_SUCCESS;
}

/*
 * task_affinity_create()
 * Called from task create.
 */
void
task_affinity_create(task_t parent_task, task_t child_task)
{
	affinity_space_t aspc = parent_task->affinity_space;

	DBG("task_affinity_create(%p,%p)\n", parent_task, child_task);

	assert(aspc);

	/*
	 * Bump the task reference count on the shared namespace and
	 * give it to the child.
	 */
	lck_mtx_lock(&aspc->aspc_lock);
	aspc->aspc_task_count++;
	child_task->affinity_space = aspc;
	lck_mtx_unlock(&aspc->aspc_lock);
}

/*
 * task_affinity_deallocate()
 * Called from task_deallocate() when there's a namespace to dereference.
 */
void
task_affinity_deallocate(task_t task)
{
	affinity_space_t aspc = task->affinity_space;

	DBG("task_affinity_deallocate(%p) aspc %p task_count %d\n",
	    task, aspc, aspc->aspc_task_count);

	lck_mtx_lock(&aspc->aspc_lock);
	if (--(aspc->aspc_task_count) == 0) {
		assert(queue_empty(&aspc->aspc_affinities));
		lck_mtx_unlock(&aspc->aspc_lock);
		affinity_space_free(aspc);
	} else {
		lck_mtx_unlock(&aspc->aspc_lock);
	}
}

/*
 * task_affinity_info()
 * Return affinity tag info (number, min, max) for the task.
 *
 * Conditions: task is locked.
 */
kern_return_t
task_affinity_info(
	task_t                  task,
	task_info_t             task_info_out,
	mach_msg_type_number_t  *task_info_count)
{
	affinity_set_t aset;
	affinity_space_t aspc;
	task_affinity_tag_info_t info;

	*task_info_count = TASK_AFFINITY_TAG_INFO_COUNT;
	info = (task_affinity_tag_info_t) task_info_out;
	info->set_count = 0;
	info->task_count = 0;
	info->min = THREAD_AFFINITY_TAG_NULL;
	info->max = THREAD_AFFINITY_TAG_NULL;

	aspc = task->affinity_space;
	if (aspc) {
		lck_mtx_lock(&aspc->aspc_lock);
		queue_iterate(&aspc->aspc_affinities,
		    aset, affinity_set_t, aset_affinities) {
			info->set_count++;
			if (info->min == THREAD_AFFINITY_TAG_NULL ||
			    aset->aset_tag < (uint32_t) info->min) {
				info->min = aset->aset_tag;
			}
			if (info->max == THREAD_AFFINITY_TAG_NULL ||
			    aset->aset_tag > (uint32_t) info->max) {
				info->max = aset->aset_tag;
			}
		}
		info->task_count = aspc->aspc_task_count;
		lck_mtx_unlock(&aspc->aspc_lock);
	}
	return KERN_SUCCESS;
}
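
/*
 * Illustrative sketch (assumed, not verified against the current MIG
 * definitions): the data returned above is normally retrieved with
 * task_info() using the TASK_AFFINITY_TAG_INFO flavor, e.g.
 *
 *	task_affinity_tag_info_data_t info;
 *	mach_msg_type_number_t count = TASK_AFFINITY_TAG_INFO_COUNT;
 *	kern_return_t kr = task_info(mach_task_self(),
 *	    TASK_AFFINITY_TAG_INFO, (task_info_t)&info, &count);
 *
 * On success, info.set_count, info.min and info.max describe the tags in
 * use by the task's affinity namespace.
 */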

/*
 * Called from thread_dup() during fork() with child's mutex held.
 * Set the child into the parent's affinity set.
 * Note the affinity space is shared.
 */
void
thread_affinity_dup(thread_t parent, thread_t child)
{
	affinity_set_t aset;
	affinity_space_t aspc;

	thread_mtx_lock(parent);
	aset = parent->affinity_set;
	DBG("thread_affinity_dup(%p,%p) aset %p\n", parent, child, aset);
	if (aset == NULL) {
		thread_mtx_unlock(parent);
		return;
	}

	aspc = aset->aset_space;
	assert(aspc == get_threadtask(parent)->affinity_space);
	assert(aspc == get_threadtask(child)->affinity_space);

	lck_mtx_lock(&aspc->aspc_lock);
	affinity_set_add(aset, child);
	lck_mtx_unlock(&aspc->aspc_lock);

	thread_mtx_unlock(parent);
}

/*
 * thread_affinity_terminate()
 * Remove thread from any affinity set.
 * Called with the thread mutex locked.
 */
void
thread_affinity_terminate(thread_t thread)
{
	affinity_set_t aset = thread->affinity_set;
	affinity_space_t aspc;

	DBG("thread_affinity_terminate(%p)\n", thread);

	aspc = aset->aset_space;
	lck_mtx_lock(&aspc->aspc_lock);
	if (affinity_set_remove(aset, thread)) {
		affinity_set_free(aset);
	}
	lck_mtx_unlock(&aspc->aspc_lock);
}

/*
 * thread_affinity_exec()
 * Called from execve() to cancel any current affinity - a new image implies
 * the calling thread terminates any expressed or inherited affinity.
 */
void
thread_affinity_exec(thread_t thread)
{
	if (thread->affinity_set != AFFINITY_SET_NULL) {
		thread_affinity_terminate(thread);
	}
}

/*
 * Create an empty affinity namespace data structure.
 */
static affinity_space_t
affinity_space_alloc(void)
{
	affinity_space_t aspc;

	aspc = kalloc_type(struct affinity_space, Z_WAITOK | Z_NOFAIL);

	lck_mtx_init(&aspc->aspc_lock, &task_lck_grp, &task_lck_attr);
	queue_init(&aspc->aspc_affinities);
	aspc->aspc_task_count = 1;

	DBG("affinity_space_create() returns %p\n", aspc);
	return aspc;
}

/*
 * Destroy the given empty affinity namespace data structure.
 */
static void
affinity_space_free(affinity_space_t aspc)
{
	assert(queue_empty(&aspc->aspc_affinities));

	lck_mtx_destroy(&aspc->aspc_lock, &task_lck_grp);
	DBG("affinity_space_free(%p)\n", aspc);
	kfree_type(struct affinity_space, aspc);
}


/*
 * Create an empty affinity set data structure
 * entering it into a list anchored by the owning task.
 */
static affinity_set_t
affinity_set_alloc(void)
{
	affinity_set_t aset;

	aset = kalloc_type(struct affinity_set, Z_WAITOK | Z_NOFAIL);

	aset->aset_thread_count = 0;
	queue_init(&aset->aset_affinities);
	queue_init(&aset->aset_threads);
	aset->aset_num = 0;
	aset->aset_pset = PROCESSOR_SET_NULL;
	aset->aset_space = NULL;

	DBG("affinity_set_create() returns %p\n", aset);
	return aset;
}

/*
 * Destroy the given empty affinity set data structure
 * after removing it from the parent task.
 */
static void
affinity_set_free(affinity_set_t aset)
{
	assert(queue_empty(&aset->aset_threads));

	DBG("affinity_set_free(%p)\n", aset);
	kfree_type(struct affinity_set, aset);
}

/*
 * Add a thread to an affinity set.
 * The caller must have the thread mutex and space locked.
 */
static void
affinity_set_add(affinity_set_t aset, thread_t thread)
{
	spl_t s;

	DBG("affinity_set_add(%p,%p)\n", aset, thread);
	queue_enter(&aset->aset_threads,
	    thread, thread_t, affinity_threads);
	aset->aset_thread_count++;
	s = splsched();
	thread_lock(thread);
	thread->affinity_set = aset;
	thread_unlock(thread);
	splx(s);
}

/*
 * Remove a thread from an affinity set returning the set if now empty.
 * The caller must have the thread mutex and space locked.
 */
static affinity_set_t
affinity_set_remove(affinity_set_t aset, thread_t thread)
{
	spl_t s;

	s = splsched();
	thread_lock(thread);
	thread->affinity_set = NULL;
	thread_unlock(thread);
	splx(s);

	aset->aset_thread_count--;
	queue_remove(&aset->aset_threads,
	    thread, thread_t, affinity_threads);
	if (queue_empty(&aset->aset_threads)) {
		queue_remove(&aset->aset_space->aspc_affinities,
		    aset, affinity_set_t, aset_affinities);
		assert(aset->aset_thread_count == 0);
		aset->aset_tag = THREAD_AFFINITY_TAG_NULL;
		aset->aset_num = 0;
		aset->aset_pset = PROCESSOR_SET_NULL;
		aset->aset_space = NULL;
		DBG("affinity_set_remove(%p,%p) set now empty\n", aset, thread);
		return aset;
	} else {
		DBG("affinity_set_remove(%p,%p)\n", aset, thread);
		return NULL;
	}
}

/*
 * Find an affinity set in the parent task with the given affinity tag.
 * The caller must have the space locked.
 */
static affinity_set_t
affinity_set_find(affinity_space_t space, uint32_t tag)
{
	affinity_set_t aset;

	queue_iterate(&space->aspc_affinities,
	    aset, affinity_set_t, aset_affinities) {
		if (aset->aset_tag == tag) {
			DBG("affinity_set_find(%p,%u) finds %p\n",
			    space, tag, aset);
			return aset;
		}
	}
	DBG("affinity_set_find(%p,%u) not found\n", space, tag);
	return NULL;
}

/*
 * affinity_set_place() assigns an affinity set to a suitable processor_set.
 * The selection criterion is the processor set currently occupied by the
 * fewest affinity sets belonging to the owning task.
 * The caller must have the space locked.
 */
static void
affinity_set_place(affinity_space_t aspc, affinity_set_t new_aset)
{
	unsigned short set_occupancy[MAX_CPUS] = { 0 };
	unsigned num_cpu_asets = ml_get_max_affinity_sets();
	unsigned i_least_occupied;
	affinity_set_t aset;

	if (__improbable(num_cpu_asets > MAX_CPUS)) {
		// If this triggers then the array needs to be made bigger.
		panic("num_cpu_asets = %d > %d too big in %s", num_cpu_asets, MAX_CPUS, __FUNCTION__);
	}

	/*
	 * Scan the affinity sets, counting how many of them occupy
	 * each of the available physical affinities.
	 */
	queue_iterate(&aspc->aspc_affinities,
	    aset, affinity_set_t, aset_affinities) {
		if (aset->aset_num < num_cpu_asets) {
			set_occupancy[aset->aset_num]++;
		} else {
			panic("aset_num = %d in %s", aset->aset_num, __FUNCTION__);
		}
	}

	/*
	 * Find the least occupied set (or the first empty set).
	 * To distribute placements somewhat, start searching from
	 * a cpu affinity chosen randomly per namespace:
	 *   [(unsigned int)aspc % 127] % num_cpu_asets
	 * unless this mapping policy is overridden.
	 */
	if (affinity_sets_mapping == 0) {
		i_least_occupied = 0;
	} else {
		i_least_occupied = (unsigned int)(((uintptr_t)aspc % 127) % num_cpu_asets);
	}
	for (unsigned i = 0; i < num_cpu_asets; i++) {
		unsigned int j = (i_least_occupied + i) % num_cpu_asets;
		if (set_occupancy[j] == 0) {
			i_least_occupied = j;
			break;
		}
		if (set_occupancy[j] < set_occupancy[i_least_occupied]) {
			i_least_occupied = j;
		}
	}
	new_aset->aset_num = i_least_occupied;
	new_aset->aset_pset = ml_affinity_to_pset(i_least_occupied);

	/* Add the new affinity set to the group */
	new_aset->aset_space = aspc;
	queue_enter(&aspc->aspc_affinities,
	    new_aset, affinity_set_t, aset_affinities);

	DBG("affinity_set_place(%p,%p) selected affinity %u pset %p\n",
	    aspc, new_aset, new_aset->aset_num, new_aset->aset_pset);
}
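
/*
 * Worked example of the placement above (illustrative, assuming two
 * cache-level affinity sets and the default affinity_sets_mapping of 1):
 * if the namespace pointer hashes to a start index of 1, the first tag
 * placed lands on affinity 1, the second finds affinity 0 empty and lands
 * there, and subsequent tags go to whichever affinity currently holds
 * fewer of this namespace's sets.
 */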