/*
 * Copyright (c) 2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifdef __x86_64__
#error This file is only needed on weakly-ordered systems!
#endif

#include <machine/atomic.h>
#include <machine/commpage.h>
#include <machine/machine_cpu.h>

#include <kern/sched_prim.h>
#include <kern/processor.h>
#include <kern/ast.h>

#include <kern/cpu_quiesce.h>
/*
 * CPU quiescing generation counter implemented with a checkin mask
 *
 * A tri-state bitfield, with 2 bits for each processor:
 * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed an acq_rel barrier
 * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
 *
 * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
 * indicates that all processors have executed the barrier, it ticks the algorithm
 * and resets the state.
 *
 * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
 * However, they can't do anything in userspace while idle, so we don't need
 * them to execute barriers, so we have them 'leave' the counter so that
 * they don't delay the tick while idle.
 *
 * This bitfield currently limits MAX_CPUS to 32 on LP64.
 * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
 *
 * The mask only guarantees ordering to code running in userspace.
 * We defer joining the counter until we actually reach userspace, allowing
 * processors that come out of idle and only run kernel code to avoid the overhead
 * of participation.
 *
 * We additionally defer updating the counter for a minimum interval to
 * reduce the frequency of executing the exclusive atomic operations.
 *
 * The longest delay between two checkins, assuming that at least one processor
 * joins, is <checkin delay> + (<thread quantum> * 2).
 */
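
/*
 * Illustrative sketch only (a hypothetical 3-CPU layout, not a constant
 * used by this file): the state packs each CPU's pair of bits side by side,
 *
 *   bit:   5    4    3    2    1    0
 *          E2   C2   E1   C1   E0   C0
 *
 * so CPU_CHECKIN_MASK selects the even 'checkin' bits (...0101) and
 * CPU_EXPECTED_MASK selects the odd 'expected' bits (...1010). If CPUs 0
 * and 2 are running and CPU 1 has left, the state 0b110011 means both
 * running CPUs have checked in: shifting the expected bits right by one
 * yields exactly the checkin bits, so whichever CPU set the final checkin
 * bit commits the tick and resets the state to 0b100010.
 */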

typedef unsigned long checkin_mask_t;

static _Atomic checkin_mask_t cpu_quiescing_checkin_state;

static uint64_t cpu_checkin_last_commit;

#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
static uint64_t cpu_checkin_min_interval;
uint32_t cpu_checkin_min_interval_us;

#if __LP64__
static_assert(MAX_CPUS <= 32);
#define CPU_CHECKIN_MASK        0x5555555555555555UL
#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
#else
/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
static_assert(MAX_CPUS <= 16);
#define CPU_CHECKIN_MASK        0x55555555UL
#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
#endif

static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);

static inline checkin_mask_t
cpu_checked_in_bit(int cpuid)
{
	return 1UL << (2 * cpuid);
}

static inline checkin_mask_t
cpu_expected_bit(int cpuid)
{
	return 1UL << (2 * cpuid + 1);
}
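
/*
 * For example (illustration only), cpuid 2 maps to checked-in bit
 * (1UL << 4) and expected bit (1UL << 5).
 */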

void
cpu_quiescent_counter_init(void)
{
	/* Sanity-check the bit layout for the highest valid cpuid */
	assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS - 1));
	assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS - 1));
	assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS - 1)) == 0);
	assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS - 1)) == 0);

	cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
}

void
cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
{
	/* clamp to something vaguely sane */
	if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
		new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
	}

	cpu_checkin_min_interval_us = new_value_us;

	uint64_t abstime = 0;
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
	cpu_checkin_min_interval = abstime;
}


/*
 * Called when all running CPUs have checked in.
 *
 * The commpage increment is protected by the 'lock' of having caused the tick,
 * and it is published by the state reset release barrier.
 */
static void
cpu_quiescent_counter_commit(uint64_t ctime)
{
	__kdebug_only uint64_t old_gen;
	__kdebug_only checkin_mask_t old_state;

	old_gen = commpage_increment_cpu_quiescent_counter();

	cpu_checkin_last_commit = ctime;

	old_state = os_atomic_and(&cpu_quiescing_checkin_state, ~CPU_CHECKIN_MASK, release);

	KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
}

/*
 * Have all the expected CPUs checked in?
 */
static bool
cpu_quiescent_counter_needs_commit(checkin_mask_t state)
{
	return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
}
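
/*
 * Worked example (hypothetical state, for illustration only): with CPUs 0
 * and 1 expected and both checked in, state == 0b1111. Then
 * (state & CPU_CHECKIN_MASK) == 0b0101 and
 * ((state & CPU_EXPECTED_MASK) >> 1) == (0b1010 >> 1) == 0b0101, so the
 * comparison holds and the caller that set the last checkin bit commits.
 */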

/*
 * Called when a processor wants to start participating in the counter, e.g.
 * 1) when context switching away from the idle thread
 * 2) when coming up for the first time
 * 3) when coming up after a shutdown
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_join(__unused uint64_t ctime)
{
	processor_t processor = current_processor();
	__assert_only int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_NONE ||
	    processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_LEFT);

	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_PENDING_JOIN;

	/*
	 * Mark the processor to call cpu_quiescent_counter_ast before it
	 * ever returns to userspace.
	 */
	ast_on(AST_UNQUIESCE);
}

/*
 * Called with interrupts disabled from the userspace boundary, at the
 * AST_UNQUIESCE callback. It needs to acquire the counter state so that
 * this processor observes the data and counter values already published
 * by other CPUs.
 */
void
cpu_quiescent_counter_ast(void)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We had better not already be joined. */
	assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
	    (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);

	/*
	 * No release barrier needed because we have no prior state to publish.
	 * Acquire barrier needed because we need this processor to see
	 * the latest counter value.
	 *
	 * The state may be in 'needs checkin' both before and after
	 * this atomic OR.
	 *
	 * Additionally, if this is the first processor to come out of idle,
	 * it may need to kickstart the algorithm, otherwise it would
	 * stay in 'needs commit' perpetually with no processor assigned to
	 * actually do the commit. To do that, the first processor only adds
	 * its expected bit.
	 */

	processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_JOINED;
	processor->cpu_quiesce_last_checkin = mach_absolute_time();

	checkin_mask_t old_mask, new_mask;
	os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
		if (old_mask == 0) {
			new_mask = old_mask | cpu_expected_bit(cpuid);
		} else {
			new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
		}
	});
}
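
/*
 * Illustrative trace of the kickstart above (hypothetical cpuid): if CPU 3
 * is the first processor to come out of idle, it finds old_mask == 0 and
 * publishes only cpu_expected_bit(3). Its next checkin sets
 * cpu_checked_in_bit(3), making the checkin bits match the expected bits,
 * and that checkin performs the commit. Had it set both bits here instead,
 * cpu_quiescent_counter_needs_commit() would already be true with no
 * processor left responsible for committing.
 */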

/*
 * Called when a processor no longer wants to participate in the counter,
 * i.e. when a processor is on its way to idle or shutdown.
 *
 * Called with interrupts disabled.
 *
 * The processor needs to remove itself from the expected mask, to allow the
 * algorithm to continue ticking without its participation.
 * However, it needs to ensure that anything it has done since the last time
 * it checked in has been published before the next tick is allowed to commit.
 */
void
cpu_quiescent_counter_leave(uint64_t ctime)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED ||
	    processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);

	/* We no longer need the cpu_quiescent_counter_ast callback to be armed */
	ast_off(AST_UNQUIESCE);

	if (processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
		/* We never actually joined, so we don't have to do the work to leave. */
		processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;
		return;
	}

	/* Leaving can't be deferred, even if we're within the min interval */
	processor->cpu_quiesce_last_checkin = ctime;

	checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);

	checkin_mask_t orig_state = os_atomic_and_orig(&cpu_quiescing_checkin_state,
	    ~mask, acq_rel);

	assert((orig_state & cpu_expected_bit(cpuid)));

	processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;

	if (cpu_quiescent_counter_needs_commit(orig_state)) {
		/*
		 * The old state indicates someone else was already doing a commit
		 * but hadn't finished yet. We successfully inserted the acq_rel
		 * before they finished the commit by resetting the bitfield,
		 * so we're done here.
		 */
		return;
	}

	checkin_mask_t new_state = orig_state & ~mask;

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		cpu_quiescent_counter_commit(ctime);
	}
}
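
/*
 * Worked example for the leave path above (hypothetical state): CPUs 0 and
 * 1 are joined and only CPU 0 has checked in, so orig_state == 0b1011,
 * which does not yet need a commit. When CPU 1 leaves, it clears its two
 * bits (0b1100), leaving new_state == 0b0011. The remaining expected bit
 * shifted right (0b0010 >> 1) now equals the checkin bit 0b0001, so the
 * departing CPU 1 performs the commit on CPU 0's behalf rather than
 * leaving the round stuck.
 */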

/*
 * Called when a processor wants to check in to the counter.
 * If it hasn't yet fully joined, it doesn't need to check in.
 *
 * Called with interrupts disabled.
 */
void
cpu_quiescent_counter_checkin(uint64_t ctime)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_NONE);

	/* If we're not joined yet, we don't need to check in */
	if (__probable(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_JOINED)) {
		return;
	}

	/* If we've checked in recently, we don't need to check in yet. */
	if (__probable((ctime - processor->cpu_quiesce_last_checkin) <= cpu_checkin_min_interval)) {
		return;
	}

	processor->cpu_quiesce_last_checkin = ctime;

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);

	assert((state & cpu_expected_bit(cpuid)));

	if (__probable((state & cpu_checked_in_bit(cpuid)))) {
		/*
		 * Processor has already checked in for this round, no need to
		 * acquire the cacheline exclusive.
		 */
		return;
	}

	checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
	    cpu_checked_in_bit(cpuid), acq_rel);

	checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);

	if (cpu_quiescent_counter_needs_commit(new_state)) {
		assertf(!cpu_quiescent_counter_needs_commit(orig_state),
		    "old: 0x%lx, new: 0x%lx", orig_state, new_state);
		cpu_quiescent_counter_commit(ctime);
	}
}

#if MACH_ASSERT
/*
 * Called on all AST exits to userspace to assert this processor actually joined
 *
 * Called with interrupts disabled after the AST should have been handled
 */
void
cpu_quiescent_counter_assert_ast(void)
{
	processor_t processor = current_processor();
	int cpuid = processor->cpu_id;

	assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED);

	checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
	assert((state & cpu_expected_bit(cpuid)));
}
#endif /* MACH_ASSERT */