1/*
2 * Copyright (c) 2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <kern/locks_internal.h>
30#include <kern/cpu_data.h>
31#include <kern/machine.h>
32#include <kern/mpsc_queue.h>
33#include <kern/percpu.h>
34#include <kern/sched.h>
35#include <kern/smr.h>
36#include <kern/smr_hash.h>
37#include <kern/thread.h>
38#include <kern/zalloc.h>
39#include <machine/commpage.h>
40#include <os/hash.h>
41
42
43#pragma mark - SMR domains
44
45/*
46 * This SMR scheme is directly derived from FreeBSD's "Global Unbounded Sequences".
47 *
48 * Major differences are:
49 *
50 * - only eager clocks are implemented (no lazy, no implicit)
51 *
52 *
53 * SMR clocks have 3 state machines interacting at any given time:
54 *
55 * 1. reader critical sections
56 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
57 *
58 * Each CPU can disable preemption and do this sequence:
59 *
60 * CPU::c_rd_seq = GLOBAL::c_wr_seq;
61 *
62 * < unfortunate place to receive a long IRQ > [I]
63 *
64 * os_atomic_thread_fence(seq_cst); [R1]
65 *
66 * {
67 * // critical section
68 * }
69 *
70 * os_atomic_store(&CPU::c_rd_seq, INVALID, release); [R2]
71 *
72 *
73 *
74 * 2. writer sequence advances
75 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~
76 *
77 * Each writer can increment the global write sequence
78 * at any given time:
79 *
80 * os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release); [W]
81 *
82 *
83 *
84 * 3. synchronization sequence: poll/wait/scan
85 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
86 *
87 * This state machine synchronizes with the other two in order to decide
88 * if a given "goal" is in the past. Only the cases where the call
89 * is successful are interesting for barrier purposes, and we will focus
90 * on cases that do not take an early return for failures.
91 *
92 * a. __smr_poll:
93 *
94 * rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire); [S1]
95 * if (goal <= rd_seq) SUCCESS.
96 * wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
97 *
98 * b. __smr_scan
99 *
100 * os_atomic_thread_fence(seq_cst) [S2]
101 *
102 * observe the minimum CPU::c_rd_seq "min_rd_seq"
103 * value possible, or wr_seq if no CPU was in a critical section.
104 * (possibly spinning until it satisfies "goal")
105 *
106 * c. __smr_rd_advance
107 *
108 * cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq);
109 * os_atomic_thread_fence(seq_cst); [S3]
110 * if (min_rd_seq > cur_rd_seq) {
111 * store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
112 * }
113 *
114 *
115 * One sentence summary
116 * ~~~~~~~~~~~~~~~~~~~~
117 *
118 * A simplistic one-sentence summary of the algorithm is that __smr_scan()
119 * works really hard to insert itself in the timeline of write sequences and
120 * observe a reasonable bound for the first safe-to-reclaim sequence, and
121 * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]):
122 *
123 * GLOBAL::c_rd_seq GLOBAL::c_wr_seq
124 * v v
125 * ──────────────────────┬────────────────┬─────────────────────
126 * ... safe to reclaim │ deferred │ future ...
127 * ──────────────────────┴────────────────┴─────────────────────
128 *
129 *
130 * Detailed explanation
131 * ~~~~~~~~~~~~~~~~~~~~
132 *
133 * [W] -> [R1] establishes a "happens before" relationship between a given
134 * writer and this critical section. The loaded GLOBAL::c_wr_seq might
135 * however be stale with respect to the one [R1] really synchronizes with
136 * (see [I] explanation below).
137 *
138 *
139 * [R1] -> [S2] establishes a "happens before" relationship between all the
140 * active critical sections and the scanner.
141 * It lets us compute the oldest possible sequence pinned by an active
142 * critical section.
143 *
144 *
145 * [R2] -> [S3] establishes a "happens before" relationship between all the
146 * inactive critical sections and the scanner.
147 *
148 *
149 * [S3] -> [S1] is the typical expected fastpath: when the caller can decide
150 * that its goal is older than the last update an __smr_rd_advance() did.
151 * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other
152 * and one of them finishes last but observed a "worse" read sequence.
153 *
154 *
155 * [W], [S3] -> [S1] is the last crucial property: all updates to the global
156 * clock are totally ordered because they update the entire 128-bit state
157 * every time with an RMW. This guarantees that __smr_poll() can't load
158 * an `rd_seq` that is younger than the `wr_seq` it loads next.
159 *
160 *
161 * [I] __smr_enter() can also unfortunately be delayed after observing
162 * a given write sequence and right before issuing [R1] (the [I] window).
163 *
164 * However, for the read sequence to have moved past what __smr_enter() observed,
165 * it means another __smr_scan() didn't observe the store to CPU::c_rd_seq
166 * made by __smr_enter() and thought the section was inactive.
167 *
168 * This can only happen if the scan's [S2] was issued before the delayed
169 * __smr_enter() [R1] (during the [I] window).
170 *
171 * As a consequence the outcome of that scan can be accepted as the "real"
172 * write sequence __smr_enter() should have observed.
173 *
174 *
175 * Litmus tests
176 * ~~~~~~~~~~~~
177 *
178 * This is the proof of [W] -> [R1] -> [S2] being established properly:
179 * - P0 sets a global and calls smr_synchronize()
180 * - P1 does smr_enter() and loads the global
181 *
182 * AArch64 MP
183 * {
184 * global = 0;
185 * wr_seq = 123;
186 * p1_rd_seq = 0;
187 *
188 * 0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq;
189 * 1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq;
190 * }
191 * P0 | P1 ;
192 * MOV X8, #2 | LDR X8, [X1] ;
193 * STR X8, [X0] | STR X8, [X2] ;
194 * LDADDL X8, X9, [X1] | DMB SY ;
195 * DMB SY | LDR X10, [X0] ;
196 * LDR X10, [X2] | ;
197 * exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0)
198 *
199 *
200 * This is the proof that deferred advances are also correct:
201 * - P0 sets a global and does a smr_deferred_advance()
202 * - P1 does an smr_synchronize() and reads the global
203 *
204 * AArch64 MP
205 * {
206 * global = 0;
207 * wr_seq = 123;
208 *
209 * 0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2;
210 * 1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2;
211 * }
212 * P0 | P1 ;
213 * STR X2, [X0] | LDADDL X2, X9, [X1] ;
214 * DMB SY | DMB SY ;
215 * LDR X9, [X1] | LDR X10, [X0] ;
216 * ADD X9, X9, X2 | ;
217 * exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0)
218 *
219 */
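
/*
 * Usage sketch (illustrative only, not part of the implementation): a minimal
 * reader/updater pairing over the public API implemented below, using the
 * global `smr_system` domain. `struct foo`, `foo_global` and the surrounding
 * update-side locking are hypothetical.
 *
 *     struct foo {
 *         int foo_value;
 *     };
 *     static struct foo *foo_global;   // published pointer, SMR protected
 *
 *     // reader side: non-sleepable critical section
 *     smr_enter(&smr_system);
 *     struct foo *f = os_atomic_load(&foo_global, acquire);
 *     int v = f ? f->foo_value : 0;
 *     smr_leave(&smr_system);
 *
 *     // updater side: unpublish, wait for readers, then reclaim
 *     struct foo *old = foo_global;
 *     os_atomic_store(&foo_global, NULL, release);
 *     smr_synchronize(&smr_system);
 *     kfree_type(struct foo, old);
 */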
220
221/*!
222 * @struct smr_worker
223 *
224 * @brief
225 * Structure tracking the per-cpu SMR workers state.
226 *
227 * @discussion
228 * This structure is system wide and global and is used to track
229 * the various active SMR domains at the granularity of a CPU.
230 *
231 * Each structure has an associated thread which is responsible
232 * for the forward progress the @c smr_call() and @c smr_barrier()
233 * interfaces.
234 *
235 * It also tracks all the active, non-stalled, sleepable SMR sections.
236 */
237struct smr_worker {
238 /*
239 * The thread for this worker,
240 * and a convenience pointer to the processor it is bound to.
241 */
242 struct thread *thread;
243 struct processor *processor;
244
245 /*
246 * Thread binding/locking logic:
247 *
248 * If the worker thread is running on its canonical CPU,
249 * then locking to access the various SMR per-cpu data
250 * structures it is draining is just preemption disablement.
251 *
252 * However, if it is currently not bound to its canonical
253 * CPU because the CPU has been offlined or de-recommended,
254 * then a lock which serializes with the CPU going online
255 * again is used instead.
256 */
257 struct waitq waitq;
258 smr_cpu_reason_t detach_reason;
259
260#if CONFIG_QUIESCE_COUNTER
261 /*
262 * Currently active quiescent generation for this processor,
263 * and the last timestamp when a scan of all cores was performed.
264 */
265 smr_seq_t rd_quiesce_seq;
266#endif
267
268 /*
269 * List of all the active sleepable sections that haven't
270 * been stalled.
271 */
272 struct smrq_list_head sect_queue;
273 struct thread *sect_waiter;
274
275 /*
276 * Queue of SMR domains with pending smr_call()
277 * callouts to drain.
278 *
279 * This uses an ageing strategy in order to amortize
280 * SMR clock updates:
281 *
282 * - the "old" queue have domains whose callbacks have
283 * a committed and aged sequence,
284 * - the "age" queue have domains whose callbacks have
285 * a commited but fresh sequence and need ageing,
286 * - the "cur" queue have domains whose callbacks have
287 * a sequence in the future and need for it to be committed.
288 */
289 struct smr_pcpu *whead;
290 struct smr_pcpu **wold_tail;
291 struct smr_pcpu **wage_tail;
292 struct smr_pcpu **wcur_tail;
293 uint64_t drain_ctime;
294
295 /*
296 * Queue of smr_barrier() calls in flight,
297 * that will be picked up by the worker thread
298 * to enqueue as smr_call() entries in their
299 * respective per-CPU data structures.
300 */
301 struct mpsc_queue_head barrier_queue;
302} __attribute__((aligned(64)));
303
304
305typedef struct smr_pcpu {
306 /*
307 * CPU private cacheline.
308 *
309 * Nothing other than the CPU this state is made for
310 * ever writes to this cacheline.
311 *
312 * It holds the epoch activity witness (rd_seq), and
313 * the local smr_call() queue, which is structured this way:
314 *
315 * head -> n1 -> n2 -> n3 -> n4 -> ... -> ni -> ... -> nN -> NULL
316 * ^ ^ ^
317 * qold_tail -------------' | |
318 * qage_tail --------------------------' |
319 * qcur_tail ---------------------------------------------'
320 *
321 * - the "old" queue can be reclaimed once qold_seq is past,
322 * qold_seq is always a committed sequence.
323 * - the "age" queue can be reclaimed once qage_seq is past,
324 * qage_seq might not be committed yet.
325 * - the "cur" queue has an approximate size of qcur_size bytes,
326 * and a length of qcur_cnt callbacks.
327 */
328
329 smr_seq_t c_rd_seq; /* might have SMR_SEQ_SLEEPABLE set */
330
331 smr_node_t qhead;
332
333 smr_seq_t qold_seq;
334 smr_node_t *qold_tail;
335
336 smr_seq_t qage_seq;
337 smr_node_t *qage_tail;
338
339 uint32_t qcur_size;
340 uint32_t qcur_cnt;
341 smr_node_t *qcur_tail;
342
343 uint8_t __cacheline_sep[0];
344
345 /*
346 * Drain queue.
347 *
348 * This is used to drive smr_call() via the smr worker threads.
349 * If the SMR domain is not using smr_call() or smr_barrier(),
350 * this isn't used.
351 */
352 struct smr *drain_smr;
353 struct smr_pcpu *drain_next;
354 uint16_t __check_cpu;
355 uint8_t __check_reason;
356 uint8_t __check_list;
357
358 /*
359 * Stalled queue.
360 *
361 * Stalled sections are enqueued onto this queue by the scheduler
362 * when their thread blocks (see smr_mark_active_trackers_stalled()).
363 *
364 * If the SMR domain is not sleepable, then this isn't used.
365 *
366 * This list is protected by a lock.
367 *
368 * When there are stalled sections, stall_rd_seq contains
369 * the oldest active stalled sequence number.
370 *
371 * When threads want to expedite a stalled section, they set
372 * stall_waiter_goal to the sequence number they are waiting
373 * for and block via turnstile on the oldest stalled section.
374 */
375 hw_lck_ticket_t stall_lock;
376 smr_seq_t stall_rd_seq;
377 smr_seq_t stall_waiter_goal;
378 struct smrq_tailq_head stall_queue;
379 struct turnstile *stall_ts;
380} __attribute__((aligned(128))) * smr_pcpu_t;
381
382static_assert(offsetof(struct smr_pcpu, __cacheline_sep) == 64);
383static_assert(sizeof(struct smr_pcpu) == 128);
384
385#define CPU_CHECKIN_MIN_INTERVAL_US 5000 /* 5ms */
386#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
387static uint64_t cpu_checkin_min_interval;
388static uint32_t cpu_checkin_min_interval_us;
389
390/*! the amount of memory pending retirement that causes a forceful flush */
391#if XNU_TARGET_OS_OSX
392static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 256 << 10);
393static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 128);
394#else
395static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 64 << 10);
396static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 32);
397#endif
398/* time __smr_wait_for_oncore busy-spins before taking the expensive route */
399static TUNABLE(uint32_t, smr_wait_spin_us, "smr_wait_spin_us", 20);
400
401static LCK_GRP_DECLARE(smr_lock_grp, "smr");
402static struct smr_worker PERCPU_DATA(smr_worker);
403static struct smrq_tailq_head smr_domains = SMRQ_TAILQ_INITIALIZER(smr_domains);
404
405SMR_DEFINE_FLAGS(smr_system, "system", SMR_NONE);
406SMR_DEFINE_FLAGS(smr_system_sleepable, "system (sleepable)", SMR_SLEEPABLE);
407
408
409#pragma mark SMR domains: init & helpers
410
411#define SMR_PCPU_NOT_QUEUED ((struct smr_pcpu *)-1)
412
413__attribute__((always_inline, overloadable))
414static inline smr_pcpu_t
415__smr_pcpu(smr_t smr, int cpu)
416{
417 return &smr->smr_pcpu[cpu];
418}
419
420__attribute__((always_inline, overloadable))
421static inline smr_pcpu_t
422__smr_pcpu(smr_t smr)
423{
424 return __smr_pcpu(smr, cpu: cpu_number());
425}
426
427static inline bool
428__smr_pcpu_queued(smr_pcpu_t pcpu)
429{
430 return pcpu->drain_next != SMR_PCPU_NOT_QUEUED;
431}
432
433static inline void
434__smr_pcpu_set_not_queued(smr_pcpu_t pcpu)
435{
436 pcpu->drain_next = SMR_PCPU_NOT_QUEUED;
437}
438
439static inline void
440__smr_pcpu_associate(smr_t smr, smr_pcpu_t pcpu)
441{
442 zpercpu_foreach_cpu(cpu) {
443 pcpu[cpu].qold_tail = &pcpu[cpu].qhead;
444 pcpu[cpu].qage_tail = &pcpu[cpu].qhead;
445 pcpu[cpu].qcur_tail = &pcpu[cpu].qhead;
446
447 pcpu[cpu].drain_smr = smr;
448 __smr_pcpu_set_not_queued(pcpu: &pcpu[cpu]);
449 hw_lck_ticket_init(&pcpu[cpu].stall_lock, &smr_lock_grp);
450 smrq_init(&pcpu[cpu].stall_queue);
451 }
452
453 os_atomic_store(&smr->smr_pcpu, pcpu, release);
454}
455
456static inline event64_t
457__smrw_oncore_event(struct smr_worker *smrw)
458{
459 return CAST_EVENT64_T(&smrw->sect_queue);
460}
461
462static inline event64_t
463__smrw_drain_event(struct smr_worker *smrw)
464{
465 return CAST_EVENT64_T(&smrw->whead);
466}
467
468static inline processor_t
469__smrw_drain_bind_target(struct smr_worker *smrw)
470{
471 return smrw->detach_reason ? PROCESSOR_NULL : smrw->processor;
472}
473
474static inline void
475__smrw_lock(struct smr_worker *smrw)
476{
477 waitq_lock(wq: &smrw->waitq);
478}
479
480static inline void
481__smrw_unlock(struct smr_worker *smrw)
482{
483 waitq_unlock(wq: &smrw->waitq);
484}
485
486/*!
487 * @function __smrw_wakeup_and_unlock()
488 *
489 * @brief
490 * Wakes up (with binding) the SMR worker.
491 *
492 * @discussion
493 * Wake up the worker thread and bind it to the proper processor
494 * as a side effect.
495 *
496 * This function must be called with interrupts disabled.
497 */
498static bool
499__smrw_wakeup_and_unlock(struct smr_worker *smrw)
500{
501 thread_t thread;
502
503 assert(!ml_get_interrupts_enabled());
504
505 thread = waitq_wakeup64_identify_locked(waitq: &smrw->waitq,
506 wake_event: __smrw_drain_event(smrw), THREAD_AWAKENED, flags: WAITQ_UNLOCK);
507
508 if (thread != THREAD_NULL) {
509 assert(thread == smrw->thread);
510
511 waitq_resume_and_bind_identified_thread(waitq: &smrw->waitq,
512 thread, processor: __smrw_drain_bind_target(smrw),
513 THREAD_AWAKENED, flags: WAITQ_WAKEUP_DEFAULT);
514 }
515
516 return thread != THREAD_NULL;
517}
518
519static void
520__smr_call_drain(smr_node_t head)
521{
522 smr_node_t node;
523
524 while ((node = head) != NULL) {
525 head = node->smrn_next;
526 node->smrn_next = NULL;
527 node->smrn_cb(node);
528 }
529}
530
531__startup_func
532void
533__smr_domain_init(smr_t smr)
534{
535 smr_pcpu_t pcpu;
536 vm_size_t size;
537
538 if (startup_phase < STARTUP_SUB_TUNABLES) {
539 smr_seq_t *rd_seqp = &smr->smr_early;
540
541 /*
542 * This is a big cheat, but before the EARLY_BOOT phase,
543 * all smr_* APIs that would access past the rd_seq
544 * will return early.
545 */
546 pcpu = __container_of(rd_seqp, struct smr_pcpu, c_rd_seq);
547 smr->smr_pcpu = pcpu - cpu_number();
548 assert(&__smr_pcpu(smr)->c_rd_seq == &smr->smr_early);
549 } else {
550 size = zpercpu_count() * sizeof(struct smr_pcpu);
551 pcpu = zalloc_permanent(size, ZALIGN(struct smr_pcpu));
552
553 __smr_pcpu_associate(smr, pcpu);
554 }
555}
556
557smr_t
558smr_domain_create(smr_flags_t flags, const char *name)
559{
560 smr_pcpu_t pcpu;
561 smr_t smr;
562
563 smr = kalloc_type(struct smr, Z_WAITOK | Z_ZERO | Z_NOFAIL);
564 pcpu = kalloc_type(struct smr_pcpu, zpercpu_count(),
565 Z_WAITOK | Z_ZERO | Z_NOFAIL);
566
567 smr->smr_clock.s_rd_seq = SMR_SEQ_INIT;
568 smr->smr_clock.s_wr_seq = SMR_SEQ_INIT;
569 smr->smr_flags = flags;
570 static_assert(sizeof(struct smr) ==
571 offsetof(struct smr, smr_name) + SMR_NAME_MAX);
572 strlcpy(dst: smr->smr_name, src: name, n: sizeof(smr->smr_name));
573
574 __smr_pcpu_associate(smr, pcpu);
575
576 return smr;
577}
578
579void
580smr_domain_free(smr_t smr)
581{
582 smr_barrier(smr);
583
584 zpercpu_foreach_cpu(cpu) {
585 smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
586
587 assert(pcpu->qhead == NULL);
588 hw_lck_ticket_destroy(&pcpu->stall_lock, &smr_lock_grp);
589 }
590
591 kfree_type(struct smr_pcpu, zpercpu_count(), smr->smr_pcpu);
592 kfree_type(struct smr, smr);
593}
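
/*
 * Sketch (illustrative): lifecycle of a dynamically created domain, assuming
 * a hypothetical subsystem that wants its own sleepable domain.
 *
 *     smr_t my_smr = smr_domain_create(SMR_SLEEPABLE, "my subsystem");
 *
 *     // ... smr_enter_sleepable() / smr_call() / smr_synchronize() ...
 *
 *     smr_domain_free(my_smr);    // includes an implicit smr_barrier()
 */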
594
595
596#pragma mark SMR domains: enter / leave
597
598bool
599smr_entered(smr_t smr)
600{
601 thread_t self = current_thread();
602 smr_tracker_t t;
603
604 if (lock_preemption_level_for_thread(self) &&
605 __smr_pcpu(smr)->c_rd_seq != SMR_SEQ_INVALID) {
606 return true;
607 }
608
609 if (smr->smr_flags & SMR_SLEEPABLE) {
610 smrq_serialized_foreach(t, &self->smr_stack, smrt_stack) {
611 if (t->smrt_domain == smr) {
612 return true;
613 }
614 }
615 }
616
617 return false;
618}
619
620__attribute__((always_inline))
621bool
622smr_entered_cpu_noblock(smr_t smr, int cpu)
623{
624 assert((smr->smr_flags & SMR_SLEEPABLE) == 0);
625 return __smr_pcpu(smr, cpu)->c_rd_seq != SMR_SEQ_INVALID;
626}
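
/*
 * Sketch (illustrative): smr_entered() is primarily useful for assertions in
 * accessors that must only be called from within a critical section of the
 * given domain. `foo_value_locked` and `struct foo` are hypothetical.
 *
 *     static int
 *     foo_value_locked(struct foo *f)
 *     {
 *         assert(smr_entered(&smr_system));
 *         return f->foo_value;
 *     }
 */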
627
628__attribute__((always_inline))
629static smr_seq_t
630__smr_enter(smr_t smr, smr_pcpu_t pcpu, smr_seq_t sleepable)
631{
632 smr_seq_t s_wr_seq;
633 smr_seq_t old_seq;
634
635 assert(!ml_at_interrupt_context());
636
637 /*
638 * It is possible to have a long delay between loading the s_wr_seq
639 * and storing it to the percpu copy of it.
640 *
641 * It is unlikely but possible that by that time the s_rd_seq has
642 * advanced ahead of what we will store. This however is still safe
643 * and is handled in __smr_scan().
644 *
645 * On Intel, to achieve the ordering we want, we could use a store
646 * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...).
647 * XADD is just the fastest instruction of the alternatives,
648 * but it will only ever add to '0'.
649 */
650 s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
651#if __x86_64__
652 /* [R1] */
653 old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq | sleepable, seq_cst);
654#else
655 old_seq = pcpu->c_rd_seq;
656 os_atomic_store(&pcpu->c_rd_seq, s_wr_seq | sleepable, relaxed);
657 os_atomic_thread_fence(seq_cst); /* [R1] */
658#endif
659 assert(old_seq == SMR_SEQ_INVALID);
660
661 return s_wr_seq;
662}
663
664__attribute__((always_inline))
665static void
666__smr_leave(smr_pcpu_t pcpu)
667{
668 assert(!ml_at_interrupt_context());
669 /* [R2] */
670 os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
671}
672
673__attribute__((always_inline))
674void
675smr_enter(smr_t smr)
676{
677 disable_preemption();
678 __smr_enter(smr, pcpu: __smr_pcpu(smr), sleepable: 0);
679}
680
681__attribute__((always_inline))
682void
683smr_leave(smr_t smr)
684{
685 __smr_leave(pcpu: __smr_pcpu(smr));
686 enable_preemption();
687}
688
689void
690smr_enter_sleepable(smr_t smr, smr_tracker_t tracker)
691{
692 thread_t self = current_thread();
693 struct smr_worker *smrw;
694 smr_pcpu_t pcpu;
695
696 assert(smr->smr_flags & SMR_SLEEPABLE);
697
698 lock_disable_preemption_for_thread(self);
699 lck_rw_lock_count_inc(thread: self, lock: smr);
700
701 pcpu = __smr_pcpu(smr);
702 smrw = PERCPU_GET(smr_worker);
703
704 tracker->smrt_domain = smr;
705 tracker->smrt_seq = __smr_enter(smr, pcpu, SMR_SEQ_SLEEPABLE);
706 smrq_serialized_insert_head_relaxed(&smrw->sect_queue, &tracker->smrt_link);
707 smrq_serialized_insert_head_relaxed(&self->smr_stack, &tracker->smrt_stack);
708 tracker->smrt_ctid = 0;
709 tracker->smrt_cpu = -1;
710
711 lock_enable_preemption();
712}
713
714__attribute__((always_inline))
715static void
716__smr_wake_oncore_sleepers(struct smr_worker *smrw)
717{
718 /*
719 * prevent reordering between making the list empty and checking for waiters.
720 */
721 if (__improbable(os_atomic_load(&smrw->sect_waiter, compiler_acq_rel))) {
722 if (smrq_empty(&smrw->sect_queue)) {
723 os_atomic_store(&smrw->sect_waiter, NULL, relaxed);
724 waitq_wakeup64_all(waitq: &smrw->waitq,
725 wake_event: __smrw_oncore_event(smrw), THREAD_AWAKENED,
726 flags: WAITQ_WAKEUP_DEFAULT);
727 }
728 }
729}
730
731void
732smr_ack_ipi(void)
733{
734 /*
735 * see __smr_wait_for_oncore(): if at the time of the IPI ack
736 * the list is empty and there is still a waiter, wake it up.
737 *
738 * If the queue is not empty, then when smr_leave_sleepable()
739 * runs it can't possibly fail to observe smrw->sect_waiter
740 * being non NULL and will do the wakeup then.
741 */
742 __smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
743}
744
745void
746smr_mark_active_trackers_stalled(thread_t self)
747{
748 struct smr_worker *smrw = PERCPU_GET(smr_worker);
749 int cpu = cpu_number();
750 smr_tracker_t t;
751
752 /* called at splsched */
753
754 smrq_serialized_foreach_safe(t, &smrw->sect_queue, smrt_link) {
755 smr_t smr = t->smrt_domain;
756 smr_pcpu_t pcpu;
757
758 pcpu = __smr_pcpu(smr, cpu);
759
760 t->smrt_ctid = self->ctid;
761 t->smrt_cpu = cpu;
762
763 hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
764
765 /*
766 * Transfer the section to the stalled queue,
767 * and _then_ leave the regular one.
768 *
769 * A store-release is sufficient to order these stores,
770 * and guarantee that __smr_scan() can't fail to observe
771 * both the @c rd_seq and @c stall_rd_seq during a transfer
772 * of a stalled section that was active when it started.
773 */
774 if (smrq_empty(&pcpu->stall_queue)) {
775 os_atomic_store(&pcpu->stall_rd_seq, t->smrt_seq, relaxed);
776 }
777 os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release);
778
779 smrq_serialized_insert_tail_relaxed(&pcpu->stall_queue, &t->smrt_link);
780
781 hw_lck_ticket_unlock_nopreempt(tlock: &pcpu->stall_lock);
782 }
783
784 smrq_init(&smrw->sect_queue);
785
786 __smr_wake_oncore_sleepers(smrw);
787}
788
789
790__attribute__((noinline))
791static void
792__smr_leave_stalled(smr_t smr, smr_tracker_t tracker, thread_t self)
793{
794 smr_seq_t new_stall_seq = SMR_SEQ_INVALID;
795 smr_tracker_t first = NULL;
796 smr_pcpu_t pcpu;
797 bool progress;
798
799 pcpu = __smr_pcpu(smr, cpu: tracker->smrt_cpu);
800
801 hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp);
802
803 progress = smrq_serialized_first(&pcpu->stall_queue,
804 struct smr_tracker, smrt_link) == tracker;
805
806 smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
807 smrq_serialized_remove(&pcpu->stall_queue, &tracker->smrt_link);
808 bzero(s: tracker, n: sizeof(*tracker));
809
810 if (progress) {
811 if (!smrq_empty(&pcpu->stall_queue)) {
812 first = smrq_serialized_first(&pcpu->stall_queue,
813 struct smr_tracker, smrt_link);
814 new_stall_seq = first->smrt_seq;
815 __builtin_assume(new_stall_seq != SMR_SEQ_INVALID);
816 assert(SMR_SEQ_CMP(pcpu->stall_rd_seq, <=, new_stall_seq));
817 }
818
819 os_atomic_store(&pcpu->stall_rd_seq, new_stall_seq, release);
820
821 progress = pcpu->stall_waiter_goal != SMR_SEQ_INVALID;
822 }
823
824 if (progress) {
825 struct turnstile *ts;
826
827 ts = turnstile_prepare(proprietor: (uintptr_t)pcpu, tstore: &pcpu->stall_ts,
828 TURNSTILE_NULL, type: TURNSTILE_KERNEL_MUTEX);
829
830 if (new_stall_seq == SMR_SEQ_INVALID ||
831 SMR_SEQ_CMP(pcpu->stall_waiter_goal, <=, new_stall_seq)) {
832 pcpu->stall_waiter_goal = SMR_SEQ_INVALID;
833 waitq_wakeup64_all(waitq: &ts->ts_waitq, CAST_EVENT64_T(pcpu),
834 THREAD_AWAKENED, flags: WAITQ_UPDATE_INHERITOR);
835 } else {
836 turnstile_update_inheritor(turnstile: ts, new_inheritor: ctid_get_thread(ctid: first->smrt_ctid),
837 flags: TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
838 }
839
840 turnstile_update_inheritor_complete(turnstile: ts, flags: TURNSTILE_INTERLOCK_HELD);
841
842 turnstile_complete(proprietor: (uintptr_t)pcpu, tstore: &pcpu->stall_ts,
843 NULL, type: TURNSTILE_KERNEL_MUTEX);
844 }
845
846 /* reenables preemption disabled in smr_leave_sleepable() */
847 hw_lck_ticket_unlock(tlock: &pcpu->stall_lock);
848
849 turnstile_cleanup();
850}
851
852void
853smr_leave_sleepable(smr_t smr, smr_tracker_t tracker)
854{
855 struct smr_worker *smrw;
856 thread_t self = current_thread();
857
858 assert(tracker->smrt_seq != SMR_SEQ_INVALID);
859 assert(smr->smr_flags & SMR_SLEEPABLE);
860
861 lock_disable_preemption_for_thread(self);
862
863 lck_rw_lock_count_dec(thread: self, lock: smr);
864
865 if (__improbable(tracker->smrt_cpu != -1)) {
866 return __smr_leave_stalled(smr, tracker, self);
867 }
868
869 __smr_leave(pcpu: __smr_pcpu(smr));
870
871 smrw = PERCPU_GET(smr_worker);
872 smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack);
873 smrq_serialized_remove(&smrw->sect_queue, &tracker->smrt_link);
874 bzero(s: tracker, n: sizeof(*tracker));
875
876 __smr_wake_oncore_sleepers(PERCPU_GET(smr_worker));
877
878 lock_enable_preemption();
879}
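
/*
 * Sketch (illustrative): a sleepable critical section using an on-stack
 * tracker against the sleepable system domain. The body may block, which is
 * what distinguishes this from smr_enter()/smr_leave(); `foo_global` and
 * `foo_blocking_work()` are hypothetical.
 *
 *     struct smr_tracker trk;
 *
 *     smr_enter_sleepable(&smr_system_sleepable, &trk);
 *     struct foo *f = os_atomic_load(&foo_global, acquire);
 *     if (f != NULL) {
 *         foo_blocking_work(f);    // may block: the section can be stalled
 *     }
 *     smr_leave_sleepable(&smr_system_sleepable, &trk);
 */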
880
881
882#pragma mark SMR domains: advance, wait, poll, synchronize
883
884static inline smr_seq_t
885__smr_wr_advance(smr_t smr)
886{
887 /* [W] */
888 return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release);
889}
890
891static inline bool
892__smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq)
893{
894 smr_seq_t o_seq;
895
896 os_atomic_thread_fence(seq_cst); /* [S3] */
897
898 os_atomic_rmw_loop(&smr->smr_clock.s_rd_seq, o_seq, rd_seq, relaxed, {
899 if (SMR_SEQ_CMP(rd_seq, <=, o_seq)) {
900 rd_seq = o_seq;
901 os_atomic_rmw_loop_give_up(break);
902 }
903 });
904
905 return SMR_SEQ_CMP(goal, <=, rd_seq);
906}
907
908__attribute__((noinline))
909static smr_seq_t
910__smr_wait_for_stalled(smr_pcpu_t pcpu, smr_seq_t goal)
911{
912 struct turnstile *ts;
913 thread_t inheritor;
914 wait_result_t wr;
915 smr_seq_t stall_rd_seq;
916
917 hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
918
919 stall_rd_seq = pcpu->stall_rd_seq;
920 if (stall_rd_seq == SMR_SEQ_INVALID ||
921 SMR_SEQ_CMP(goal, <=, stall_rd_seq)) {
922 hw_lck_ticket_unlock(tlock: &pcpu->stall_lock);
923 return stall_rd_seq;
924 }
925
926 if (pcpu->stall_waiter_goal == SMR_SEQ_INVALID ||
927 SMR_SEQ_CMP(goal, <, pcpu->stall_waiter_goal)) {
928 pcpu->stall_waiter_goal = goal;
929 }
930
931 inheritor = ctid_get_thread(smrq_serialized_first(&pcpu->stall_queue,
932 struct smr_tracker, smrt_link)->smrt_ctid);
933
934 ts = turnstile_prepare(proprietor: (uintptr_t)pcpu, tstore: &pcpu->stall_ts,
935 TURNSTILE_NULL, type: TURNSTILE_KERNEL_MUTEX);
936
937 turnstile_update_inheritor(turnstile: ts, new_inheritor: inheritor,
938 flags: TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
939 wr = waitq_assert_wait64(waitq: &ts->ts_waitq, CAST_EVENT64_T(pcpu),
940 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
941 turnstile_update_inheritor_complete(turnstile: ts, flags: TURNSTILE_INTERLOCK_HELD);
942
943 if (wr == THREAD_WAITING) {
944 hw_lck_ticket_unlock(tlock: &pcpu->stall_lock);
945 thread_block(THREAD_CONTINUE_NULL);
946 hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp);
947 }
948
949 turnstile_complete(proprietor: (uintptr_t)pcpu, tstore: &pcpu->stall_ts,
950 NULL, type: TURNSTILE_KERNEL_MUTEX);
951
952 stall_rd_seq = pcpu->stall_rd_seq;
953 hw_lck_ticket_unlock(tlock: &pcpu->stall_lock);
954
955 turnstile_cleanup();
956
957 return stall_rd_seq;
958}
959
960__attribute__((noinline))
961static smr_seq_t
962__smr_wait_for_oncore(smr_pcpu_t pcpu, smr_seq_t goal, uint32_t cpu)
963{
964 thread_t self = current_thread();
965 struct smr_worker *smrw;
966 uint64_t deadline = 0;
967 vm_offset_t base;
968 smr_seq_t rd_seq;
969
970 /*
971 * We are waiting for a currently active SMR section.
972 * Start spin-waiting for it for a bit.
973 */
974 for (;;) {
975 if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
976 rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
977 return rd_seq;
978 }
979
980 if (deadline == 0) {
981 clock_interval_to_deadline(interval: smr_wait_spin_us,
982 NSEC_PER_USEC, result: &deadline);
983 } else if (mach_absolute_time() > deadline) {
984 break;
985 }
986 }
987
988 /*
989 * This section has been active for a while,
990 * so we need to move to a more passive way of waiting.
991 *
992 * We post ourselves on the remote processor tracking head,
993 * to denote we need a thread_wakeup() when the tracker head clears,
994 * then send an IPI which will have 2 possible outcomes:
995 *
996 * 1. when smr_ack_ipi() runs, the queue is already cleared,
997 * and we will be woken up immediately.
998 *
999 * 2. when smr_ack_ipi() runs, the queue isn't cleared,
1000 * then it does nothing, but there is a guarantee that
1001 * when the queue clears, the remote core will observe
1002 * that there is a waiter, and thread_wakeup() will be
1003 * called then.
1004 *
1005 * In order to avoid actually waiting, we do spin some more,
1006 * hoping for the remote sequence to change.
1007 */
1008 base = other_percpu_base(cpu_number: cpu);
1009 smrw = PERCPU_GET_WITH_BASE(base, smr_worker);
1010
1011 waitq_assert_wait64(waitq: &smrw->waitq, wait_event: __smrw_oncore_event(smrw),
1012 THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
1013
1014 if (lock_cmpxchg(&smrw->sect_waiter, NULL, self, relaxed)) {
1015 /*
1016 * only really send the IPI if we're first,
1017 * to avoid IPI storms in case of a pile-up
1018 * of smr_synchronize() calls stalled on the same guy.
1019 */
1020 cause_ast_check(PERCPU_GET_WITH_BASE(base, processor));
1021 }
1022
1023 if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq,
1024 rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) {
1025 clear_wait(thread: self, THREAD_AWAKENED);
1026 return rd_seq;
1027 }
1028
1029 thread_block(THREAD_CONTINUE_NULL);
1030
1031 return os_atomic_load(&pcpu->c_rd_seq, relaxed);
1032}
1033
1034__attribute__((noinline))
1035static bool
1036__smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait)
1037{
1038 smr_delta_t delta;
1039 smr_seq_t rd_seq;
1040
1041 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1042 return true;
1043 }
1044
1045 /*
1046 * Validate that the goal is sane.
1047 */
1048 delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq);
1049 if (delta == SMR_SEQ_INC) {
1050 /*
1051 * This SMR clock uses deferred advance,
1052 * and the goal is one inc in the future.
1053 *
1054 * If we can wait, then commit the sequence number,
1055 * else we can't possibly succeed.
1056 *
1057 * Doing a commit here rather than an advance
1058 * gives the hardware a chance to abort the
1059 * transaction early in case of high contention
1060 * compared to an unconditional advance.
1061 */
1062 if (!wait) {
1063 return false;
1064 }
1065 if (lock_cmpxchgv(&smr->smr_clock.s_wr_seq,
1066 clk.s_wr_seq, goal, &clk.s_wr_seq, relaxed)) {
1067 clk.s_wr_seq = goal;
1068 }
1069 } else if (delta > 0) {
1070 /*
1071 * Invalid goal: the caller held on it for too long,
1072 * and integers wrapped.
1073 */
1074 return true;
1075 }
1076
1077 os_atomic_thread_fence(seq_cst); /* [S2] */
1078
1079 /*
1080 * The read sequence can be no larger than the write sequence
1081 * at the start of the poll.
1082 *
1083 * We know that on entry:
1084 *
1085 * s_rd_seq < goal <= s_wr_seq
1086 *
1087 * The correctness of this algorithm relies on the fact that
1088 * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move
1089 * by more than roughly (ULONG_MAX / 2) while __smr_scan()
1090 * is running, otherwise the "rd_seq" we try to scan for
1091 * might appear larger than s_rd_seq spuriously and we'd
1092 * __smr_rd_advance() incorrectly.
1093 *
1094 * This is guaranteed by the fact that this represents
1095 * advancing 2^62 times. At one advance every nanosecond,
1096 * it takes more than a century, which makes it possible
1097 * to call smr_wait() or smr_poll() with preemption enabled.
1098 */
1099 rd_seq = clk.s_wr_seq;
1100
1101 zpercpu_foreach_cpu(cpu) {
1102 smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1103 smr_seq_t seq = os_atomic_load(&pcpu->c_rd_seq, relaxed);
1104
1105 while (seq != SMR_SEQ_INVALID) {
1106 /*
1107 * Resolve the race documented in __smr_enter().
1108 *
1109 * The CPU has loaded a stale s_wr_seq, and s_rd_seq
1110 * moved past this stale value.
1111 *
1112 * Its critical section is however properly serialized,
1113 * but we can't know what "correct" s_wr_seq it
1114 * could have observed. We have to assume it observed `s_rd_seq`
1115 * in order to prevent s_rd_seq from advancing past it.
1116 */
1117 if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1118 seq = clk.s_rd_seq;
1119 }
1120
1121 if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1122 seq &= ~SMR_SEQ_SLEEPABLE;
1123 break;
1124 }
1125
1126 if (seq & SMR_SEQ_SLEEPABLE) {
1127 seq = __smr_wait_for_oncore(pcpu, goal, cpu);
1128 } else {
1129 disable_preemption();
1130 seq = hw_wait_while_equals_long(&pcpu->c_rd_seq, seq);
1131 enable_preemption();
1132 }
1133 }
1134
1135 if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1136 rd_seq = seq;
1137 }
1138 }
1139
1140 if (smr->smr_flags & SMR_SLEEPABLE) {
1141 /*
1142 * Order observation of stalled sections,
1143 * see smr_mark_active_trackers_stalled().
1144 */
1145 os_atomic_thread_fence(seq_cst);
1146
1147 zpercpu_foreach_cpu(cpu) {
1148 smr_pcpu_t pcpu = __smr_pcpu(smr, cpu);
1149 smr_seq_t seq = os_atomic_load(&pcpu->stall_rd_seq, relaxed);
1150
1151 while (seq != SMR_SEQ_INVALID) {
1152 if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) {
1153 seq = clk.s_rd_seq;
1154 }
1155
1156 if (!wait || SMR_SEQ_CMP(goal, <=, seq)) {
1157 seq &= ~SMR_SEQ_SLEEPABLE;
1158 break;
1159 }
1160
1161 seq = __smr_wait_for_stalled(pcpu, goal);
1162 }
1163
1164 if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) {
1165 rd_seq = seq;
1166 }
1167 }
1168 }
1169
1170 /*
1171 * Advance the rd_seq as long as we observed a more recent value.
1172 */
1173 return __smr_rd_advance(smr, goal, rd_seq);
1174}
1175
1176static inline bool
1177__smr_poll(smr_t smr, smr_seq_t goal, bool wait)
1178{
1179 smr_clock_t clk;
1180
1181 /*
1182 * Load both the s_rd_seq and s_wr_seq in the right order so that we
1183 * can't observe a s_rd_seq older than s_wr_seq.
1184 */
1185
1186 /* [S1] */
1187 clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire);
1188
1189 /*
1190 * We expect this to be typical: the goal has already been observed.
1191 */
1192 if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) {
1193 return true;
1194 }
1195
1196 clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1197
1198 return __smr_scan(smr, goal, clk, wait);
1199}
1200
1201smr_seq_t
1202smr_advance(smr_t smr)
1203{
1204 smr_clock_t clk;
1205
1206 assert(!smr_entered(smr));
1207
1208 /*
1209 * We assume that there will at least be a successful __smr_poll
1210 * call every 2^60 calls to smr_advance() or so, so we do not need
1211 * to check if [s_rd_seq, s_wr_seq) is growing too wide.
1212 */
1213 static_assert(sizeof(clk.s_wr_seq) == 8);
1214 return __smr_wr_advance(smr);
1215}
1216
1217smr_seq_t
1218smr_deferred_advance(smr_t smr)
1219{
1220 os_atomic_thread_fence(seq_cst);
1221 return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1222}
1223
1224void
1225smr_deferred_advance_commit(smr_t smr, smr_seq_t seq)
1226{
1227 /*
1228 * no barrier needed: smr_deferred_advance() had one already.
1229 * no failure handling: it means someone updated the clock already!
1230 * lock_cmpxchg: so that we pre-test for architectures needing it.
1231 */
1232 assert(seq != SMR_SEQ_INVALID);
1233 lock_cmpxchg(&smr->smr_clock.s_wr_seq, seq - SMR_SEQ_INC, seq, relaxed);
1234}
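
/*
 * Sketch (illustrative): the deferred advance protocol, as used by the
 * smr_call() machinery below. A batcher can stamp retired work with a
 * sequence one increment in the future without dirtying the global clock,
 * and commit it later (or let __smr_scan() commit it on its behalf):
 *
 *     smr_seq_t seq = smr_deferred_advance(smr);  // s_wr_seq + SMR_SEQ_INC
 *     // ... tag a batch of retired objects with `seq` ...
 *     smr_deferred_advance_commit(smr, seq);      // make `seq` reachable
 *     // ... later ...
 *     if (smr_poll(smr, seq)) {
 *         // the batch is now safe to reclaim
 *     }
 */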
1235
1236bool
1237smr_poll(smr_t smr, smr_seq_t goal)
1238{
1239 assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1240 return __smr_poll(smr, goal, false);
1241}
1242
1243void
1244smr_wait(smr_t smr, smr_seq_t goal)
1245{
1246 assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID);
1247 if (smr->smr_flags & SMR_SLEEPABLE) {
1248 assert(get_preemption_level() == 0);
1249 }
1250 (void)__smr_poll(smr, goal, true);
1251}
1252
1253void
1254smr_synchronize(smr_t smr)
1255{
1256 smr_clock_t clk;
1257
1258 assert(!smr_entered(smr));
1259 assert(!ml_at_interrupt_context());
1260 if (smr->smr_flags & SMR_SLEEPABLE) {
1261 assert(get_preemption_level() == 0);
1262 }
1263
1264 /*
1265 * Similar to __smr_poll() but also does a deferred advance which
1266 * __smr_scan will commit.
1267 */
1268
1269 clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed);
1270 os_atomic_thread_fence(seq_cst);
1271 clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed);
1272
1273 (void)__smr_scan(smr, goal: clk.s_wr_seq + SMR_SEQ_INC, clk, true);
1274}
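
/*
 * Sketch (illustrative): two-phase reclamation with smr_advance()/smr_wait(),
 * which lets the caller do useful work between unpublishing and waiting,
 * unlike smr_synchronize() which does both back to back. `foo_global` and
 * `old` are hypothetical, as in the sketch at the top of this file.
 *
 *     os_atomic_store(&foo_global, NULL, release);   // unpublish
 *     smr_seq_t goal = smr_advance(&smr_system);
 *
 *     // ... unrelated work while readers drain ...
 *
 *     smr_wait(&smr_system, goal);
 *     kfree_type(struct foo, old);
 */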
1275
1276
1277#pragma mark SMR domains: smr_call & smr_barrier
1278
1279/*!
1280 * @struct smr_barrier_ctx
1281 *
1282 * @brief
1283 * Data structure to track the completion of an smr_barrier() call.
1284 */
1285struct smr_barrier_ctx {
1286 struct smr *smrb_domain;
1287 struct thread *smrb_waiter;
1288 uint32_t smrb_pending;
1289 uint32_t smrb_count;
1290};
1291
1292/*!
1293 * @struct smr_barrier_job
1294 *
1295 * @brief
1296 * Data structure used to track completion of smr_barrier() calls.
1297 */
1298struct smr_barrier_job {
1299 struct smr_barrier_ctx *smrj_context;
1300 union {
1301 struct smr_node smrj_node;
1302 struct mpsc_queue_chain smrj_link;
1303 };
1304};
1305
1306#define SMR_BARRIER_SIZE 24
1307static_assert(sizeof(struct smr_barrier_job) == SMR_BARRIER_SIZE);
1308#define SMR_BARRIER_USE_STACK (SMR_BARRIER_SIZE * MAX_CPUS <= 512)
1309
1310static void
1311__smr_worker_check_invariants(struct smr_worker *smrw)
1312{
1313#if MACH_ASSERT
1314 smr_pcpu_t pcpu = smrw->whead;
1315 uint16_t num = (uint16_t)cpu_number();
1316
1317 assert(!ml_get_interrupts_enabled() || get_preemption_level());
1318
1319 for (; pcpu != *smrw->wold_tail; pcpu = pcpu->drain_next) {
1320 assertf(pcpu->qold_seq != SMR_SEQ_INVALID &&
1321 __smr_pcpu_queued(pcpu),
1322 "pcpu %p doesn't belong on %p old queue", pcpu, smrw);
1323 pcpu->__check_cpu = num;
1324 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1325 pcpu->__check_list = 1;
1326 }
1327
1328 for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1329 __assert_only smr_t smr = pcpu->drain_smr;
1330
1331 assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1332 pcpu->qage_seq != SMR_SEQ_INVALID &&
1333 SMR_SEQ_CMP(pcpu->qage_seq, <=, smr->smr_clock.s_wr_seq) &&
1334 __smr_pcpu_queued(pcpu),
1335 "pcpu %p doesn't belong on %p aging queue", pcpu, smrw);
1336 pcpu->__check_cpu = num;
1337 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1338 pcpu->__check_list = 2;
1339 }
1340
1341 for (; pcpu != *smrw->wcur_tail; pcpu = pcpu->drain_next) {
1342 assertf(pcpu->qold_seq == SMR_SEQ_INVALID &&
1343 pcpu->qage_seq != SMR_SEQ_INVALID &&
1344 __smr_pcpu_queued(pcpu),
1345 "pcpu %p doesn't belong on %p current queue", pcpu, smrw);
1346 pcpu->__check_cpu = num;
1347 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1348 pcpu->__check_list = 3;
1349 }
1350
1351 assert(pcpu == NULL);
1352#else
1353 (void)smrw;
1354#endif
1355}
1356
1357__attribute__((noinline))
1358static void
1359__smr_cpu_lazy_up(struct smr_worker *smrw)
1360{
1361 spl_t spl;
1362
1363 /*
1364 * calling smr_call/smr_barrier() from the context of a CPU
1365 * with a detached worker is illegal.
1366 *
1367 * However, bound threads might run on a derecommended (IGNORED)
1368 * cpu which we correct for here (and the CPU will go back to IGNORED
1369 * in smr_cpu_leave()).
1370 */
1371 assert(smrw->detach_reason == SMR_CPU_REASON_IGNORED);
1372
1373 spl = splsched();
1374 __smrw_lock(smrw);
1375 smrw->detach_reason &= ~SMR_CPU_REASON_IGNORED;
1376 __smrw_unlock(smrw);
1377 splx(spl);
1378}
1379
1380static void
1381__smr_cpu_lazy_up_if_needed(struct smr_worker *smrw)
1382{
1383 if (__improbable(smrw->detach_reason != SMR_CPU_REASON_NONE)) {
1384 __smr_cpu_lazy_up(smrw);
1385 }
1386}
1387
1388static bool
1389__smr_call_should_advance(smr_pcpu_t pcpu)
1390{
1391 if (pcpu->qcur_cnt > smr_call_cnt_cap) {
1392 return true;
1393 }
1394 if (pcpu->qcur_size > smr_call_size_cap) {
1395 return true;
1396 }
1397 return false;
1398}
1399
1400static void
1401__smr_call_advance_qcur(smr_t smr, smr_pcpu_t pcpu, bool needs_commit)
1402{
1403 smr_seq_t new_seq;
1404
1405 if (needs_commit || pcpu->qage_seq) {
1406 new_seq = smr_advance(smr);
1407 } else {
1408 new_seq = smr_deferred_advance(smr);
1409 }
1410 __builtin_assume(new_seq != SMR_SEQ_INVALID);
1411
1412 pcpu->qage_seq = new_seq;
1413 pcpu->qage_tail = pcpu->qcur_tail;
1414
1415 pcpu->qcur_size = 0;
1416 pcpu->qcur_cnt = 0;
1417}
1418
1419static void
1420__smr_call_push(smr_pcpu_t pcpu, smr_node_t node, smr_cb_t cb)
1421{
1422 assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1423
1424 node->smrn_next = NULL;
1425 node->smrn_cb = cb;
1426
1427 *pcpu->qcur_tail = node;
1428 pcpu->qcur_tail = &node->smrn_next;
1429 pcpu->qcur_cnt += 1;
1430}
1431
1432static void
1433__smr_call_dispatch(struct smr_worker *smrw, smr_pcpu_t pcpu)
1434{
1435 __smr_worker_check_invariants(smrw);
1436
1437 if (!__smr_pcpu_queued(pcpu)) {
1438 assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1439 assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1440
1441 pcpu->drain_next = NULL;
1442 *smrw->wcur_tail = pcpu;
1443 smrw->wcur_tail = &pcpu->drain_next;
1444 }
1445}
1446
1447void
1448smr_call(smr_t smr, smr_node_t node, vm_size_t size, smr_cb_t cb)
1449{
1450 struct smr_worker *smrw;
1451 smr_pcpu_t pcpu;
1452
1453 if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) {
1454 return cb(node);
1455 }
1456
1457 lock_disable_preemption_for_thread(current_thread());
1458 assert(!ml_at_interrupt_context());
1459
1460 smrw = PERCPU_GET(smr_worker);
1461 __smr_cpu_lazy_up_if_needed(smrw);
1462
1463 pcpu = __smr_pcpu(smr);
1464 assert(pcpu->c_rd_seq == SMR_SEQ_INVALID);
1465
1466 if (os_add_overflow(pcpu->qcur_size, size, &pcpu->qcur_size)) {
1467 pcpu->qcur_size = UINT32_MAX;
1468 }
1469
1470 __smr_call_push(pcpu, node, cb);
1471 if (__smr_call_should_advance(pcpu)) {
1472 if (pcpu->qage_seq == SMR_SEQ_INVALID) {
1473 __smr_call_advance_qcur(smr, pcpu, false);
1474 }
1475 __smr_call_dispatch(smrw, pcpu);
1476 }
1477
1478 return lock_enable_preemption();
1479}
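
/*
 * Sketch (illustrative): asynchronous reclamation with smr_call(), embedding
 * the smr_node inside the hypothetical object being retired.
 *
 *     struct foo {
 *         int             foo_value;
 *         struct smr_node foo_node;
 *     };
 *
 *     static void
 *     foo_free_cb(struct smr_node *node)
 *     {
 *         struct foo *f = __container_of(node, struct foo, foo_node);
 *
 *         kfree_type(struct foo, f);
 *     }
 *
 *     // once `f` is unpublished from all SMR-visible structures:
 *     smr_call(&smr_system, &f->foo_node, sizeof(*f), foo_free_cb);
 */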
1480
1481static inline event_t
1482__smrb_event(struct smr_barrier_ctx *ctx)
1483{
1484 return ctx;
1485}
1486
1487static void
1488__smr_barrier_cb(struct smr_node *node)
1489{
1490 struct smr_barrier_job *job;
1491 struct smr_barrier_ctx *ctx;
1492
1493 job = __container_of(node, struct smr_barrier_job, smrj_node);
1494 ctx = job->smrj_context;
1495
1496 if (os_atomic_dec(&ctx->smrb_pending, relaxed) == 0) {
1497 /*
1498 * It is permitted to still reach into the context
1499 * because smr_barrier() always blocks, which means
1500 * that the context will be valid until this wakeup
1501 * happens.
1502 */
1503 thread_wakeup_thread(event: __smrb_event(ctx), thread: ctx->smrb_waiter);
1504 }
1505}
1506
1507static bool
1508__smr_barrier_drain(struct smr_worker *smrw, bool needs_commit)
1509{
1510 mpsc_queue_chain_t head, tail, it;
1511
1512 head = mpsc_queue_dequeue_batch(q: &smrw->barrier_queue, tail: &tail,
1513 OS_ATOMIC_DEPENDENCY_NONE);
1514
1515 mpsc_queue_batch_foreach_safe(it, head, tail) {
1516 struct smr_barrier_job *job;
1517 struct smr_barrier_ctx *ctx;
1518 smr_pcpu_t pcpu;
1519 smr_t smr;
1520
1521 job = __container_of(it, struct smr_barrier_job, smrj_link);
1522 ctx = job->smrj_context;
1523 smr = ctx->smrb_domain;
1524 pcpu = __smr_pcpu(smr, smrw->processor->cpu_id);
1525
1526 pcpu->qcur_size = UINT32_MAX;
1527 __smr_call_push(pcpu, &job->smrj_node, __smr_barrier_cb);
1528 __smr_call_advance_qcur(smr, pcpu, needs_commit);
1529 __smr_call_dispatch(smrw, pcpu);
1530 }
1531
1532 return head != NULL;
1533}
1534
1535
1536void
1537smr_barrier(smr_t smr)
1538{
1539#if SMR_BARRIER_USE_STACK
1540 struct smr_barrier_job jobs[MAX_CPUS];
1541#else
1542 struct smr_barrier_job *jobs;
1543#endif
1544 struct smr_barrier_job *job;
1545 struct smr_barrier_ctx ctx = {
1546 .smrb_domain = smr,
1547 .smrb_waiter = current_thread(),
1548 .smrb_pending = zpercpu_count(),
1549 .smrb_count = zpercpu_count(),
1550 };
1551 spl_t spl;
1552
1553 /*
1554 * First wait for all readers to observe whatever it is
1555 * that changed prior to this call.
1556 *
1557 * _then_ enqueue callbacks that push out anything ahead.
1558 */
1559 smr_synchronize(smr);
1560
1561#if !SMR_BARRIER_USE_STACK
1562 jobs = kalloc_type(struct smr_barrier_job, ctx.smrb_count,
1563 Z_WAITOK | Z_ZERO | Z_NOFAIL);
1564#endif
1565 job = jobs;
1566 spl = splsched();
1567
1568 __smr_cpu_lazy_up_if_needed(PERCPU_GET(smr_worker));
1569
1570 percpu_foreach(smrw, smr_worker) {
1571 job->smrj_context = &ctx;
1572 if (mpsc_queue_append(q: &smrw->barrier_queue, elm: &job->smrj_link)) {
1573 __smrw_lock(smrw);
1574 __smrw_wakeup_and_unlock(smrw);
1575 }
1576 job++;
1577 }
1578
1579 /*
1580 * Because we disabled interrupts, our own CPU's callback
1581 * can't possibly have run, so just block.
1582 *
1583 * We must block in order to guarantee the lifetime of "ctx".
1584 * (See comment in __smr_barrier_cb).
1585 */
1586 assert_wait(event: __smrb_event(ctx: &ctx), THREAD_UNINT);
1587 assert(ctx.smrb_pending > 0);
1588 splx(spl);
1589 thread_block(THREAD_CONTINUE_NULL);
1590
1591#if !SMR_BARRIER_USE_STACK
1592 kfree_type(struct smr_barrier_job, ctx.smrb_count, jobs);
1593#endif
1594}
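
/*
 * Sketch (illustrative): a teardown path for a subsystem that reclaims with
 * smr_call() against a hypothetical `my_smr` domain.
 *
 *     // stop publishing new objects first, then:
 *     smr_barrier(my_smr);   // every previously enqueued smr_call() has run
 *     // per-subsystem caches/zones can now be destroyed safely
 */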
1595
1596
1597#pragma mark SMR domains: smr_worker
1598
1599static void
1600__smr_worker_drain_lock(struct smr_worker *smrw)
1601{
1602 for (;;) {
1603 ml_set_interrupts_enabled(false);
1604 __smrw_lock(smrw);
1605
1606 /*
1607 * Check we are on an appropriate processor
1608 *
1609 * Note that we might be running on the canonical
1610 * processor incorrectly: if the processor has been
1611 * de-recommended but isn't offline.
1612 */
1613 if (__probable(current_processor() == smrw->processor)) {
1614 if (__probable(!smrw->detach_reason)) {
1615 break;
1616 }
1617 } else {
1618 if (__probable(smrw->detach_reason)) {
1619 break;
1620 }
1621 }
1622
1623 /* go bind in the right place and retry */
1624 thread_bind(processor: __smrw_drain_bind_target(smrw));
1625 __smrw_unlock(smrw);
1626 ml_set_interrupts_enabled(true);
1627 thread_block(THREAD_CONTINUE_NULL);
1628 }
1629}
1630
1631static void
1632__smr_worker_drain_unlock(struct smr_worker *smrw)
1633{
1634 __smrw_unlock(smrw);
1635 ml_set_interrupts_enabled(true);
1636}
1637
1638/*!
1639 * @function __smr_worker_tick
1640 *
1641 * @brief
1642 * Make the SMR worker queues make gentle progress
1643 *
1644 * @discussion
1645 * One round of progress will:
1646 * - move entries that have aged to the "old" stage,
1647 * - commit entries that have a deferred sequence and let them age.
1648 *
1649 * If this results in any callbacks becoming "old",
1650 * then the worker is woken up to start running callbacks.
1651 *
1652 * This function must run either on the processor for this worker,
1653 * or with the worker drain lock held.
1654 */
1655static void
1656__smr_worker_tick(struct smr_worker *smrw, uint64_t ctime, bool wakeup)
1657{
1658 smr_pcpu_t pcpu = *smrw->wold_tail;
1659
1660 __smr_worker_check_invariants(smrw);
1661
1662 for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) {
1663 assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1664 assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1665
1666 pcpu->qold_seq = pcpu->qage_seq;
1667 pcpu->qold_tail = pcpu->qage_tail;
1668
1669 pcpu->qage_seq = SMR_SEQ_INVALID;
1670 }
1671
1672 for (; pcpu; pcpu = pcpu->drain_next) {
1673 assert(pcpu->qold_seq == SMR_SEQ_INVALID);
1674 assert(pcpu->qage_seq != SMR_SEQ_INVALID);
1675
1676 smr_deferred_advance_commit(smr: pcpu->drain_smr, seq: pcpu->qage_seq);
1677 }
1678
1679 smrw->wold_tail = smrw->wage_tail;
1680 smrw->wage_tail = smrw->wcur_tail;
1681 smrw->drain_ctime = ctime;
1682
1683 __smr_worker_check_invariants(smrw);
1684
1685 if (wakeup && smrw->wold_tail != &smrw->whead) {
1686 __smrw_lock(smrw);
1687 __smrw_wakeup_and_unlock(smrw);
1688 }
1689}
1690
1691static void
1692__smr_worker_update_wold_tail(struct smr_worker *smrw, smr_pcpu_t *new_tail)
1693{
1694 smr_pcpu_t *old_tail = smrw->wold_tail;
1695
1696 if (smrw->wcur_tail == old_tail) {
1697 smrw->wage_tail = new_tail;
1698 smrw->wcur_tail = new_tail;
1699 } else if (smrw->wage_tail == old_tail) {
1700 smrw->wage_tail = new_tail;
1701 }
1702
1703 smrw->wold_tail = new_tail;
1704}
1705
1706static void
1707__smr_worker_drain_one(struct smr_worker *smrw, smr_pcpu_t pcpu)
1708{
1709 smr_t smr = pcpu->drain_smr;
1710 smr_seq_t seq = pcpu->qold_seq;
1711 smr_node_t head;
1712
1713 /*
1714 * Step 1: pop the "old" items,
1715 * (qold_tail/qold_seq left dangling)
1716 */
1717
1718 assert(seq != SMR_SEQ_INVALID);
1719 head = pcpu->qhead;
1720 pcpu->qhead = *pcpu->qold_tail;
1721 *pcpu->qold_tail = NULL;
1722
1723 /*
1724 * Step 2: Reconstruct the queue
1725 * based on the sequence numbers and count fields.
1726 *
1727 * Do what __smr_worker_tick() would do on this queue:
1728 * - commit the aging queue
1729 * - advance the current queue if needed
1730 */
1731
1732 if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1733 assert(pcpu->qage_tail != pcpu->qold_tail);
1734
1735 smr_deferred_advance_commit(smr, seq: pcpu->qage_seq);
1736 pcpu->qold_seq = pcpu->qage_seq;
1737 pcpu->qold_tail = pcpu->qage_tail;
1738 } else {
1739 assert(pcpu->qage_tail == pcpu->qold_tail);
1740
1741 pcpu->qold_seq = SMR_SEQ_INVALID;
1742 pcpu->qold_tail = &pcpu->qhead;
1743 }
1744
1745 if (__smr_call_should_advance(pcpu)) {
1746 __smr_call_advance_qcur(smr, pcpu, false);
1747 } else {
1748 pcpu->qage_seq = SMR_SEQ_INVALID;
1749 pcpu->qage_tail = pcpu->qold_tail;
1750 if (pcpu->qcur_cnt == 0) {
1751 pcpu->qcur_tail = pcpu->qage_tail;
1752 }
1753 }
1754
1755 if (pcpu->qold_seq != SMR_SEQ_INVALID) {
1756 /*
1757 * The node has gained an "old seq" back,
1758 * it goes to the ready queue.
1759 */
1760 pcpu->drain_next = *smrw->wold_tail;
1761 *smrw->wold_tail = pcpu;
1762 __smr_worker_update_wold_tail(smrw,
1763 new_tail: &pcpu->drain_next);
1764 } else if (pcpu->qage_seq != SMR_SEQ_INVALID) {
1765 /*
1766 * The node has gained an "age seq" back,
1767 * it needs to age and wait for a tick
1768 * for its sequence number to be committed.
1769 */
1770 pcpu->drain_next = NULL;
1771 *smrw->wcur_tail = pcpu;
1772 smrw->wcur_tail = &pcpu->drain_next;
1773 } else {
1774 /*
1775 * The node is empty or with "current"
1776 * callbacks only, it can be dequeued.
1777 */
1778 assert(!__smr_call_should_advance(pcpu));
1779 pcpu->__check_cpu = (uint16_t)cpu_number();
1780 pcpu->__check_reason = (uint8_t)smrw->detach_reason;
1781 pcpu->__check_list = 0;
1782 __smr_pcpu_set_not_queued(pcpu);
1783 }
1784
1785 /*
1786 * Step 3: drain callbacks.
1787 */
1788 __smr_worker_check_invariants(smrw);
1789 __smr_worker_drain_unlock(smrw);
1790
1791 __smr_poll(smr, goal: seq, true);
1792 __smr_call_drain(head);
1793
1794 __smr_worker_drain_lock(smrw);
1795}
1796
1797static void
1798__smr_worker_continue(void *arg, wait_result_t wr __unused)
1799{
1800 smr_pcpu_t pcpu = NULL, next = NULL;
1801 struct smr_worker *const smrw = arg;
1802 uint64_t deadline;
1803
1804 __smr_worker_drain_lock(smrw);
1805 __smr_worker_check_invariants(smrw);
1806
1807 if (smrw->wold_tail != &smrw->whead) {
1808 next = smrw->whead;
1809 smrw->whead = *smrw->wold_tail;
1810 *smrw->wold_tail = NULL;
1811 __smr_worker_update_wold_tail(smrw, new_tail: &smrw->whead);
1812 }
1813
1814 /*
1815 * The pipeline of per-cpu SMR data structures with pending
1816 * smr_call() callbacks has three stages: wcur -> wage -> wold.
1817 *
1818 * In order to guarantee forward progress, a tick happens
1819 * for each of them, either via __smr_worker_tick(),
1820 * or via __smr_worker_drain_one().
1821 *
1822 * The second tick will happen either because the core stayed
1823 * busy enough that a subsequent smr_cpu_tick() decided to
1824 * perform it, or because the CPU idled, and smr_cpu_leave()
1825 * will perform an unconditional __smr_worker_tick().
1826 */
1827 __smr_barrier_drain(smrw, false);
1828 __smr_worker_tick(smrw, ctime: mach_absolute_time(), false);
1829
1830 while ((pcpu = next)) {
1831 next = next->drain_next;
1832 __smr_worker_drain_one(smrw, pcpu);
1833 }
1834
1835 if (__improbable(smrw->whead && smrw->detach_reason)) {
1836 /*
1837 * If the thread isn't bound, we want to flush anything
1838 * that is pending without causing too much contention.
1839 *
1840 * Sleep for a bit in order to give the system time
1841 * to observe any advance commits we did.
1842 */
1843 deadline = mach_absolute_time() + cpu_checkin_min_interval;
1844 } else {
1845 deadline = TIMEOUT_WAIT_FOREVER;
1846 }
1847 waitq_assert_wait64_locked(waitq: &smrw->waitq, wait_event: __smrw_drain_event(smrw),
1848 THREAD_UNINT, TIMEOUT_URGENCY_SYS_NORMAL, deadline,
1849 TIMEOUT_NO_LEEWAY, thread: smrw->thread);
1850
1851 /*
1852 * Make sure there's no barrier left, after we called assert_wait()
1853 * in order to pair with __smr_barrier_cb(). If we do find some,
1854 * we must be careful about invariants and forward progress.
1855 *
1856 * For affected domains, the dequeued barriers have been added
1857 * to their "qage" queue. If their "qage" queue was non empty,
1858 * then its "qage_seq" was already commited, and we must preserve
1859 * this invariant.
1860 *
1861 * Affected domains that were idle before will get enqueued on this
1862 * worker's "wcur" queue. In order to guarantee forward progress,
1863 * we must force a tick if both the "wage" and "wold" queues
1864 * of the worker are empty.
1865 */
1866 if (__improbable(__smr_barrier_drain(smrw, true))) {
1867 if (smrw->wage_tail == &smrw->whead) {
1868 __smr_worker_tick(smrw, ctime: mach_absolute_time(), false);
1869 }
1870 }
1871
1872 __smr_worker_check_invariants(smrw);
1873 __smr_worker_drain_unlock(smrw);
1874
1875 thread_block_parameter(continuation: __smr_worker_continue, parameter: smrw);
1876}
1877
1878
1879#pragma mark SMR domains: scheduler integration
1880
1881#if CONFIG_QUIESCE_COUNTER
1882__startup_data
1883static uint64_t _Atomic quiesce_gen_startup;
1884static uint64_t _Atomic *quiesce_genp = &quiesce_gen_startup;
1885static uint64_t _Atomic quiesce_ctime;
1886
1887void
1888cpu_quiescent_set_storage(uint64_t _Atomic *ptr)
1889{
1890 /*
1891 * Transfer to the real location for the commpage.
1892 *
1893 * This is OK to do like this because the system
1894 * is still single-threaded.
1895 */
1896 uint64_t gen = os_atomic_load(&quiesce_gen_startup, relaxed);
1897
1898 os_atomic_store(ptr, gen, relaxed);
1899 quiesce_genp = ptr;
1900}
1901
1902static smr_seq_t
1903cpu_quiescent_gen_to_seq(uint64_t gen)
1904{
1905 return gen * SMR_SEQ_INC + SMR_SEQ_INIT;
1906}
1907
1908static void
1909cpu_quiescent_advance(uint64_t gen, uint64_t ctime __kdebug_only)
1910{
1911 smr_seq_t seq = cpu_quiescent_gen_to_seq(gen);
1912
1913 os_atomic_thread_fence(seq_cst);
1914
1915 percpu_foreach(it, smr_worker) {
1916 smr_seq_t rd_seq = os_atomic_load(&it->rd_quiesce_seq, relaxed);
1917
1918 if (rd_seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(rd_seq, <, seq)) {
1919 return;
1920 }
1921 }
1922
1923 os_atomic_thread_fence(seq_cst);
1924
1925 if (lock_cmpxchg(quiesce_genp, gen, gen + 1, relaxed)) {
1926 KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER),
1927 gen, 0, ctime, 0);
1928 }
1929}
1930
1931static void
1932cpu_quiescent_join(struct smr_worker *smrw)
1933{
1934 uint64_t gen = os_atomic_load(quiesce_genp, relaxed);
1935
1936 assert(smrw->rd_quiesce_seq == SMR_SEQ_INVALID);
1937 os_atomic_store(&smrw->rd_quiesce_seq,
1938 cpu_quiescent_gen_to_seq(gen), relaxed);
1939 os_atomic_thread_fence(seq_cst);
1940}
1941
1942static void
1943cpu_quiescent_tick(struct smr_worker *smrw, uint64_t ctime, uint64_t interval)
1944{
1945 uint64_t gen = os_atomic_load(quiesce_genp, relaxed);
1946 smr_seq_t seq = cpu_quiescent_gen_to_seq(gen);
1947
1948 if (smrw->rd_quiesce_seq == SMR_SEQ_INVALID) {
1949 /*
1950 * Likely called because of the scheduler tick;
1951 * smr_maintenance() will do the right thing.
1952 */
1953 assert(current_processor()->state != PROCESSOR_RUNNING);
1954 } else if (seq != smrw->rd_quiesce_seq) {
1955 /*
1956 * Someone managed to update the sequence already:
1957 * learn it and update our ctime.
1958 */
1959 os_atomic_store(&smrw->rd_quiesce_seq, seq, release);
1960 os_atomic_store(&quiesce_ctime, ctime, relaxed);
1961 os_atomic_thread_fence(seq_cst);
1962 } else if ((ctime - os_atomic_load(&quiesce_ctime, relaxed)) > interval) {
1963 /*
1964 * The system looks busy enough that we want to update
1965 * the counter more often than every scheduler tick.
1966 */
1967 os_atomic_store(&quiesce_ctime, ctime, relaxed);
1968 cpu_quiescent_advance(gen, ctime);
1969 }
1970}
1971
1972static void
1973cpu_quiescent_leave(struct smr_worker *smrw)
1974{
1975 assert(smrw->rd_quiesce_seq != SMR_SEQ_INVALID);
1976 os_atomic_store(&smrw->rd_quiesce_seq, SMR_SEQ_INVALID, release);
1977}
1978#endif /* CONFIG_QUIESCE_COUNTER */
1979
1980uint32_t
1981smr_cpu_checkin_get_min_interval_us(void)
1982{
1983 return cpu_checkin_min_interval_us;
1984}
1985
1986void
1987smr_cpu_checkin_set_min_interval_us(uint32_t new_value_us)
1988{
1989 /* clamp to something vaguely sane */
1990 if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) {
1991 new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
1992 }
1993
1994 cpu_checkin_min_interval_us = new_value_us;
1995
1996 uint64_t abstime = 0;
1997 clock_interval_to_absolutetime_interval(interval: cpu_checkin_min_interval_us,
1998 NSEC_PER_USEC, result: &abstime);
1999 cpu_checkin_min_interval = abstime;
2000}
2001
2002__startup_func
2003static void
2004smr_cpu_checkin_init_min_interval_us(void)
2005{
2006 smr_cpu_checkin_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
2007}
2008STARTUP(TUNABLES, STARTUP_RANK_FIRST, smr_cpu_checkin_init_min_interval_us);
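
/*
 * Illustrative use of the tunable above (sketch, not part of the build;
 * assumes the requested value is below CPU_CHECKIN_MIN_INTERVAL_MAX_US
 * so that it is not clamped):
 *
 *     smr_cpu_checkin_set_min_interval_us(100);
 *     assert(smr_cpu_checkin_get_min_interval_us() == 100);
 */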
2009
2010static void
2011__smr_cpu_init_thread(struct smr_worker *smrw)
2012{
2013 char name[MAXTHREADNAMESIZE];
2014 thread_t th = THREAD_NULL;
2015
2016 kernel_thread_create(continuation: __smr_worker_continue, parameter: smrw, MINPRI_KERNEL, new_thread: &th);
2017 smrw->thread = th;
2018
2019 snprintf(name, sizeof(name), "smr.reclaim:%d", smrw->processor->cpu_id);
2020 thread_set_thread_name(th, name);
2021 thread_start_in_assert_wait(thread: th,
2022 waitq: &smrw->waitq, event: __smrw_drain_event(smrw), THREAD_UNINT);
2023}
2024
2025void
2026smr_cpu_init(struct processor *processor)
2027{
2028 struct smr_worker *smrw;
2029
2030 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2031 smrw->processor = processor;
2032
2033 waitq_init(waitq: &smrw->waitq, type: WQT_QUEUE, SYNC_POLICY_FIFO);
2034 smrw->detach_reason = SMR_CPU_REASON_OFFLINE;
2035
2036 smrq_init(&smrw->sect_queue);
2037 smrw->wold_tail = &smrw->whead;
2038 smrw->wage_tail = &smrw->whead;
2039 smrw->wcur_tail = &smrw->whead;
2040 mpsc_queue_init(q: &smrw->barrier_queue);
2041
2042 if (processor != master_processor) {
2043 __smr_cpu_init_thread(smrw);
2044 }
2045}
2046STARTUP_ARG(LOCKS, STARTUP_RANK_LAST, smr_cpu_init, master_processor);
2047STARTUP_ARG(THREAD_CALL, STARTUP_RANK_LAST,
2048 __smr_cpu_init_thread, PERCPU_GET_MASTER(smr_worker));
2049
2050/*!
2051 * @function smr_cpu_up()
2052 *
2053 * @brief
2054 * Scheduler callback to notify that this processor is going up.
2055 *
2056 * @discussion
2057 * Called at splsched() under the sched_available_cores_lock.
2058 */
2059void
2060smr_cpu_up(struct processor *processor, smr_cpu_reason_t reason)
2061{
2062 struct smr_worker *smrw;
2063
2064 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2065
2066 __smrw_lock(smrw);
2067 if (reason != SMR_CPU_REASON_IGNORED) {
2068 assert((smrw->detach_reason & reason) == reason);
2069 }
2070 smrw->detach_reason &= ~reason;
2071 __smrw_unlock(smrw);
2072}
2073
2074static void
2075__smr_cpu_down_and_unlock(
2076 struct processor *processor,
2077 struct smr_worker *smrw,
2078 smr_cpu_reason_t reason)
2079{
2080 bool detach = !smrw->detach_reason;
2081
2082 /*
2083 * When reason is SMR_CPU_REASON_IGNORED,
2084 * this is called from smr_cpu_leave() on the way to idle.
2085 *
2086 * However, this isn't synchronized with the recommendation
2087 * lock, hence it is possible that the CPU might actually
2088 * be recommended again while we're on the way to idle.
2089 *
2090 * By re-checking processor recommendation under
2091 * the __smrw_lock, we serialize with smr_cpu_up().
2092 */
2093 if (reason != SMR_CPU_REASON_IGNORED) {
2094 assert((smrw->detach_reason & reason) == 0);
2095 } else if (processor->is_recommended) {
2096 /*
2097 * The race we were trying to detect happened;
2098 * do nothing.
2099 */
2100 reason = SMR_CPU_REASON_NONE;
2101 detach = false;
2102 }
2103 smrw->detach_reason |= reason;
2104 reason = smrw->detach_reason;
2105
2106 if (detach && smrw->whead) {
2107 detach = !__smrw_wakeup_and_unlock(smrw);
2108 } else {
2109 __smrw_unlock(smrw);
2110 }
2111
2112 if (detach) {
2113 thread_unbind_after_queue_shutdown(thread: smrw->thread, processor);
2114 }
2115}
2116
2117/*!
2118 * @function smr_cpu_down()
2119 *
2120 * @brief
2121 * Scheduler callback to notify that this processor is going down.
2122 *
2123 * @discussion
2124 * Called at splsched() when the processor run queue is being shut down.
2125 */
2126void
2127smr_cpu_down(struct processor *processor, smr_cpu_reason_t reason)
2128{
2129 struct smr_worker *smrw;
2130
2131 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2132
2133 __smrw_lock(smrw);
2134 __smr_cpu_down_and_unlock(processor, smrw, reason);
2135}
2136
2137
2138/*!
2139 * @function smr_cpu_join()
2140 *
2141 * @brief
2142 * Scheduler callback to notify that this processor is going out of idle.
2143 *
2144 * @discussion
2145 * Called at splsched().
2146 */
2147void
2148smr_cpu_join(struct processor *processor, uint64_t ctime __unused)
2149{
2150#if CONFIG_QUIESCE_COUNTER
2151 struct smr_worker *smrw;
2152
2153 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2154 cpu_quiescent_join(smrw);
2155#else
2156 (void)processor;
2157#endif /* CONFIG_QUIESCE_COUNTER */
2158}
2159
2160/*!
2161 * @function smr_cpu_tick()
2162 *
2163 * @brief
2164 * Scheduler callback invoked during the scheduler maintenance routine.
2165 *
2166 * @discussion
2167 * Called at splsched().
2168 */
2169void
2170smr_cpu_tick(uint64_t ctime, bool safe_point)
2171{
2172 struct smr_worker *smrw = PERCPU_GET(smr_worker);
2173 uint64_t interval = cpu_checkin_min_interval;
2174
2175#if CONFIG_QUIESCE_COUNTER
2176 cpu_quiescent_tick(smrw, ctime, interval);
2177#endif /* CONFIG_QUIESCE_COUNTER */
2178
2179 /*
2180 * if a bound thread was woken up on a derecommended core,
2181 * our detach_reason might be "IGNORED" and we want to leave
2182 * it alone in that case
2183 */
2184 if (safe_point && !smrw->detach_reason && smrw->whead &&
2185 current_processor()->state == PROCESSOR_RUNNING &&
2186 (ctime - smrw->drain_ctime) > interval) {
2187 __smr_worker_tick(smrw, ctime, true);
2188 }
2189}
2190
2191/*!
2192 * @function smr_cpu_leave()
2193 *
2194 * @brief
2195 * Scheduler callback to notify that this processor is going idle.
2196 *
2197 * @discussion
2198 * Called at splsched().
2199 */
2200void
2201smr_cpu_leave(struct processor *processor, uint64_t ctime)
2202{
2203 struct smr_worker *smrw;
2204
2205 smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor);
2206
2207 /*
2208 * if a bound thread was woken up on a derecommended core,
2209 * our detach_reason might be "IGNORED" and we want to leave
2210 * it alone in that case
2211 *
2212 * See comment in __smr_worker_continue for why this must be
2213 * done unconditionally otherwise.
2214 */
2215 if (!smrw->detach_reason && smrw->whead) {
2216 __smr_worker_tick(smrw, ctime, true);
2217 }
2218
2219 if (__improbable(!processor->is_recommended)) {
2220 __smrw_lock(smrw);
2221 __smr_cpu_down_and_unlock(processor, smrw, reason: SMR_CPU_REASON_IGNORED);
2222 }
2223
2224#if CONFIG_QUIESCE_COUNTER
2225 cpu_quiescent_leave(smrw);
2226#endif /* CONFIG_QUIESCE_COUNTER */
2227}
2228
2229/*!
2230 * @function smr_maintenance()
2231 *
2232 * @brief
2233 * Scheduler callback invoked at the scheduler tick.
2234 *
2235 * @discussion
2236 * Called at splsched().
2237 */
2238void
2239smr_maintenance(uint64_t ctime)
2240{
2241#if CONFIG_QUIESCE_COUNTER
2242 cpu_quiescent_advance(os_atomic_load(quiesce_genp, relaxed), ctime);
2243#else
2244 (void)ctime;
2245#endif /* CONFIG_QUIESCE_COUNTER */
2246}
2247
2248
2249#pragma mark - SMR hash tables
2250
2251static struct smrq_slist_head *
2252smr_hash_alloc_array(size_t size)
2253{
2254 return kalloc_type(struct smrq_slist_head, size,
2255 Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2256}
2257
2258static void
2259smr_hash_free_array(struct smrq_slist_head *array, size_t size)
2260{
2261 kfree_type(struct smrq_slist_head, size, array);
2262}
2263
2264static inline uintptr_t
2265smr_hash_array_encode(struct smrq_slist_head *array, uint16_t order)
2266{
2267 uintptr_t ptr;
2268
2269 ptr = (uintptr_t)array;
2270 ptr &= ~SMRH_ARRAY_ORDER_MASK;
2271 ptr |= (uintptr_t)order << SMRH_ARRAY_ORDER_SHIFT;
2272
2273 return ptr;
2274}
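
/*
 * For example, a table with 128 buckets has shift 7 and thus order
 * 64 - 7 = 57; smr_hash_array_encode() packs that order into the
 * SMRH_ARRAY_ORDER_MASK bits of the array pointer, and
 * smr_hash_array_decode() recovers both the pointer and the order.
 */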
2275
2276#pragma mark SMR simple hash tables
2277
2278void
2279smr_hash_init(struct smr_hash *smrh, size_t size)
2280{
2281 struct smrq_slist_head *array;
2282 uint16_t shift;
2283
2284 assert(size);
2285 shift = (uint16_t)flsll(mask: size - 1);
2286 size = 1UL << shift;
2287 if (startup_phase >= STARTUP_SUB_LOCKDOWN) {
2288 assert(size * sizeof(struct smrq_slist_head) <=
2289 KALLOC_SAFE_ALLOC_SIZE);
2290 }
2291 array = smr_hash_alloc_array(size);
2292
2293 *smrh = (struct smr_hash){
2294 .smrh_array = smr_hash_array_encode(array, order: 64 - shift),
2295 };
2296}
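
#if 0
/*
 * Minimal usage sketch (illustrative only, not compiled): sizes are
 * rounded up to a power of two, so asking for 100 buckets yields 128.
 * Population and lookups go through the smrh_traits based accessors
 * and are not shown here.
 */
static void
smr_hash_usage_example(void)
{
	struct smr_hash h;

	smr_hash_init(&h, 100);
	/* ... insert and look up elements via the smrh_traits accessors ... */
	smr_hash_destroy(&h);
}
#endif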
2297
2298void
2299smr_hash_destroy(struct smr_hash *smrh)
2300{
2301 struct smr_hash_array array = smr_hash_array_decode(smrh);
2302
2303 smr_hash_free_array(array: array.smrh_array, size: smr_hash_size(array));
2304 *smrh = (struct smr_hash){ };
2305}
2306
2307void
2308__smr_hash_serialized_clear(
2309 struct smr_hash *smrh,
2310 smrh_traits_t smrht,
2311 void (^free)(void *obj))
2312{
2313 struct smr_hash_array array = smr_hash_array_decode(smrh);
2314
2315 for (size_t i = 0; i < smr_hash_size(array); i++) {
2316 struct smrq_slink *link;
2317 __smrq_slink_t *prev;
2318
2319 prev = &array.smrh_array[i].first;
2320 while ((link = smr_serialized_load(prev))) {
2321 prev = &link->next;
2322 free(__smrht_link_to_obj(traits: smrht, link));
2323 }
2324
2325 smr_clear_store(&array.smrh_array[i].first);
2326 }
2327
2328 smrh->smrh_count = 0;
2329}
2330
2331kern_return_t
2332__smr_hash_shrink_and_unlock(
2333 struct smr_hash *smrh,
2334 lck_mtx_t *lock,
2335 smrh_traits_t smrht)
2336{
2337 struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2338 struct smrq_slist_head *newarray, *oldarray;
2339 uint16_t neworder = decptr.smrh_order + 1;
2340 size_t oldsize = smr_hash_size(array: decptr);
2341 size_t newsize = oldsize / 2;
2342
2343 assert(newsize);
2344
2345 if (os_atomic_load(&smrh->smrh_resizing, relaxed)) {
2346 lck_mtx_unlock(lck: lock);
2347 return KERN_FAILURE;
2348 }
2349
2350 os_atomic_store(&smrh->smrh_resizing, true, relaxed);
2351 lck_mtx_unlock(lck: lock);
2352
2353 newarray = smr_hash_alloc_array(size: newsize);
2354 if (newarray == NULL) {
2355 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2356 return KERN_RESOURCE_SHORTAGE;
2357 }
2358
2359 lck_mtx_lock(lck: lock);
2360
2361 /*
2362 * Step 1: collapse all the chains in pairs.
2363 */
2364 oldarray = decptr.smrh_array;
2365
2366 for (size_t i = 0; i < newsize; i++) {
2367 newarray[i] = oldarray[i];
2368 smrq_serialized_append(&newarray[i], &oldarray[i + newsize]);
2369 }
2370
2371 /*
2372 * Step 2: publish the new array.
2373 */
2374 os_atomic_store(&smrh->smrh_array,
2375 smr_hash_array_encode(newarray, neworder), release);
2376
2377 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2378
2379 lck_mtx_unlock(lck: lock);
2380
2381 /*
2382 * Step 3: free the old array once readers can't observe the old values.
2383 */
2384 smr_synchronize(smr: smrht->domain);
2385
2386 smr_hash_free_array(array: oldarray, size: oldsize);
2387 return KERN_SUCCESS;
2388}
2389
2390kern_return_t
2391__smr_hash_grow_and_unlock(
2392 struct smr_hash *smrh,
2393 lck_mtx_t *lock,
2394 smrh_traits_t smrht)
2395{
2396 struct smr_hash_array decptr = smr_hash_array_decode(smrh);
2397 struct smrq_slist_head *newarray, *oldarray;
2398 __smrq_slink_t **prevarray;
2399 uint16_t neworder = decptr.smrh_order - 1;
2400 size_t oldsize = smr_hash_size(array: decptr);
2401 size_t newsize = 2 * oldsize;
2402 bool needs_another_round = false;
2403
2404 if (smrh->smrh_resizing) {
2405 lck_mtx_unlock(lck: lock);
2406 return KERN_FAILURE;
2407 }
2408
2409 smrh->smrh_resizing = true;
2410 lck_mtx_unlock(lck: lock);
2411
2412 newarray = smr_hash_alloc_array(size: newsize);
2413 if (newarray == NULL) {
2414 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2415 return KERN_RESOURCE_SHORTAGE;
2416 }
2417
2418 prevarray = kalloc_type(__smrq_slink_t *, newsize,
2419 Z_WAITOK | Z_ZERO | Z_SPRAYQTN);
2420 if (prevarray == NULL) {
2421 smr_hash_free_array(array: newarray, size: newsize);
2422 os_atomic_store(&smrh->smrh_resizing, false, relaxed);
2423 return KERN_RESOURCE_SHORTAGE;
2424 }
2425
2426
2427 lck_mtx_lock(lck: lock);
2428
2429 /*
2430 * Step 1: create a duplicated array with twice as many heads.
2431 */
2432 oldarray = decptr.smrh_array;
2433
2434 memcpy(dst: newarray, src: oldarray, n: oldsize * sizeof(newarray[0]));
2435 memcpy(dst: newarray + oldsize, src: oldarray, n: oldsize * sizeof(newarray[0]));
2436
2437 /*
2438 * Step 2: Publish the new array, and wait for readers to observe it
2439 * before we do any change.
2440 */
2441 os_atomic_store(&smrh->smrh_array,
2442 smr_hash_array_encode(newarray, neworder), release);
2443
2444 smr_synchronize(smr: smrht->domain);
2445
2446
2447 /*
2448 * Step 3: split the lists.
2449 */
2450
2451 /*
2452 * If the list we are trying to split looked like this,
2453 * where L elements will go to the "left" bucket and "R"
2454 * to the right one:
2455 *
2456 * old_head --> L1 --> L2 -> L5
2457 * \ / \
2458 * -> R3 --> R4 -> R6 --> NULL
2459 *
2460 * Then make sure the new heads point to their legitimate first element,
2461 * leading to this state:
2462 *
2463 * l_head --> L1 --> L2 -> L5
2464 * \ / \
2465 * r_head ----------------> R3 --> R4 -> R6 --> NULL
2466 *
2467 *
2468 * prevarray[left] = &L2->next
2469 * prevarray[right] = &r_head
2470 * oldarray[old] = L2
2471 */
2472
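	/*
	 * Each old chain is walked from its head up to the last
	 * consecutive element hashing to the same half ("want_mask")
	 * as the head; everything past that point belongs to the other
	 * bucket, and prevarray[] records where the next round of
	 * splitting must resume.
	 */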
2473 for (size_t i = 0; i < oldsize; i++) {
2474 struct smrq_slink *link, *next;
2475 uint32_t want_mask;
2476
2477 link = smr_serialized_load(&oldarray[i].first);
2478 if (link == NULL) {
2479 continue;
2480 }
2481
2482 want_mask = smrht->obj_hash(link, 0) & oldsize;
2483 while ((next = smr_serialized_load(&link->next)) &&
2484 (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2485 link = next;
2486 }
2487
2488 if (want_mask == 0) {
2489 /* elements seen go to the "left" bucket */
2490 prevarray[i] = &link->next;
2491 prevarray[i + oldsize] = &newarray[i + oldsize].first;
2492 smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2493 } else {
2494 /* elements seen go to the "right" bucket */
2495 prevarray[i] = &newarray[i].first;
2496 prevarray[i + oldsize] = &link->next;
2497 smr_serialized_store_relaxed(prevarray[i], next);
2498 }
2499
2500 smr_serialized_store_relaxed(&oldarray[i].first,
2501 next ? link : NULL);
2502
2503 needs_another_round |= (next != NULL);
2504 }
2505
2506 /*
2507 * At this point, before splitting any further, we must wait
2508 * for readers to observe the previous state.
2509 * Indeed, reusing the example above, the next
2510 * round of splitting would end up with this:
2511 *
2512 * l_head --> L1 --> L2 ----------------> L5
2513 * / \
2514 * r_head ----------------> R3 --> R4 -> R6 --> NULL
2515 *
2516 *
2517 * prevarray[left] = &L2->next
2518 * prevarray[right] = &R4->next
2519 * oldarray[old] = R4
2520 *
2521 * But we must be sure that no readers can observe r_head
2522 * having been L1, otherwise a stale reader might skip over
2523 * R3/R4.
2524 *
2525 * Generally speaking we need to do that each time we do a round
2526 * of splitting that isn't terminating the list with NULL.
2527 */
2528
2529 while (needs_another_round) {
2530 smr_synchronize(smr: smrht->domain);
2531
2532 needs_another_round = false;
2533
2534 for (size_t i = 0; i < oldsize; i++) {
2535 struct smrq_slink *link, *next;
2536 uint32_t want_mask;
2537
2538 link = smr_serialized_load(&oldarray[i].first);
2539 if (link == NULL) {
2540 continue;
2541 }
2542
2543 /*
2544 * If `prevarray[i]` (left) points to the linkage
2545 * we stopped at, then it means the next element
2546 * will be "to the right" and vice versa.
2547 *
2548 * We also already know "next" exists, so only probe
2549 * after it.
2550 */
2551 if (prevarray[i] == &link->next) {
2552 want_mask = (uint32_t)oldsize;
2553 } else {
2554 want_mask = 0;
2555 }
2556
2557 link = smr_serialized_load(&link->next);
2558
2559 while ((next = smr_serialized_load(&link->next)) &&
2560 (smrht->obj_hash(next, 0) & oldsize) == want_mask) {
2561 link = next;
2562 }
2563
2564 if (want_mask == 0) {
2565 /* elements seen go to the "left" bucket */
2566 prevarray[i] = &link->next;
2567 smr_serialized_store_relaxed(prevarray[i + oldsize], next);
2568 } else {
2569 /* elements seen go to the "right" bucket */
2570 smr_serialized_store_relaxed(prevarray[i], next);
2571 prevarray[i + oldsize] = &link->next;
2572 }
2573
2574 smr_serialized_store_relaxed(&oldarray[i].first,
2575 next ? link : NULL);
2576
2577 needs_another_round |= (next != NULL);
2578 }
2579 }
2580
2581 smrh->smrh_resizing = false;
2582 lck_mtx_unlock(lck: lock);
2583
2584 /*
2585 * Step 4: cleanup. There is no need to wait for readers, as this
2586 * already happened at least once during splitting.
2587 */
2588 smr_hash_free_array(array: oldarray, size: oldsize);
2589 kfree_type(__smrq_slink_t *, newsize, prevarray);
2590 return KERN_SUCCESS;
2591}
2592
2593#pragma mark SMR scalable hash tables
2594
2595#define SMRSH_MIGRATED ((struct smrq_slink *)SMRSH_BUCKET_STOP_BIT)
2596static LCK_GRP_DECLARE(smr_shash_grp, "smr_shash");
2597
2598static inline size_t
2599__smr_shash_min_size(struct smr_shash *smrh)
2600{
2601 return 1ul << smrh->smrsh_min_shift;
2602}
2603
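/*
 * Bucket counts are derived from a 32-bit shift: size = 2^(32 - shift).
 * For example, a shift of 28 yields (~0u >> 28) + 1 == 16 buckets.
 */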
2604static inline size_t
2605__smr_shash_size_for_shift(uint8_t shift)
2606{
2607 return (~0u >> shift) + 1;
2608}
2609
2610static inline size_t
2611__smr_shash_cursize(smrsh_state_t state)
2612{
2613 return __smr_shash_size_for_shift(shift: state.curshift);
2614}
2615
2616static void
2617__smr_shash_bucket_init(hw_lck_ptr_t *head)
2618{
2619 hw_lck_ptr_init(head, __smr_shash_bucket_stop(head), &smr_shash_grp);
2620}
2621
2622static void
2623__smr_shash_bucket_destroy(hw_lck_ptr_t *head)
2624{
2625 hw_lck_ptr_destroy(head, &smr_shash_grp);
2626}
2627
2628__attribute__((noinline))
2629void *
2630__smr_shash_entered_find_slow(
2631 const struct smr_shash *smrh,
2632 smrh_key_t key,
2633 hw_lck_ptr_t *head,
2634 smrh_traits_t traits)
2635{
2636 struct smrq_slink *link;
2637 smrsh_state_t state;
2638 uint32_t hash;
2639
2640	/* wait for this bucket's elements to have been rehashed into their target buckets */
2641 hw_lck_ptr_wait_for_value(head, SMRSH_MIGRATED, &smr_shash_grp);
2642
2643 state = os_atomic_load(&smrh->smrsh_state, dependency);
2644 hash = __smr_shash_hash(smrh, idx: state.newidx, key, traits);
2645 head = __smr_shash_bucket(smrh, state, sel: SMRSH_NEW, hash);
2646
2647 link = hw_lck_ptr_value(lck: head);
2648 while (!__smr_shash_is_stop(link)) {
2649 if (traits->obj_equ(link, key)) {
2650 return __smrht_link_to_obj(traits, link);
2651 }
2652 link = smr_entered_load(&link->next);
2653 }
2654
2655 assert(link == __smr_shash_bucket_stop(head));
2656 return NULL;
2657}
2658
2659static const uint8_t __smr_shash_grow_ratio[] = {
2660 [SMRSH_COMPACT] = 6,
2661 [SMRSH_BALANCED] = 4,
2662 [SMRSH_BALANCED_NOSHRINK] = 4,
2663 [SMRSH_FASTEST] = 2,
2664};
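
/*
 * For example, a SMRSH_BALANCED table with 1024 buckets is only grown
 * once it holds more than 4 * 1024 elements, while a SMRSH_FASTEST
 * table already grows past 2 * 1024 elements.
 */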
2665
2666static inline uint64_t
2667__smr_shash_count(struct smr_shash *smrh)
2668{
2669 int64_t count = (int64_t)counter_load(&smrh->smrsh_count);
2670
2671 /*
2672 * A negative value makes no sense and is likely due to some
2673 * stale values being read.
2674 */
2675 return count < 0 ? 0ull : (uint64_t)count;
2676}
2677
2678static inline bool
2679__smr_shash_should_grow(
2680 struct smr_shash *smrh,
2681 smrsh_state_t state,
2682 uint64_t count)
2683{
2684 size_t size = __smr_shash_cursize(state);
2685
2686 /* grow if elem:bucket ratio is worse than grow_ratio:1 */
2687 return count > __smr_shash_grow_ratio[smrh->smrsh_policy] * size;
2688}
2689
2690static inline bool
2691__smr_shash_should_reseed(
2692 struct smr_shash *smrh,
2693 size_t observed_depth)
2694{
2695 return observed_depth > 10 * __smr_shash_grow_ratio[smrh->smrsh_policy];
2696}
2697
2698static inline bool
2699__smr_shash_should_shrink(
2700 struct smr_shash *smrh,
2701 smrsh_state_t state,
2702 uint64_t count)
2703{
2704 size_t size = __smr_shash_cursize(state);
2705
2706 switch (smrh->smrsh_policy) {
2707 case SMRSH_COMPACT:
2708 /* shrink if bucket:elem ratio is worse than 1:1 */
2709 return size > count && size > __smr_shash_min_size(smrh);
2710 case SMRSH_BALANCED:
2711 /* shrink if bucket:elem ratio is worse than 2:1 */
2712 return size > 2 * count && size > __smr_shash_min_size(smrh);
2713 case SMRSH_BALANCED_NOSHRINK:
2714 case SMRSH_FASTEST:
2715 return false;
2716 }
2717}
2718
2719static inline void
2720__smr_shash_schedule_rehash(
2721 struct smr_shash *smrh,
2722 smrh_traits_t traits,
2723 smrsh_rehash_t reason)
2724{
2725 smrsh_rehash_t rehash;
2726
2727 rehash = os_atomic_load(&smrh->smrsh_rehashing, relaxed);
2728 if (rehash & reason) {
2729 return;
2730 }
2731
2732 rehash = os_atomic_or_orig(&smrh->smrsh_rehashing, reason, relaxed);
2733 if (!rehash) {
2734 thread_call_enter1(call: smrh->smrsh_callout,
2735 __DECONST(void *, traits));
2736 }
2737}
2738
2739void *
2740__smr_shash_entered_get_or_insert(
2741 struct smr_shash *smrh,
2742 smrh_key_t key,
2743 struct smrq_slink *link,
2744 smrh_traits_t traits)
2745{
2746 struct smrq_slink *first;
2747 struct smrq_slink *other;
2748 uint32_t hash, depth;
2749 smrsh_state_t state;
2750 hw_lck_ptr_t *head;
2751 void *obj;
2752
2753 state = os_atomic_load(&smrh->smrsh_state, dependency);
2754 hash = __smr_shash_hash(smrh, idx: state.curidx, key, traits);
2755 head = __smr_shash_bucket(smrh, state, sel: SMRSH_CUR, hash);
2756 first = hw_lck_ptr_lock(head, &smr_shash_grp);
2757
2758 if (__improbable(first == SMRSH_MIGRATED)) {
2759 hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2760
2761 state = os_atomic_load(&smrh->smrsh_state, dependency);
2762 hash = __smr_shash_hash(smrh, idx: state.newidx, key, traits);
2763 head = __smr_shash_bucket(smrh, state, sel: SMRSH_NEW, hash);
2764 first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2765 }
2766
2767 depth = 0;
2768 other = first;
2769 while (!__smr_shash_is_stop(link: other)) {
2770 depth++;
2771 if (traits->obj_equ(other, key)) {
2772 obj = __smrht_link_to_obj(traits, link: other);
2773 if (traits->obj_try_get(obj)) {
2774 hw_lck_ptr_unlock(head, first,
2775 &smr_shash_grp);
2776 return obj;
2777 }
2778 break;
2779 }
2780 other = smr_serialized_load(&other->next);
2781 }
2782
2783 counter_inc_preemption_disabled(&smrh->smrsh_count);
2784 smr_serialized_store_relaxed(&link->next, first);
2785 hw_lck_ptr_unlock(head, link, &smr_shash_grp);
2786
2787 if (__smr_shash_should_reseed(smrh, observed_depth: depth)) {
2788 __smr_shash_schedule_rehash(smrh, traits, reason: SMRSH_REHASH_RESEED);
2789 } else if (depth * 2 >= __smr_shash_grow_ratio[smrh->smrsh_policy] &&
2790 __smr_shash_should_grow(smrh, state, count: __smr_shash_count(smrh))) {
2791 __smr_shash_schedule_rehash(smrh, traits, reason: SMRSH_REHASH_GROW);
2792 }
2793 return NULL;
2794}
2795
2796__abortlike
2797static void
2798__smr_shash_missing_elt_panic(
2799 struct smr_shash *smrh,
2800 struct smrq_slink *link,
2801 smrh_traits_t traits)
2802{
2803 panic("Unable to find item %p (linkage %p) in %p (traits %p)",
2804 __smrht_link_to_obj(traits, link), link, smrh, traits);
2805}
2806
2807smr_shash_mut_cursor_t
2808__smr_shash_entered_mut_begin(
2809 struct smr_shash *smrh,
2810 struct smrq_slink *link,
2811 smrh_traits_t traits)
2812{
2813 struct smrq_slink *first, *next;
2814 __smrq_slink_t *prev;
2815 smrsh_state_t state;
2816 hw_lck_ptr_t *head;
2817 uint32_t hash;
2818
2819 state = os_atomic_load(&smrh->smrsh_state, dependency);
2820 hash = __smr_shash_hash(smrh, idx: state.curidx, link, traits);
2821 head = __smr_shash_bucket(smrh, state, sel: SMRSH_CUR, hash);
2822 first = hw_lck_ptr_lock(head, &smr_shash_grp);
2823
2824 if (__improbable(first == SMRSH_MIGRATED)) {
2825 hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp);
2826
2827 state = os_atomic_load(&smrh->smrsh_state, dependency);
2828 hash = __smr_shash_hash(smrh, idx: state.newidx, link, traits);
2829 head = __smr_shash_bucket(smrh, state, sel: SMRSH_NEW, hash);
2830 first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp);
2831 }
2832
2833 next = first;
2834 while (next != link) {
2835 if (__smr_shash_is_stop(link: next)) {
2836 __smr_shash_missing_elt_panic(smrh, link, traits);
2837 }
2838 prev = &next->next;
2839 next = smr_serialized_load(prev);
2840 }
2841
2842 return (smr_shash_mut_cursor_t){ .head = head, .prev = prev };
2843}
2844
2845void
2846__smr_shash_entered_mut_erase(
2847 struct smr_shash *smrh,
2848 smr_shash_mut_cursor_t cursor,
2849 struct smrq_slink *link,
2850 smrh_traits_t traits)
2851{
2852 struct smrq_slink *next, *first;
2853 smrsh_state_t state;
2854
2855 first = hw_lck_ptr_value(lck: cursor.head);
2856
2857 next = smr_serialized_load(&link->next);
2858 if (first == link) {
2859 counter_dec_preemption_disabled(&smrh->smrsh_count);
2860 hw_lck_ptr_unlock(cursor.head, next, &smr_shash_grp);
2861 } else {
2862 smr_serialized_store_relaxed(cursor.prev, next);
2863 counter_dec_preemption_disabled(&smrh->smrsh_count);
2864 hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2865 }
2866
2867 state = atomic_load_explicit(&smrh->smrsh_state, memory_order_relaxed);
2868 if (first == link && __smr_shash_is_stop(link: next) &&
2869 __smr_shash_should_shrink(smrh, state, count: __smr_shash_count(smrh))) {
2870 __smr_shash_schedule_rehash(smrh, traits, reason: SMRSH_REHASH_SHRINK);
2871 }
2872}
2873
2874void
2875__smr_shash_entered_mut_replace(
2876 smr_shash_mut_cursor_t cursor,
2877 struct smrq_slink *old_link,
2878 struct smrq_slink *new_link)
2879{
2880 struct smrq_slink *first, *next;
2881
2882 first = hw_lck_ptr_value(lck: cursor.head);
2883
2884 next = smr_serialized_load(&old_link->next);
2885 smr_serialized_store_relaxed(&new_link->next, next);
2886 if (first == old_link) {
2887 hw_lck_ptr_unlock(cursor.head, new_link, &smr_shash_grp);
2888 } else {
2889 smr_serialized_store_relaxed(cursor.prev, new_link);
2890 hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp);
2891 }
2892}
2893
2894void
2895__smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor)
2896{
2897 hw_lck_ptr_unlock(cursor.head,
2898 hw_lck_ptr_value(cursor.head), &smr_shash_grp);
2899}
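
/*
 * Typical mutation-cursor flow (illustrative sketch; "should_remove" is
 * a made-up predicate, and callers are normally expected to reach these
 * helpers through the higher level smr_shash wrappers):
 *
 *     cursor = __smr_shash_entered_mut_begin(smrh, link, traits);
 *     if (should_remove) {
 *             __smr_shash_entered_mut_erase(smrh, cursor, link, traits);
 *     } else {
 *             __smr_shash_entered_mut_abort(cursor);
 *     }
 *
 * begin() returns with the bucket lock held; erase(), replace() and
 * abort() all drop it.
 */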
2900
2901static kern_return_t
2902__smr_shash_rehash_with_target(
2903 struct smr_shash *smrh,
2904 smrsh_state_t state,
2905 uint8_t newshift,
2906 smrh_traits_t traits)
2907{
2908 const size_t FLAT_SIZE = 256;
2909 struct smrq_slink *flat_queue[FLAT_SIZE];
2910
2911 size_t oldsize, newsize;
2912 hw_lck_ptr_t *oldarray;
2913 hw_lck_ptr_t *newarray;
2914 uint32_t newseed;
2915 uint8_t oldidx;
2916
2917 /*
2918 * This function resizes a scalable hash table.
2919 *
2920 * It doesn't require a lock because it is the callout
2921 * of a THREAD_CALL_ONCE thread call.
2922 */
2923
2924 oldidx = state.curidx;
2925 state.newidx = 1 - state.curidx;
2926 state.newshift = newshift;
2927 assert(__smr_shash_load_array(smrh, state.newidx) == NULL);
2928
2929 oldsize = __smr_shash_cursize(state);
2930 newsize = __smr_shash_size_for_shift(shift: newshift);
2931
2932 oldarray = __smr_shash_load_array(smrh, idx: state.curidx);
2933 newarray = (hw_lck_ptr_t *)smr_hash_alloc_array(size: newsize);
2934 newseed = (uint32_t)early_random();
2935
2936 if (newarray == NULL) {
2937 return KERN_RESOURCE_SHORTAGE;
2938 }
2939
2940 /*
2941 * Step 1: initialize the new array and seed,
2942 * and then publish the state referencing it.
2943 *
2944 * We do not need to synchronize explicitly with SMR,
2945 * because readers/writers will notice rehashing when
2946 * the bucket they interact with has a SMRSH_MIGRATED
2947 * value.
2948 */
2949
2950 for (size_t i = 0; i < newsize; i++) {
2951 __smr_shash_bucket_init(head: &newarray[i]);
2952 }
2953 os_atomic_store(&smrh->smrsh_array[state.newidx], newarray, relaxed);
2954 os_atomic_store(&smrh->smrsh_seed[state.newidx], newseed, relaxed);
2955 os_atomic_store(&smrh->smrsh_state, state, release);
2956
2957 /*
2958 * Step 2: migrate buckets "atomically" under the old bucket lock.
2959 *
2960 * This migration is atomic for writers because
2961 * they take the old bucket lock first, and if
2962 * they observe SMRSH_MIGRATED as the value,
2963 * they go look in the new bucket instead.
2964 *
2965 * This migration is atomic for readers, because
2966 * as we move elements to their new buckets,
2967 * the hash chains will not circle back to their
2968 * bucket head (the "stop" value won't match),
2969 * or the bucket head will be SMRSH_MIGRATED.
2970 *
2971 * This causes a slowpath which spins waiting
2972 * for SMRSH_MIGRATED to appear and then looks
2973 * in the new bucket.
2974 */
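	/*
	 * Each chain is first flattened into flat_queue[] and then
	 * re-inserted from tail to head, so that elements landing in the
	 * same new bucket keep their original relative order.
	 */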
2975 for (size_t i = 0; i < oldsize; i++) {
2976 struct smrq_slink *first, *link, *next;
2977 hw_lck_ptr_t *head;
2978 uint32_t hash;
2979 size_t n = 0;
2980
2981 link = first = hw_lck_ptr_lock(&oldarray[i], &smr_shash_grp);
2982
2983 while (!__smr_shash_is_stop(link)) {
2984 flat_queue[n++ % FLAT_SIZE] = link;
2985 link = smr_serialized_load(&link->next);
2986 }
2987
2988 while (n-- > 0) {
2989 for (size_t j = (n % FLAT_SIZE) + 1; j-- > 0;) {
2990 link = flat_queue[j];
2991 hash = traits->obj_hash(link, newseed);
2992 head = &newarray[hash >> newshift];
2993 next = hw_lck_ptr_lock_nopreempt(head,
2994 &smr_shash_grp);
2995 smr_serialized_store_relaxed(&link->next, next);
2996 hw_lck_ptr_unlock_nopreempt(head, link,
2997 &smr_shash_grp);
2998 }
2999 n &= ~(FLAT_SIZE - 1);
3000
3001 /*
3002 * If there were more than FLAT_SIZE elements in the
3003 * chain (which is super unlikely and in many ways,
3004 * worrisome), then we need to repopulate
3005 * the flattened queue array for each run.
3006 *
3007 * This is O(n^2) but we have worse problems anyway
3008 * if we ever hit this path.
3009 */
3010 if (__improbable(n > 0)) {
3011 link = first;
3012 for (size_t j = 0; j < n - FLAT_SIZE; j++) {
3013 link = smr_serialized_load(&link->next);
3014 }
3015
3016 flat_queue[0] = link;
3017 for (size_t j = 1; j < FLAT_SIZE; j++) {
3018 link = smr_serialized_load(&link->next);
3019 flat_queue[j] = link;
3020 }
3021 }
3022 }
3023
3024 hw_lck_ptr_unlock(&oldarray[i], SMRSH_MIGRATED, &smr_shash_grp);
3025 }
3026
3027 /*
3028 * Step 3: deallocate the old array of buckets,
3029 * making sure to hide it from readers.
3030 */
3031
3032 state.curshift = state.newshift;
3033 state.curidx = state.newidx;
3034 os_atomic_store(&smrh->smrsh_state, state, release);
3035
3036 smr_synchronize(smr: traits->domain);
3037
3038 os_atomic_store(&smrh->smrsh_array[oldidx], NULL, relaxed);
3039 for (size_t i = 0; i < oldsize; i++) {
3040 __smr_shash_bucket_destroy(head: &oldarray[i]);
3041 }
3042 smr_hash_free_array(array: (struct smrq_slist_head *)oldarray, size: oldsize);
3043
3044 return KERN_SUCCESS;
3045}
3046
3047static void
3048__smr_shash_rehash(thread_call_param_t arg0, thread_call_param_t arg1)
3049{
3050 struct smr_shash *smrh = arg0;
3051 smrh_traits_t traits = arg1;
3052 smrsh_rehash_t reason;
3053 smrsh_state_t state;
3054 uint64_t count;
3055 kern_return_t kr;
3056
3057 do {
3058 reason = os_atomic_xchg(&smrh->smrsh_rehashing,
3059 SMRSH_REHASH_RUNNING, relaxed);
3060
3061 state = os_atomic_load(&smrh->smrsh_state, relaxed);
3062 count = __smr_shash_count(smrh);
3063
3064 if (__smr_shash_should_grow(smrh, state, count)) {
3065 kr = __smr_shash_rehash_with_target(smrh, state,
3066 newshift: state.curshift - 1, traits);
3067 } else if (__smr_shash_should_shrink(smrh, state, count)) {
3068 kr = __smr_shash_rehash_with_target(smrh, state,
3069 newshift: state.curshift + 1, traits);
3070 } else if (reason & SMRSH_REHASH_RESEED) {
3071 kr = __smr_shash_rehash_with_target(smrh, state,
3072 newshift: state.curshift, traits);
3073 } else {
3074 kr = KERN_SUCCESS;
3075 }
3076
3077 if (kr == KERN_RESOURCE_SHORTAGE) {
3078 uint64_t deadline;
3079
3080 os_atomic_or(&smrh->smrsh_rehashing, reason, relaxed);
3081 nanoseconds_to_deadline(NSEC_PER_MSEC, result: &deadline);
3082 thread_call_enter1_delayed(call: smrh->smrsh_callout,
3083 param1: arg1, deadline);
3084 break;
3085 }
3086 } while (!os_atomic_cmpxchg(&smrh->smrsh_rehashing,
3087 SMRSH_REHASH_RUNNING, SMRSH_REHASH_NONE, relaxed));
3088}
3089
3090void
3091smr_shash_init(struct smr_shash *smrh, smrsh_policy_t policy, size_t min_size)
3092{
3093 smrsh_state_t state;
3094 hw_lck_ptr_t *array;
3095 uint8_t shift;
3096 size_t size;
3097
3098 switch (policy) {
3099 case SMRSH_COMPACT:
3100 if (min_size < 2) {
3101 min_size = 2;
3102 }
3103 break;
3104 default:
3105 if (min_size < 16) {
3106 min_size = 16;
3107 }
3108 break;
3109 }
3110
3111 switch (policy) {
3112 case SMRSH_COMPACT:
3113 size = MIN(2, min_size);
3114 break;
3115 case SMRSH_BALANCED:
3116 case SMRSH_BALANCED_NOSHRINK:
3117 size = MIN(16, min_size);
3118 break;
3119 case SMRSH_FASTEST:
3120 size = min_size;
3121 break;
3122 }
3123
3124 if (size > KALLOC_SAFE_ALLOC_SIZE / sizeof(*array)) {
3125 size = KALLOC_SAFE_ALLOC_SIZE / sizeof(*array);
3126 }
3127 shift = (uint8_t)__builtin_clz((uint32_t)(size - 1));
3128 size = (~0u >> shift) + 1;
3129 array = (hw_lck_ptr_t *)smr_hash_alloc_array(size);
3130 for (size_t i = 0; i < size; i++) {
3131 __smr_shash_bucket_init(head: &array[i]);
3132 }
3133
3134 state = (smrsh_state_t){
3135 .curshift = shift,
3136 .newshift = shift,
3137 };
3138 *smrh = (struct smr_shash){
3139 .smrsh_array[0] = array,
3140 .smrsh_seed[0] = (uint32_t)early_random(),
3141 .smrsh_state = state,
3142 .smrsh_policy = policy,
3143 .smrsh_min_shift = (uint8_t)flsll(mask: min_size - 1),
3144 };
3145 counter_alloc(&smrh->smrsh_count);
3146 smrh->smrsh_callout = thread_call_allocate_with_options(func: __smr_shash_rehash,
3147 param0: smrh, pri: THREAD_CALL_PRIORITY_KERNEL, options: THREAD_CALL_OPTIONS_ONCE);
3148}
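
#if 0
/*
 * Minimal usage sketch (illustrative only, not compiled): a balanced
 * table whose shrinking is bounded by the 64-bucket minimum; growth,
 * shrinking and reseeding are otherwise handled automatically by the
 * rehash callout.  Lookups and insertions go through the smrh_traits
 * based accessors and are not shown here.
 */
static struct smr_shash example_table;

static void
smr_shash_usage_example(void)
{
	smr_shash_init(&example_table, SMRSH_BALANCED, 64);
}
#endif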
3149
3150void
3151__smr_shash_destroy(
3152 struct smr_shash *smrh,
3153 smrh_traits_t traits,
3154 void (^free)(void *))
3155{
3156 smrsh_state_t state;
3157 hw_lck_ptr_t *array;
3158 size_t size;
3159
3160 thread_call_cancel_wait(call: smrh->smrsh_callout);
3161
3162 state = os_atomic_load(&smrh->smrsh_state, dependency);
3163 assert(state.curidx == state.newidx);
3164 assert(__smr_shash_load_array(smrh, 1 - state.curidx) == NULL);
3165 size = __smr_shash_cursize(state);
3166 array = __smr_shash_load_array(smrh, idx: state.curidx);
3167
3168 if (free) {
3169 for (size_t i = 0; i < size; i++) {
3170 struct smrq_slink *link, *next;
3171
3172 next = hw_lck_ptr_value(lck: &array[i]);
3173 while (!__smr_shash_is_stop(link: next)) {
3174 link = next;
3175 next = smr_serialized_load(&link->next);
3176 free(__smrht_link_to_obj(traits, link));
3177 }
3178 }
3179 }
3180 for (size_t i = 0; i < size; i++) {
3181 __smr_shash_bucket_destroy(head: &array[i]);
3182 }
3183
3184 thread_call_free(call: smrh->smrsh_callout);
3185 counter_free(&smrh->smrsh_count);
3186 smr_hash_free_array(array: (struct smrq_slist_head *)array, size);
3187 bzero(s: smrh, n: sizeof(*smrh));
3188}
3189
3190
3191#pragma mark misc
3192
3193void
3194__smr_linkage_invalid(__smrq_link_t *link)
3195{
3196 struct smrq_link *elem = __container_of(link, struct smrq_link, next);
3197 struct smrq_link *next = smr_serialized_load(&elem->next);
3198
3199 panic("Invalid queue linkage: elt:%p next:%p next->prev:%p",
3200 elem, next, __container_of(next->prev, struct smrq_link, next));
3201}
3202
3203void
3204__smr_stail_invalid(__smrq_slink_t *link, __smrq_slink_t *last)
3205{
3206 struct smrq_slink *elem = __container_of(link, struct smrq_slink, next);
3207 struct smrq_slink *next = smr_serialized_load(&elem->next);
3208
3209 if (next) {
3210 panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3211 elem, next);
3212 } else {
3213 panic("Invalid queue tail (early end): elt:%p tail:%p",
3214 elem, __container_of(last, struct smrq_slink, next));
3215 }
3216}
3217
3218void
3219__smr_tail_invalid(__smrq_link_t *link, __smrq_link_t *last)
3220{
3221 struct smrq_link *elem = __container_of(link, struct smrq_link, next);
3222 struct smrq_link *next = smr_serialized_load(&elem->next);
3223
3224 if (next) {
3225 panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3226 elem, next);
3227 } else {
3228 panic("Invalid queue tail (early end): elt:%p tail:%p",
3229 elem, __container_of(last, struct smrq_link, next));
3230 }
3231}
3232