1 | /* |
2 | * Copyright (c) 2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <kern/locks_internal.h> |
30 | #include <kern/cpu_data.h> |
31 | #include <kern/machine.h> |
32 | #include <kern/mpsc_queue.h> |
33 | #include <kern/percpu.h> |
34 | #include <kern/sched.h> |
35 | #include <kern/smr.h> |
36 | #include <kern/smr_hash.h> |
37 | #include <kern/thread.h> |
38 | #include <kern/zalloc.h> |
39 | #include <machine/commpage.h> |
40 | #include <os/hash.h> |
41 | |
42 | |
43 | #pragma mark - SMR domains |
44 | |
45 | /* |
46 | * This SMR scheme is directly derived from FreeBSD's "Global Unbounded Sequences".
47 | * |
48 | * Major differences are: |
49 | * |
50 | * - only eager clocks are implemented (no lazy, no implicit) |
51 | * |
52 | * |
53 | * SMR clocks have 3 state machines interacting at any given time: |
54 | * |
55 | * 1. reader critical sections |
56 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
57 | * |
58 | * Each CPU can disable preemption and do this sequence: |
59 | * |
60 | * CPU::c_rd_seq = GLOBAL::c_wr_seq; |
61 | * |
62 | * < unfortunate place to receive a long IRQ > [I] |
63 | * |
64 | * os_atomic_thread_fence(seq_cst); [R1] |
65 | * |
66 | * { |
67 | * // critical section |
68 | * } |
69 | * |
70 | * os_atomic_store(&CPU::c_rd_seq, INVALID, release); [R2] |
71 | * |
72 | * |
73 | * |
74 | * 2. writer sequence advances |
75 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
76 | * |
77 | * Each writer can increment the global write sequence |
78 | * at any given time: |
79 | * |
80 | * os_atomic_add(&GLOBAL::c_wr_seq, SMR_SEQ_INC, release); [W] |
81 | * |
82 | * |
83 | * |
84 | * 3. synchronization sequence: poll/wait/scan |
85 | * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
86 | * |
87 | * This state machine synchronizes with the other two in order to decide |
88 | * if a given "goal" is in the past. Only the cases where the call
89 | * succeeds are interesting for barrier purposes, and we will focus
90 | * on cases that do not take an early return for failures. |
91 | * |
92 | * a. __smr_poll: |
93 | * |
94 | * rd_seq = os_atomic_load(&GLOBAL::c_rd_seq, acquire); [S1] |
95 | * if (goal < rd_seq) SUCCESS. |
96 | *      wr_seq = os_atomic_load(&GLOBAL::c_wr_seq, relaxed);
97 | * |
98 | * b. __smr_scan |
99 | * |
100 | * os_atomic_thread_fence(seq_cst) [S2] |
101 | * |
102 | * observe the minimum CPU::c_rd_seq "min_rd_seq" |
103 | *      value possible or wr_seq if no CPU was in a critical section.
104 | * (possibly spinning until it satisfies "goal") |
105 | * |
106 | * c. __smr_rd_advance |
107 | * |
108 | * cur_rd_seq = load_exclusive(&GLOBAL::c_rd_seq); |
109 | * os_atomic_thread_fence(seq_cst); [S3] |
110 | * if (min_rd_seq > cur_rd_seq) { |
111 | *         store_exclusive(&GLOBAL::c_rd_seq, min_rd_seq);
112 | * } |
113 | * |
114 | * |
115 | * One sentence summary |
116 | * ~~~~~~~~~~~~~~~~~~~~ |
117 | * |
118 | * A simplistic one-sentence summary of the algorithm is that __smr_scan() |
119 | * works really hard to insert itself in the timeline of write sequences and |
120 | * observe a reasonable bound for the first safe-to-reclaim sequence, and
121 | * issues [S3] to sequence everything around "c_rd_seq" (via [S3] -> [S1]): |
122 | * |
123 | * GLOBAL::c_rd_seq GLOBAL::c_wr_seq |
124 | * v v |
125 | * ──────────────────────┬────────────────┬───────────────────── |
126 | * ... safe to reclaim │ deferred │ future ... |
127 | * ──────────────────────┴────────────────┴───────────────────── |
128 | * |
129 | * |
130 | * Detailed explanation |
131 | * ~~~~~~~~~~~~~~~~~~~~ |
132 | * |
133 | * [W] -> [R1] establishes a "happens before" relationship between a given |
134 | * writer and this critical section. The loaded GLOBAL::c_wr_seq might |
135 | * however be stale with respect to the one [R1] really synchronizes with |
136 | * (see [I] explanation below). |
137 | * |
138 | * |
139 | * [R1] -> [S2] establishes a "happens before" relationship between all the |
140 | * active critical sections and the scanner. |
141 | * It lets us compute the oldest possible sequence pinned by an active |
142 | * critical section. |
143 | * |
144 | * |
145 | * [R2] -> [S3] establishes a "happens before" relationship between all the |
146 | * inactive critical sections and the scanner. |
147 | * |
148 | * |
149 | * [S3] -> [S1] is the typical expected fastpath: when the caller can decide |
150 | * that its goal is older than the last update an __smr_rd_advance() did. |
151 | * Note that [S3] doubles as an "[S1]" when two __smr_scan() race each other |
152 | * and one of them finishes last but observed a "worse" read sequence. |
153 | * |
154 | * |
155 | * [W], [S3] -> [S1] is the last crucial property: all updates to the global |
156 | * clock are totally ordered because they update the entire 128-bit state
157 | * every time with an RMW. This guarantees that __smr_poll() can't load |
158 | * an `rd_seq` that is younger than the `wr_seq` it loads next. |
159 | * |
160 | * |
161 | * [I] __smr_enter() can also, unfortunately, be delayed after observing
162 | *     a given write sequence, right before issuing [R1] (the [I] window).
163 | * |
164 | * However, for a read sequence to have moved past what __smr_enter() observed,
165 | * it means another __smr_scan() didn't observe the store to CPU::c_rd_seq |
166 | * made by __smr_enter() and thought the section was inactive. |
167 | * |
168 | * This can only happen if the scan's [S2] was issued before the delayed |
169 | * __smr_enter() [R1] (during the [I] window). |
170 | * |
171 | * As a consequence the outcome of that scan can be accepted as the "real" |
172 | * write sequence __smr_enter() should have observed. |
173 | * |
174 | * |
175 | * Litmus tests |
176 | * ~~~~~~~~~~~~ |
177 | * |
178 | * This is the proof of [W] -> [R1] -> [S2] being established properly: |
179 | * - P0 sets a global and calls smr_synchronize() |
180 | * - P1 does smr_enter() and loads the global |
181 | * |
182 | * AArch64 MP |
183 | * { |
184 | * global = 0; |
185 | * wr_seq = 123; |
186 | * p1_rd_seq = 0; |
187 | * |
188 | * 0:x0 = global; 0:x1 = wr_seq; 0:x2 = p1_rd_seq; |
189 | * 1:x0 = global; 1:x1 = wr_seq; 1:x2 = p1_rd_seq; |
190 | * } |
191 | * P0 | P1 ; |
192 | * MOV X8, #2 | LDR X8, [X1] ; |
193 | * STR X8, [X0] | STR X8, [X2] ; |
194 | * LDADDL X8, X9, [X1] | DMB SY ; |
195 | * DMB SY | LDR X10, [X0] ; |
196 | * LDR X10, [X2] | ; |
197 | * exists (0:X10 = 0 /\ 1:X8 = 123 /\ 1:X10 = 0) |
198 | * |
199 | * |
200 | * This is the proof that deferred advances are also correct: |
201 | * - P0 sets a global and does a smr_deferred_advance() |
202 | * - P1 does an smr_synchronize() and reads the global |
203 | * |
204 | * AArch64 MP |
205 | * { |
206 | * global = 0; |
207 | * wr_seq = 123; |
208 | * |
209 | * 0:x0 = global; 0:x1 = wr_seq; 0:x2 = 2; |
210 | * 1:x0 = global; 1:x1 = wr_seq; 1:x2 = 2; |
211 | * } |
212 | * P0 | P1 ; |
213 | * STR X2, [X0] | LDADDL X2, X9, [X1] ; |
214 | * DMB SY | DMB SY ; |
215 | * LDR X9, [X1] | LDR X10, [X0] ; |
216 | * ADD X9, X9, X2 | ; |
217 | * exists (0:X9 = 125 /\ 1:X9 = 123 /\ 1:X10 = 0) |
218 | * |
219 | */ |
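/*
 * Illustrative usage sketch (not part of the implementation): how the three
 * state machines above map onto the public API, for some smr_t "smr" and a
 * hypothetical pointer slot "slot" publishing hypothetical "obj" objects.
 *
 *	// 1. reader critical section
 *	smr_enter(smr);
 *	obj = os_atomic_load(&slot, acquire);
 *	// ... read-only use of obj ...
 *	smr_leave(smr);
 *
 *	// 2. + 3. writer: unpublish, advance the clock, then poll or wait
 *	os_atomic_store(&slot, NULL, release);
 *	seq = smr_advance(smr);               // [W]
 *	if (smr_poll(smr, seq)) {             // [S1] (plus [S2]/[S3] on the slow path)
 *		obj_free(obj);                // no reader can still observe obj
 *	}
 */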
220 | |
221 | /*! |
222 | * @struct smr_worker |
223 | * |
224 | * @brief |
225 | * Structure tracking the per-CPU SMR worker state.
226 | * |
227 | * @discussion |
228 | * This structure is system wide and global and is used to track |
229 | * the various active SMR domains at the granularity of a CPU. |
230 | * |
231 | * Each structure has an associated thread which is responsible |
232 | * for the forward progress of the @c smr_call() and @c smr_barrier()
233 | * interfaces. |
234 | * |
235 | * It also tracks all the active, non-stalled, sleepable SMR sections.
236 | */ |
237 | struct smr_worker { |
238 | /* |
239 | * The thread for this worker, |
240 | 	 * and a convenience pointer to the processor it is bound to.
241 | */ |
242 | struct thread *thread; |
243 | struct processor *processor; |
244 | |
245 | /* |
246 | * Thread binding/locking logic: |
247 | * |
248 | * If the worker thread is running on its canonical CPU, |
249 | * then locking to access the various SMR per-cpu data |
250 | * structures it is draining is just preemption disablement. |
251 | * |
252 | * However, if it is currently not bound to its canonical |
253 | * CPU because the CPU has been offlined or de-recommended, |
254 | * then a lock which serializes with the CPU going online |
255 | * again is being used. |
256 | */ |
257 | struct waitq waitq; |
258 | smr_cpu_reason_t detach_reason; |
259 | |
260 | #if CONFIG_QUIESCE_COUNTER |
261 | /* |
262 | * Currently active quiescent generation for this processor, |
263 | * and the last timestamp when a scan of all cores was performed. |
264 | */ |
265 | smr_seq_t rd_quiesce_seq; |
266 | #endif |
267 | |
268 | /* |
269 | * List of all the active sleepable sections that haven't |
270 | * been stalled. |
271 | */ |
272 | struct smrq_list_head sect_queue; |
273 | struct thread *sect_waiter; |
274 | |
275 | /* |
276 | * Queue of SMR domains with pending smr_call() |
277 | * callouts to drain. |
278 | * |
279 | * This uses an ageing strategy in order to amortize |
280 | * SMR clock updates: |
281 | * |
282 | 	 *  - the "old" queue has domains whose callbacks have
283 | 	 *    a committed and aged sequence,
284 | 	 *  - the "age" queue has domains whose callbacks have
285 | 	 *    a committed but fresh sequence and need ageing,
286 | 	 *  - the "cur" queue has domains whose callbacks have
287 | 	 *    a sequence in the future that still needs to be committed.
288 | */ |
289 | struct smr_pcpu *whead; |
290 | struct smr_pcpu **wold_tail; |
291 | struct smr_pcpu **wage_tail; |
292 | struct smr_pcpu **wcur_tail; |
293 | uint64_t drain_ctime; |
294 | |
295 | /* |
296 | * Queue of smr_barrier() calls in flight, |
297 | * that will be picked up by the worker thread |
298 | * to enqueue as smr_call() entries in their |
299 | * respective per-CPU data structures. |
300 | */ |
301 | struct mpsc_queue_head barrier_queue; |
302 | } __attribute__((aligned(64))); |
303 | |
304 | |
305 | typedef struct smr_pcpu { |
306 | /* |
307 | * CPU private cacheline. |
308 | * |
309 | 	 * Nothing other than the CPU this state is made for
310 | * ever writes to this cacheline. |
311 | * |
312 | * It holds the epoch activity witness (rd_seq), and |
313 | * the local smr_call() queue, which is structured this way: |
314 | * |
315 | * head -> n1 -> n2 -> n3 -> n4 -> ... -> ni -> ... -> nN -> NULL |
316 | * ^ ^ ^ |
317 | * qold_tail -------------' | | |
318 | * qage_tail --------------------------' | |
319 | * qcur_tail ---------------------------------------------' |
320 | * |
321 | * - the "old" queue can be reclaimed once qold_seq is past, |
322 | 	 *    qold_seq is always a committed sequence.
323 | 	 *  - the "age" queue can be reclaimed once qage_seq is past,
324 | 	 *    qage_seq might not be committed yet.
325 | * - the "cur" queue has an approximate size of qcur_size bytes, |
326 | * and a length of qcur_cnt callbacks. |
327 | */ |
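	/*
	 * Illustrative trace (hypothetical sequence numbers): with the domain
	 * clock at 10, nodes enqueued while the clock was at 6 sit in the
	 * "old" queue (qold_seq == 6, committed), nodes enqueued at 8 sit in
	 * the "age" queue (qage_seq == 8, possibly a deferred advance that is
	 * not committed yet), and the newest nodes sit in the "cur" queue
	 * with no reclamation sequence chosen yet.
	 */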
328 | |
329 | smr_seq_t c_rd_seq; /* might have SMR_SEQ_SLEEPABLE set */ |
330 | |
331 | smr_node_t qhead; |
332 | |
333 | smr_seq_t qold_seq; |
334 | smr_node_t *qold_tail; |
335 | |
336 | smr_seq_t qage_seq; |
337 | smr_node_t *qage_tail; |
338 | |
339 | uint32_t qcur_size; |
340 | uint32_t qcur_cnt; |
341 | smr_node_t *qcur_tail; |
342 | |
343 | uint8_t __cacheline_sep[0]; |
344 | |
345 | /* |
346 | * Drain queue. |
347 | * |
348 | * This is used to drive smr_call() via the smr worker threads. |
349 | * If the SMR domain is not using smr_call() or smr_barrier(), |
350 | * this isn't used. |
351 | */ |
352 | struct smr *drain_smr; |
353 | struct smr_pcpu *drain_next; |
354 | uint16_t __check_cpu; |
355 | uint8_t __check_reason; |
356 | uint8_t __check_list; |
357 | |
358 | /* |
359 | * Stalled queue. |
360 | * |
361 | * Stalled sections are enqueued onto this queue by the scheduler |
362 | * when their thread blocks (see smr_mark_active_trackers_stalled()). |
363 | * |
364 | * If the SMR domain is not sleepable, then this isn't used. |
365 | * |
366 | * This list is protected by a lock. |
367 | * |
368 | * When there are stalled sections, stall_rd_seq contains |
369 | * the oldest active stalled sequence number. |
370 | * |
371 | * When threads want to expedite a stalled section, they set |
372 | * stall_waiter_goal to the sequence number they are waiting |
373 | * for and block via turnstile on the oldest stalled section. |
374 | */ |
375 | hw_lck_ticket_t stall_lock; |
376 | smr_seq_t stall_rd_seq; |
377 | smr_seq_t stall_waiter_goal; |
378 | struct smrq_tailq_head stall_queue; |
379 | struct turnstile *stall_ts; |
380 | } __attribute__((aligned(128))) * smr_pcpu_t; |
381 | |
382 | static_assert(offsetof(struct smr_pcpu, __cacheline_sep) == 64); |
383 | static_assert(sizeof(struct smr_pcpu) == 128); |
384 | |
385 | #define CPU_CHECKIN_MIN_INTERVAL_US 5000 /* 5ms */ |
386 | #define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */ |
387 | static uint64_t cpu_checkin_min_interval; |
388 | static uint32_t cpu_checkin_min_interval_us; |
389 | |
390 | /*! the amount of memory pending retirement that causes a forceful flush */
391 | #if XNU_TARGET_OS_OSX |
392 | static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 256 << 10);
393 | static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 128);
394 | #else
395 | static TUNABLE(vm_size_t, smr_call_size_cap, "smr_call_size_cap", 64 << 10);
396 | static TUNABLE(vm_size_t, smr_call_cnt_cap, "smr_call_cnt_cap", 32);
397 | #endif |
398 | /* time __smr_wait_for_oncore busy spins before going the expensive route */ |
399 | static TUNABLE(uint32_t, smr_wait_spin_us, "smr_wait_spin_us", 20);
400 | |
401 | static LCK_GRP_DECLARE(smr_lock_grp, "smr");
402 | static struct smr_worker PERCPU_DATA(smr_worker);
403 | static struct smrq_tailq_head smr_domains = SMRQ_TAILQ_INITIALIZER(smr_domains);
404 | 
405 | SMR_DEFINE_FLAGS(smr_system, "system", SMR_NONE);
406 | SMR_DEFINE_FLAGS(smr_system_sleepable, "system (sleepable)", SMR_SLEEPABLE);
407 | |
408 | |
409 | #pragma mark SMR domains: init & helpers |
410 | |
411 | #define SMR_PCPU_NOT_QUEUED ((struct smr_pcpu *)-1) |
412 | |
413 | __attribute__((always_inline, overloadable)) |
414 | static inline smr_pcpu_t |
415 | __smr_pcpu(smr_t smr, int cpu) |
416 | { |
417 | return &smr->smr_pcpu[cpu]; |
418 | } |
419 | |
420 | __attribute__((always_inline, overloadable)) |
421 | static inline smr_pcpu_t |
422 | __smr_pcpu(smr_t smr) |
423 | { |
424 | 	return __smr_pcpu(smr, cpu_number());
425 | } |
426 | |
427 | static inline bool |
428 | __smr_pcpu_queued(smr_pcpu_t pcpu) |
429 | { |
430 | return pcpu->drain_next != SMR_PCPU_NOT_QUEUED; |
431 | } |
432 | |
433 | static inline void |
434 | __smr_pcpu_set_not_queued(smr_pcpu_t pcpu) |
435 | { |
436 | pcpu->drain_next = SMR_PCPU_NOT_QUEUED; |
437 | } |
438 | |
439 | static inline void |
440 | __smr_pcpu_associate(smr_t smr, smr_pcpu_t pcpu) |
441 | { |
442 | zpercpu_foreach_cpu(cpu) { |
443 | pcpu[cpu].qold_tail = &pcpu[cpu].qhead; |
444 | pcpu[cpu].qage_tail = &pcpu[cpu].qhead; |
445 | pcpu[cpu].qcur_tail = &pcpu[cpu].qhead; |
446 | |
447 | pcpu[cpu].drain_smr = smr; |
448 | 		__smr_pcpu_set_not_queued(&pcpu[cpu]);
449 | hw_lck_ticket_init(&pcpu[cpu].stall_lock, &smr_lock_grp); |
450 | smrq_init(&pcpu[cpu].stall_queue); |
451 | } |
452 | |
453 | os_atomic_store(&smr->smr_pcpu, pcpu, release); |
454 | } |
455 | |
456 | static inline event64_t |
457 | __smrw_oncore_event(struct smr_worker *smrw) |
458 | { |
459 | return CAST_EVENT64_T(&smrw->sect_queue); |
460 | } |
461 | |
462 | static inline event64_t |
463 | __smrw_drain_event(struct smr_worker *smrw) |
464 | { |
465 | return CAST_EVENT64_T(&smrw->whead); |
466 | } |
467 | |
468 | static inline processor_t |
469 | __smrw_drain_bind_target(struct smr_worker *smrw) |
470 | { |
471 | return smrw->detach_reason ? PROCESSOR_NULL : smrw->processor; |
472 | } |
473 | |
474 | static inline void |
475 | __smrw_lock(struct smr_worker *smrw) |
476 | { |
477 | 	waitq_lock(&smrw->waitq);
478 | } |
479 | |
480 | static inline void |
481 | __smrw_unlock(struct smr_worker *smrw) |
482 | { |
483 | 	waitq_unlock(&smrw->waitq);
484 | } |
485 | |
486 | /*! |
487 | * @function __smrw_wakeup_and_unlock() |
488 | * |
489 | * @brief |
490 | * Wakes up (with binding) the SMR worker. |
491 | * |
492 | * @discussion |
493 | * Wakeup the worker thread and bind it to the proper processor |
494 | * as a side effect. |
495 | * |
496 | * This function must be called with interrupts disabled. |
497 | */ |
498 | static bool |
499 | __smrw_wakeup_and_unlock(struct smr_worker *smrw) |
500 | { |
501 | thread_t thread; |
502 | |
503 | assert(!ml_get_interrupts_enabled()); |
504 | |
505 | 	thread = waitq_wakeup64_identify_locked(&smrw->waitq,
506 | 	    __smrw_drain_event(smrw), THREAD_AWAKENED, WAITQ_UNLOCK);
507 | |
508 | if (thread != THREAD_NULL) { |
509 | assert(thread == smrw->thread); |
510 | |
511 | 		waitq_resume_and_bind_identified_thread(&smrw->waitq,
512 | 		    thread, __smrw_drain_bind_target(smrw),
513 | 		    THREAD_AWAKENED, WAITQ_WAKEUP_DEFAULT);
514 | } |
515 | |
516 | return thread != THREAD_NULL; |
517 | } |
518 | |
519 | static void |
520 | __smr_call_drain(smr_node_t head) |
521 | { |
522 | smr_node_t node; |
523 | |
524 | while ((node = head) != NULL) { |
525 | head = node->smrn_next; |
526 | node->smrn_next = NULL; |
527 | node->smrn_cb(node); |
528 | } |
529 | } |
530 | |
531 | __startup_func |
532 | void |
533 | __smr_domain_init(smr_t smr) |
534 | { |
535 | smr_pcpu_t pcpu; |
536 | vm_size_t size; |
537 | |
538 | if (startup_phase < STARTUP_SUB_TUNABLES) { |
539 | smr_seq_t *rd_seqp = &smr->smr_early; |
540 | |
541 | /* |
542 | * This is a big cheat, but before the EARLY_BOOT phase, |
543 | * all smr_* APIs that would access past the rd_seq |
544 | * will early return. |
545 | */ |
546 | pcpu = __container_of(rd_seqp, struct smr_pcpu, c_rd_seq); |
547 | smr->smr_pcpu = pcpu - cpu_number(); |
548 | assert(&__smr_pcpu(smr)->c_rd_seq == &smr->smr_early); |
549 | } else { |
550 | size = zpercpu_count() * sizeof(struct smr_pcpu); |
551 | pcpu = zalloc_permanent(size, ZALIGN(struct smr_pcpu)); |
552 | |
553 | __smr_pcpu_associate(smr, pcpu); |
554 | } |
555 | } |
556 | |
557 | smr_t |
558 | smr_domain_create(smr_flags_t flags, const char *name) |
559 | { |
560 | smr_pcpu_t pcpu; |
561 | smr_t smr; |
562 | |
563 | smr = kalloc_type(struct smr, Z_WAITOK | Z_ZERO | Z_NOFAIL); |
564 | pcpu = kalloc_type(struct smr_pcpu, zpercpu_count(), |
565 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
566 | |
567 | smr->smr_clock.s_rd_seq = SMR_SEQ_INIT; |
568 | smr->smr_clock.s_wr_seq = SMR_SEQ_INIT; |
569 | smr->smr_flags = flags; |
570 | static_assert(sizeof(struct smr) == |
571 | offsetof(struct smr, smr_name) + SMR_NAME_MAX); |
572 | 	strlcpy(smr->smr_name, name, sizeof(smr->smr_name));
573 | |
574 | __smr_pcpu_associate(smr, pcpu); |
575 | |
576 | return smr; |
577 | } |
578 | |
579 | void |
580 | smr_domain_free(smr_t smr) |
581 | { |
582 | smr_barrier(smr); |
583 | |
584 | zpercpu_foreach_cpu(cpu) { |
585 | smr_pcpu_t pcpu = __smr_pcpu(smr, cpu); |
586 | |
587 | assert(pcpu->qhead == NULL); |
588 | hw_lck_ticket_destroy(&pcpu->stall_lock, &smr_lock_grp); |
589 | } |
590 | |
591 | kfree_type(struct smr_pcpu, zpercpu_count(), smr->smr_pcpu); |
592 | kfree_type(struct smr, smr); |
593 | } |
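/*
 * Illustrative sketch (hypothetical client code): typical lifetime of a
 * dynamically created domain; "my_subsystem" and my_subsystem_free_cb()
 * are made-up names.
 *
 *	smr_t smr = smr_domain_create(SMR_NONE, "my_subsystem");
 *
 *	smr_enter(smr);
 *	// ... readers ...
 *	smr_leave(smr);
 *
 *	smr_call(smr, &elem->node, sizeof(*elem), my_subsystem_free_cb);
 *
 *	// smr_domain_free() issues an smr_barrier() first, so every
 *	// pending smr_call() callback has run before the per-CPU state
 *	// and the domain itself are freed.
 *	smr_domain_free(smr);
 */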
594 | |
595 | |
596 | #pragma mark SMR domains: enter / leave |
597 | |
598 | bool |
599 | smr_entered(smr_t smr) |
600 | { |
601 | thread_t self = current_thread(); |
602 | smr_tracker_t t; |
603 | |
604 | if (lock_preemption_level_for_thread(self) && |
605 | __smr_pcpu(smr)->c_rd_seq != SMR_SEQ_INVALID) { |
606 | return true; |
607 | } |
608 | |
609 | if (smr->smr_flags & SMR_SLEEPABLE) { |
610 | smrq_serialized_foreach(t, &self->smr_stack, smrt_stack) { |
611 | if (t->smrt_domain == smr) { |
612 | return true; |
613 | } |
614 | } |
615 | } |
616 | |
617 | return false; |
618 | } |
619 | |
620 | __attribute__((always_inline)) |
621 | bool |
622 | smr_entered_cpu_noblock(smr_t smr, int cpu) |
623 | { |
624 | assert((smr->smr_flags & SMR_SLEEPABLE) == 0); |
625 | return __smr_pcpu(smr, cpu)->c_rd_seq != SMR_SEQ_INVALID; |
626 | } |
627 | |
628 | __attribute__((always_inline)) |
629 | static smr_seq_t |
630 | __smr_enter(smr_t smr, smr_pcpu_t pcpu, smr_seq_t sleepable) |
631 | { |
632 | smr_seq_t s_wr_seq; |
633 | smr_seq_t old_seq; |
634 | |
635 | assert(!ml_at_interrupt_context()); |
636 | |
637 | /* |
638 | * It is possible to have a long delay between loading the s_wr_seq |
639 | * and storing it to the percpu copy of it. |
640 | * |
641 | 	 * It is unlikely but possible that by that time the s_rd_seq advances
642 | * ahead of what we will store. This however is still safe |
643 | * and handled in __smr_scan(). |
644 | * |
645 | * On Intel, to achieve the ordering we want, we could use a store |
646 | * followed by an mfence, or any RMW (XCHG, XADD, CMPXCHG, ...). |
647 | * XADD is just the fastest instruction of the alternatives, |
648 | * but it will only ever add to '0'. |
649 | */ |
650 | s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed); |
651 | #if __x86_64__ |
652 | /* [R1] */ |
653 | old_seq = os_atomic_add_orig(&pcpu->c_rd_seq, s_wr_seq | sleepable, seq_cst); |
654 | #else |
655 | old_seq = pcpu->c_rd_seq; |
656 | os_atomic_store(&pcpu->c_rd_seq, s_wr_seq | sleepable, relaxed); |
657 | os_atomic_thread_fence(seq_cst); /* [R1] */ |
658 | #endif |
659 | assert(old_seq == SMR_SEQ_INVALID); |
660 | |
661 | return s_wr_seq; |
662 | } |
663 | |
664 | __attribute__((always_inline)) |
665 | static void |
666 | __smr_leave(smr_pcpu_t pcpu) |
667 | { |
668 | assert(!ml_at_interrupt_context()); |
669 | /* [R2] */ |
670 | os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release); |
671 | } |
672 | |
673 | __attribute__((always_inline)) |
674 | void |
675 | smr_enter(smr_t smr) |
676 | { |
677 | disable_preemption(); |
678 | 	__smr_enter(smr, __smr_pcpu(smr), 0);
679 | } |
680 | |
681 | __attribute__((always_inline)) |
682 | void |
683 | smr_leave(smr_t smr) |
684 | { |
685 | 	__smr_leave(__smr_pcpu(smr));
686 | enable_preemption(); |
687 | } |
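/*
 * Illustrative sketch (hypothetical code): a non-sleepable read-side section.
 * smr_enter()/smr_leave() only disable preemption, so the body must not
 * block; "table_lookup" and the "ref" field are made-up names.
 *
 *	smr_enter(smr);
 *	obj = table_lookup(table, key);
 *	if (obj && !os_ref_retain_try(&obj->ref)) {
 *		obj = NULL;           // object is being torn down
 *	}
 *	smr_leave(smr);
 */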
688 | |
689 | void |
690 | smr_enter_sleepable(smr_t smr, smr_tracker_t tracker) |
691 | { |
692 | thread_t self = current_thread(); |
693 | struct smr_worker *smrw; |
694 | smr_pcpu_t pcpu; |
695 | |
696 | assert(smr->smr_flags & SMR_SLEEPABLE); |
697 | |
698 | lock_disable_preemption_for_thread(self); |
699 | 	lck_rw_lock_count_inc(self, smr);
700 | |
701 | pcpu = __smr_pcpu(smr); |
702 | smrw = PERCPU_GET(smr_worker); |
703 | |
704 | tracker->smrt_domain = smr; |
705 | tracker->smrt_seq = __smr_enter(smr, pcpu, SMR_SEQ_SLEEPABLE); |
706 | smrq_serialized_insert_head_relaxed(&smrw->sect_queue, &tracker->smrt_link); |
707 | smrq_serialized_insert_head_relaxed(&self->smr_stack, &tracker->smrt_stack); |
708 | tracker->smrt_ctid = 0; |
709 | tracker->smrt_cpu = -1; |
710 | |
711 | lock_enable_preemption(); |
712 | } |
713 | |
714 | __attribute__((always_inline)) |
715 | static void |
716 | __smr_wake_oncore_sleepers(struct smr_worker *smrw) |
717 | { |
718 | /* |
719 | * prevent reordering of making the list empty and checking for waiters. |
720 | */ |
721 | if (__improbable(os_atomic_load(&smrw->sect_waiter, compiler_acq_rel))) { |
722 | if (smrq_empty(&smrw->sect_queue)) { |
723 | os_atomic_store(&smrw->sect_waiter, NULL, relaxed); |
724 | 			waitq_wakeup64_all(&smrw->waitq,
725 | 			    __smrw_oncore_event(smrw), THREAD_AWAKENED,
726 | 			    WAITQ_WAKEUP_DEFAULT);
727 | } |
728 | } |
729 | } |
730 | |
731 | void |
732 | smr_ack_ipi(void) |
733 | { |
734 | /* |
735 | * see __smr_wait_for_oncore(): if at the time of the IPI ack |
736 | * the list is empty and there is still a waiter, wake it up. |
737 | * |
738 | * If the queue is not empty, then when smr_leave_sleepable() |
739 | * runs it can't possibly fail to observe smrw->sect_waiter |
740 | * being non NULL and will do the wakeup then. |
741 | */ |
742 | __smr_wake_oncore_sleepers(PERCPU_GET(smr_worker)); |
743 | } |
744 | |
745 | void |
746 | smr_mark_active_trackers_stalled(thread_t self) |
747 | { |
748 | struct smr_worker *smrw = PERCPU_GET(smr_worker); |
749 | int cpu = cpu_number(); |
750 | smr_tracker_t t; |
751 | |
752 | /* called at splsched */ |
753 | |
754 | smrq_serialized_foreach_safe(t, &smrw->sect_queue, smrt_link) { |
755 | smr_t smr = t->smrt_domain; |
756 | smr_pcpu_t pcpu; |
757 | |
758 | pcpu = __smr_pcpu(smr, cpu); |
759 | |
760 | t->smrt_ctid = self->ctid; |
761 | t->smrt_cpu = cpu; |
762 | |
763 | hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp); |
764 | |
765 | /* |
766 | * Transfer the section to the stalled queue, |
767 | * and _then_ leave the regular one. |
768 | * |
769 | * A store-release is sufficient to order these stores, |
770 | * and guarantee that __smr_scan() can't fail to observe |
771 | * both the @c rd_seq and @c stall_rd_seq during a transfer |
772 | * of a stalled section that was active when it started. |
773 | */ |
774 | if (smrq_empty(&pcpu->stall_queue)) { |
775 | os_atomic_store(&pcpu->stall_rd_seq, t->smrt_seq, relaxed); |
776 | } |
777 | os_atomic_store(&pcpu->c_rd_seq, SMR_SEQ_INVALID, release); |
778 | |
779 | smrq_serialized_insert_tail_relaxed(&pcpu->stall_queue, &t->smrt_link); |
780 | |
781 | 		hw_lck_ticket_unlock_nopreempt(&pcpu->stall_lock);
782 | } |
783 | |
784 | smrq_init(&smrw->sect_queue); |
785 | |
786 | __smr_wake_oncore_sleepers(smrw); |
787 | } |
788 | |
789 | |
790 | __attribute__((noinline)) |
791 | static void |
792 | __smr_leave_stalled(smr_t smr, smr_tracker_t tracker, thread_t self) |
793 | { |
794 | smr_seq_t new_stall_seq = SMR_SEQ_INVALID; |
795 | smr_tracker_t first = NULL; |
796 | smr_pcpu_t pcpu; |
797 | bool progress; |
798 | |
799 | 	pcpu = __smr_pcpu(smr, tracker->smrt_cpu);
800 | |
801 | hw_lck_ticket_lock_nopreempt(&pcpu->stall_lock, &smr_lock_grp); |
802 | |
803 | progress = smrq_serialized_first(&pcpu->stall_queue, |
804 | struct smr_tracker, smrt_link) == tracker; |
805 | |
806 | smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack); |
807 | smrq_serialized_remove(&pcpu->stall_queue, &tracker->smrt_link); |
808 | 	bzero(tracker, sizeof(*tracker));
809 | |
810 | if (progress) { |
811 | if (!smrq_empty(&pcpu->stall_queue)) { |
812 | first = smrq_serialized_first(&pcpu->stall_queue, |
813 | struct smr_tracker, smrt_link); |
814 | new_stall_seq = first->smrt_seq; |
815 | __builtin_assume(new_stall_seq != SMR_SEQ_INVALID); |
816 | assert(SMR_SEQ_CMP(pcpu->stall_rd_seq, <=, new_stall_seq)); |
817 | } |
818 | |
819 | os_atomic_store(&pcpu->stall_rd_seq, new_stall_seq, release); |
820 | |
821 | progress = pcpu->stall_waiter_goal != SMR_SEQ_INVALID; |
822 | } |
823 | |
824 | if (progress) { |
825 | struct turnstile *ts; |
826 | |
827 | 		ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
828 | 		    TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
829 | |
830 | if (new_stall_seq == SMR_SEQ_INVALID || |
831 | SMR_SEQ_CMP(pcpu->stall_waiter_goal, <=, new_stall_seq)) { |
832 | pcpu->stall_waiter_goal = SMR_SEQ_INVALID; |
833 | 			waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
834 | 			    THREAD_AWAKENED, WAITQ_UPDATE_INHERITOR);
835 | 		} else {
836 | 			turnstile_update_inheritor(ts, ctid_get_thread(first->smrt_ctid),
837 | 			    TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
838 | } |
839 | |
840 | 		turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
841 | 
842 | 		turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
843 | 		    NULL, TURNSTILE_KERNEL_MUTEX);
844 | } |
845 | |
846 | /* reenables preemption disabled in smr_leave_sleepable() */ |
847 | 	hw_lck_ticket_unlock(&pcpu->stall_lock);
848 | |
849 | turnstile_cleanup(); |
850 | } |
851 | |
852 | void |
853 | smr_leave_sleepable(smr_t smr, smr_tracker_t tracker) |
854 | { |
855 | struct smr_worker *smrw; |
856 | thread_t self = current_thread(); |
857 | |
858 | assert(tracker->smrt_seq != SMR_SEQ_INVALID); |
859 | assert(smr->smr_flags & SMR_SLEEPABLE); |
860 | |
861 | lock_disable_preemption_for_thread(self); |
862 | |
863 | 	lck_rw_lock_count_dec(self, smr);
864 | |
865 | if (__improbable(tracker->smrt_cpu != -1)) { |
866 | return __smr_leave_stalled(smr, tracker, self); |
867 | } |
868 | |
869 | 	__smr_leave(__smr_pcpu(smr));
870 | |
871 | smrw = PERCPU_GET(smr_worker); |
872 | smrq_serialized_remove(&self->smr_stack, &tracker->smrt_stack); |
873 | smrq_serialized_remove(&smrw->sect_queue, &tracker->smrt_link); |
874 | 	bzero(tracker, sizeof(*tracker));
875 | |
876 | __smr_wake_oncore_sleepers(PERCPU_GET(smr_worker)); |
877 | |
878 | lock_enable_preemption(); |
879 | } |
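/*
 * Illustrative sketch (hypothetical code): a sleepable section.  The tracker
 * lives on the caller's stack and the domain must have been created with
 * SMR_SLEEPABLE; if the thread blocks inside the section, the scheduler moves
 * the tracker to the stalled queue and smr_leave_sleepable() takes the
 * __smr_leave_stalled() path above.
 *
 *	struct smr_tracker smrt;
 *
 *	smr_enter_sleepable(smr, &smrt);
 *	// ... may block: take a sleepable lock, fault on user memory, ...
 *	smr_leave_sleepable(smr, &smrt);
 */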
880 | |
881 | |
882 | #pragma mark SMR domains: advance, wait, poll, synchronize |
883 | |
884 | static inline smr_seq_t |
885 | __smr_wr_advance(smr_t smr) |
886 | { |
887 | /* [W] */ |
888 | return os_atomic_add(&smr->smr_clock.s_wr_seq, SMR_SEQ_INC, release); |
889 | } |
890 | |
891 | static inline bool |
892 | __smr_rd_advance(smr_t smr, smr_seq_t goal, smr_seq_t rd_seq) |
893 | { |
894 | smr_seq_t o_seq; |
895 | |
896 | os_atomic_thread_fence(seq_cst); /* [S3] */ |
897 | |
898 | os_atomic_rmw_loop(&smr->smr_clock.s_rd_seq, o_seq, rd_seq, relaxed, { |
899 | if (SMR_SEQ_CMP(rd_seq, <=, o_seq)) { |
900 | rd_seq = o_seq; |
901 | os_atomic_rmw_loop_give_up(break); |
902 | } |
903 | }); |
904 | |
905 | return SMR_SEQ_CMP(goal, <=, rd_seq); |
906 | } |
907 | |
908 | __attribute__((noinline)) |
909 | static smr_seq_t |
910 | __smr_wait_for_stalled(smr_pcpu_t pcpu, smr_seq_t goal) |
911 | { |
912 | struct turnstile *ts; |
913 | thread_t inheritor; |
914 | wait_result_t wr; |
915 | smr_seq_t stall_rd_seq; |
916 | |
917 | hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp); |
918 | |
919 | stall_rd_seq = pcpu->stall_rd_seq; |
920 | if (stall_rd_seq == SMR_SEQ_INVALID || |
921 | SMR_SEQ_CMP(goal, <=, stall_rd_seq)) { |
922 | 		hw_lck_ticket_unlock(&pcpu->stall_lock);
923 | return stall_rd_seq; |
924 | } |
925 | |
926 | if (pcpu->stall_waiter_goal == SMR_SEQ_INVALID || |
927 | SMR_SEQ_CMP(goal, <, pcpu->stall_waiter_goal)) { |
928 | pcpu->stall_waiter_goal = goal; |
929 | } |
930 | |
931 | inheritor = ctid_get_thread(smrq_serialized_first(&pcpu->stall_queue, |
932 | struct smr_tracker, smrt_link)->smrt_ctid); |
933 | |
934 | 	ts = turnstile_prepare((uintptr_t)pcpu, &pcpu->stall_ts,
935 | 	    TURNSTILE_NULL, TURNSTILE_KERNEL_MUTEX);
936 | 
937 | 	turnstile_update_inheritor(ts, inheritor,
938 | 	    TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
939 | 	wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(pcpu),
940 | 	    THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
941 | 	turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
942 | |
943 | if (wr == THREAD_WAITING) { |
944 | 		hw_lck_ticket_unlock(&pcpu->stall_lock);
945 | thread_block(THREAD_CONTINUE_NULL); |
946 | hw_lck_ticket_lock(&pcpu->stall_lock, &smr_lock_grp); |
947 | } |
948 | |
949 | 	turnstile_complete((uintptr_t)pcpu, &pcpu->stall_ts,
950 | 	    NULL, TURNSTILE_KERNEL_MUTEX);
951 | 
952 | 	stall_rd_seq = pcpu->stall_rd_seq;
953 | 	hw_lck_ticket_unlock(&pcpu->stall_lock);
954 | |
955 | turnstile_cleanup(); |
956 | |
957 | return stall_rd_seq; |
958 | } |
959 | |
960 | __attribute__((noinline)) |
961 | static smr_seq_t |
962 | __smr_wait_for_oncore(smr_pcpu_t pcpu, smr_seq_t goal, uint32_t cpu) |
963 | { |
964 | thread_t self = current_thread(); |
965 | struct smr_worker *smrw; |
966 | uint64_t deadline = 0; |
967 | vm_offset_t base; |
968 | smr_seq_t rd_seq; |
969 | |
970 | /* |
971 | * We are waiting for a currently active SMR section. |
972 | * Start spin-waiting for it for a bit. |
973 | */ |
974 | for (;;) { |
975 | if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq, |
976 | rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) { |
977 | return rd_seq; |
978 | } |
979 | |
980 | if (deadline == 0) { |
981 | 			clock_interval_to_deadline(smr_wait_spin_us,
982 | 			    NSEC_PER_USEC, &deadline);
983 | } else if (mach_absolute_time() > deadline) { |
984 | break; |
985 | } |
986 | } |
987 | |
988 | /* |
989 | 	 * This section has been active for a while,
990 | * we need to move to a more passive way of waiting. |
991 | * |
992 | * We post ourselves on the remote processor tracking head, |
993 | * to denote we need a thread_wakeup() when the tracker head clears, |
994 | * then send an IPI which will have 2 possible outcomes: |
995 | * |
996 | * 1. when smr_ack_ipi() runs, the queue is already cleared, |
997 | * and we will be woken up immediately. |
998 | * |
999 | * 2. when smr_ack_ipi() runs, the queue isn't cleared, |
1000 | * then it does nothing, but there is a guarantee that |
1001 | * when the queue clears, the remote core will observe |
1002 | * that there is a waiter, and thread_wakeup() will be |
1003 | * called then. |
1004 | * |
1005 | 	 * In order to avoid actually waiting, we spin some more,
1006 | * hoping for the remote sequence to change. |
1007 | */ |
1008 | 	base = other_percpu_base(cpu);
1009 | 	smrw = PERCPU_GET_WITH_BASE(base, smr_worker);
1010 | 
1011 | 	waitq_assert_wait64(&smrw->waitq, __smrw_oncore_event(smrw),
1012 | THREAD_UNINT, TIMEOUT_WAIT_FOREVER); |
1013 | |
1014 | if (lock_cmpxchg(&smrw->sect_waiter, NULL, self, relaxed)) { |
1015 | /* |
1016 | * only really send the IPI if we're first, |
1017 | * to avoid IPI storms in case of a pile-up |
1018 | 		 * of smr_synchronize() calls stalled on the same CPU.
1019 | */ |
1020 | cause_ast_check(PERCPU_GET_WITH_BASE(base, processor)); |
1021 | } |
1022 | |
1023 | if (hw_spin_wait_until(&pcpu->c_rd_seq, rd_seq, |
1024 | rd_seq == SMR_SEQ_INVALID || SMR_SEQ_CMP(goal, <=, rd_seq))) { |
1025 | 		clear_wait(self, THREAD_AWAKENED);
1026 | return rd_seq; |
1027 | } |
1028 | |
1029 | thread_block(THREAD_CONTINUE_NULL); |
1030 | |
1031 | return os_atomic_load(&pcpu->c_rd_seq, relaxed); |
1032 | } |
1033 | |
1034 | __attribute__((noinline)) |
1035 | static bool |
1036 | __smr_scan(smr_t smr, smr_seq_t goal, smr_clock_t clk, bool wait) |
1037 | { |
1038 | smr_delta_t delta; |
1039 | smr_seq_t rd_seq; |
1040 | |
1041 | if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) { |
1042 | return true; |
1043 | } |
1044 | |
1045 | /* |
1046 | * Validate that the goal is sane. |
1047 | */ |
1048 | delta = SMR_SEQ_DELTA(goal, clk.s_wr_seq); |
1049 | if (delta == SMR_SEQ_INC) { |
1050 | /* |
1051 | * This SMR clock uses deferred advance, |
1052 | * and the goal is one inc in the future. |
1053 | * |
1054 | * If we can wait, then commit the sequence number, |
1055 | * else we can't possibly succeed. |
1056 | * |
1057 | * Doing a commit here rather than an advance |
1058 | * gives the hardware a chance to abort the |
1059 | * transaction early in case of high contention |
1060 | * compared to an unconditional advance. |
1061 | */ |
1062 | if (!wait) { |
1063 | return false; |
1064 | } |
1065 | if (lock_cmpxchgv(&smr->smr_clock.s_wr_seq, |
1066 | clk.s_wr_seq, goal, &clk.s_wr_seq, relaxed)) { |
1067 | clk.s_wr_seq = goal; |
1068 | } |
1069 | } else if (delta > 0) { |
1070 | /* |
1071 | * Invalid goal: the caller held on it for too long, |
1072 | * and integers wrapped. |
1073 | */ |
1074 | return true; |
1075 | } |
1076 | |
1077 | os_atomic_thread_fence(seq_cst); /* [S2] */ |
1078 | |
1079 | /* |
1080 | * The read sequence can be no larger than the write sequence |
1081 | * at the start of the poll. |
1082 | * |
1083 | * We know that on entry: |
1084 | * |
1085 | * s_rd_seq < goal <= s_wr_seq |
1086 | * |
1087 | * The correctness of this algorithm relies on the fact that |
1088 | * the SMR domain [s_rd_seq, s_wr_seq) can't possibly move |
1089 | * by more than roughly (ULONG_MAX / 2) while __smr_scan() |
1090 | * is running, otherwise the "rd_seq" we try to scan for |
1091 | * might appear larger than s_rd_seq spuriously and we'd |
1092 | * __smr_rd_advance() incorrectly. |
1093 | * |
1094 | * This is guaranteed by the fact that this represents |
1095 | * advancing 2^62 times. At one advance every nanosecond, |
1096 | * it takes more than a century, which makes it possible |
1097 | * to call smr_wait() or smr_poll() with preemption enabled. |
1098 | */ |
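	/*
	 * (For scale, assuming the clock advanced once per nanosecond:
	 * 2^62 ns is about 4.6e9 seconds, i.e. roughly 146 years.)
	 */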
1099 | rd_seq = clk.s_wr_seq; |
1100 | |
1101 | zpercpu_foreach_cpu(cpu) { |
1102 | smr_pcpu_t pcpu = __smr_pcpu(smr, cpu); |
1103 | smr_seq_t seq = os_atomic_load(&pcpu->c_rd_seq, relaxed); |
1104 | |
1105 | while (seq != SMR_SEQ_INVALID) { |
1106 | /* |
1107 | * Resolve the race documented in __smr_enter(). |
1108 | * |
1109 | * The CPU has loaded a stale s_wr_seq, and s_rd_seq |
1110 | * moved past this stale value. |
1111 | * |
1112 | * Its critical section is however properly serialized, |
1113 | * but we can't know what the "correct" s_wr_seq it |
1114 | * could have observed was. We have to assume `s_rd_seq` |
1115 | * to prevent it from advancing. |
1116 | */ |
1117 | if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) { |
1118 | seq = clk.s_rd_seq; |
1119 | } |
1120 | |
1121 | if (!wait || SMR_SEQ_CMP(goal, <=, seq)) { |
1122 | seq &= ~SMR_SEQ_SLEEPABLE; |
1123 | break; |
1124 | } |
1125 | |
1126 | if (seq & SMR_SEQ_SLEEPABLE) { |
1127 | seq = __smr_wait_for_oncore(pcpu, goal, cpu); |
1128 | } else { |
1129 | disable_preemption(); |
1130 | seq = hw_wait_while_equals_long(&pcpu->c_rd_seq, seq); |
1131 | enable_preemption(); |
1132 | } |
1133 | } |
1134 | |
1135 | if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) { |
1136 | rd_seq = seq; |
1137 | } |
1138 | } |
1139 | |
1140 | if (smr->smr_flags & SMR_SLEEPABLE) { |
1141 | /* |
1142 | * Order observation of stalled sections, |
1143 | * see smr_mark_active_trackers_stalled(). |
1144 | */ |
1145 | os_atomic_thread_fence(seq_cst); |
1146 | |
1147 | zpercpu_foreach_cpu(cpu) { |
1148 | smr_pcpu_t pcpu = __smr_pcpu(smr, cpu); |
1149 | smr_seq_t seq = os_atomic_load(&pcpu->stall_rd_seq, relaxed); |
1150 | |
1151 | while (seq != SMR_SEQ_INVALID) { |
1152 | if (SMR_SEQ_CMP(seq, <, clk.s_rd_seq)) { |
1153 | seq = clk.s_rd_seq; |
1154 | } |
1155 | |
1156 | if (!wait || SMR_SEQ_CMP(goal, <=, seq)) { |
1157 | seq &= ~SMR_SEQ_SLEEPABLE; |
1158 | break; |
1159 | } |
1160 | |
1161 | seq = __smr_wait_for_stalled(pcpu, goal); |
1162 | } |
1163 | |
1164 | if (seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(seq, <, rd_seq)) { |
1165 | rd_seq = seq; |
1166 | } |
1167 | } |
1168 | } |
1169 | |
1170 | /* |
1171 | * Advance the rd_seq as long as we observed a more recent value. |
1172 | */ |
1173 | return __smr_rd_advance(smr, goal, rd_seq); |
1174 | } |
1175 | |
1176 | static inline bool |
1177 | __smr_poll(smr_t smr, smr_seq_t goal, bool wait) |
1178 | { |
1179 | smr_clock_t clk; |
1180 | |
1181 | /* |
1182 | * Load both the s_rd_seq and s_wr_seq in the right order so that we |
1183 | * can't observe a s_rd_seq older than s_wr_seq. |
1184 | */ |
1185 | |
1186 | /* [S1] */ |
1187 | clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, acquire); |
1188 | |
1189 | /* |
1190 | * We expect this to be typical: the goal has already been observed. |
1191 | */ |
1192 | if (__probable(SMR_SEQ_CMP(goal, <=, clk.s_rd_seq))) { |
1193 | return true; |
1194 | } |
1195 | |
1196 | clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed); |
1197 | |
1198 | return __smr_scan(smr, goal, clk, wait); |
1199 | } |
1200 | |
1201 | smr_seq_t |
1202 | smr_advance(smr_t smr) |
1203 | { |
1204 | smr_clock_t clk; |
1205 | |
1206 | assert(!smr_entered(smr)); |
1207 | |
1208 | /* |
1209 | * We assume that there will at least be a successful __smr_poll |
1210 | * call every 2^60 calls to smr_advance() or so, so we do not need |
1211 | * to check if [s_rd_seq, s_wr_seq) is growing too wide. |
1212 | */ |
1213 | static_assert(sizeof(clk.s_wr_seq) == 8); |
1214 | return __smr_wr_advance(smr); |
1215 | } |
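/*
 * Illustrative sketch (hypothetical code): the split advance/poll pattern
 * smr_advance() is meant for, e.g. a limbo list that is reclaimed
 * opportunistically and without ever blocking.  All names are made up.
 *
 *	// retire: unpublish the element, stamp it with a fresh sequence
 *	elem->retire_seq = smr_advance(smr);
 *	enqueue(&limbo_list, elem);
 *
 *	// later, on some periodic or allocation path:
 *	while ((elem = first(&limbo_list)) != NULL &&
 *	    smr_poll(smr, elem->retire_seq)) {
 *		dequeue(&limbo_list, elem);
 *		elem_free(elem);
 *	}
 */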
1216 | |
1217 | smr_seq_t |
1218 | smr_deferred_advance(smr_t smr) |
1219 | { |
1220 | os_atomic_thread_fence(seq_cst); |
1221 | return SMR_SEQ_INC + os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed); |
1222 | } |
1223 | |
1224 | void |
1225 | smr_deferred_advance_commit(smr_t smr, smr_seq_t seq) |
1226 | { |
1227 | /* |
1228 | * no barrier needed: smr_deferred_advance() had one already. |
1229 | * no failure handling: it means someone updated the clock already! |
1230 | * lock_cmpxchg: so that we pre-test for architectures needing it. |
1231 | */ |
1232 | assert(seq != SMR_SEQ_INVALID); |
1233 | lock_cmpxchg(&smr->smr_clock.s_wr_seq, seq - SMR_SEQ_INC, seq, relaxed); |
1234 | } |
1235 | |
1236 | bool |
1237 | smr_poll(smr_t smr, smr_seq_t goal) |
1238 | { |
1239 | assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID); |
1240 | return __smr_poll(smr, goal, false); |
1241 | } |
1242 | |
1243 | void |
1244 | smr_wait(smr_t smr, smr_seq_t goal) |
1245 | { |
1246 | assert(!smr_entered(smr) && goal != SMR_SEQ_INVALID); |
1247 | if (smr->smr_flags & SMR_SLEEPABLE) { |
1248 | assert(get_preemption_level() == 0); |
1249 | } |
1250 | (void)__smr_poll(smr, goal, true); |
1251 | } |
1252 | |
1253 | void |
1254 | smr_synchronize(smr_t smr) |
1255 | { |
1256 | smr_clock_t clk; |
1257 | |
1258 | assert(!smr_entered(smr)); |
1259 | assert(!ml_at_interrupt_context()); |
1260 | if (smr->smr_flags & SMR_SLEEPABLE) { |
1261 | assert(get_preemption_level() == 0); |
1262 | } |
1263 | |
1264 | /* |
1265 | * Similar to __smr_poll() but also does a deferred advance which |
1266 | * __smr_scan will commit. |
1267 | */ |
1268 | |
1269 | clk.s_rd_seq = os_atomic_load(&smr->smr_clock.s_rd_seq, relaxed); |
1270 | os_atomic_thread_fence(seq_cst); |
1271 | clk.s_wr_seq = os_atomic_load(&smr->smr_clock.s_wr_seq, relaxed); |
1272 | |
1273 | 	(void)__smr_scan(smr, clk.s_wr_seq + SMR_SEQ_INC, clk, true);
1274 | } |
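/*
 * Illustrative sketch (hypothetical code): the blocking pattern
 * smr_synchronize() enables, equivalent to an advance immediately
 * followed by smr_wait() on the resulting sequence.
 *
 *	remqueue(&elem->link);        // unpublish the element
 *	smr_synchronize(smr);         // wait out all pre-existing readers
 *	elem_free(elem);              // no reader can still observe elem
 */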
1275 | |
1276 | |
1277 | #pragma mark SMR domains: smr_call & smr_barrier |
1278 | |
1279 | /*! |
1280 | * @struct smr_barrier_ctx |
1281 | * |
1282 | * @brief |
1283 | * Data structure to track the completion of an smr_barrier() call. |
1284 | */ |
1285 | struct smr_barrier_ctx { |
1286 | struct smr *smrb_domain; |
1287 | struct thread *smrb_waiter; |
1288 | uint32_t smrb_pending; |
1289 | uint32_t smrb_count; |
1290 | }; |
1291 | |
1292 | /*! |
1293 | * @struct smr_barrier_job |
1294 | * |
1295 | * @brief |
1296 | * Data structure used to track completion of smr_barrier() calls. |
1297 | */ |
1298 | struct smr_barrier_job { |
1299 | struct smr_barrier_ctx *smrj_context; |
1300 | union { |
1301 | struct smr_node smrj_node; |
1302 | struct mpsc_queue_chain smrj_link; |
1303 | }; |
1304 | }; |
1305 | |
1306 | #define SMR_BARRIER_SIZE 24 |
1307 | static_assert(sizeof(struct smr_barrier_job) == SMR_BARRIER_SIZE); |
1308 | #define SMR_BARRIER_USE_STACK (SMR_BARRIER_SIZE * MAX_CPUS <= 512) |
1309 | |
1310 | static void |
1311 | __smr_worker_check_invariants(struct smr_worker *smrw) |
1312 | { |
1313 | #if MACH_ASSERT |
1314 | smr_pcpu_t pcpu = smrw->whead; |
1315 | uint16_t num = (uint16_t)cpu_number(); |
1316 | |
1317 | assert(!ml_get_interrupts_enabled() || get_preemption_level()); |
1318 | |
1319 | for (; pcpu != *smrw->wold_tail; pcpu = pcpu->drain_next) { |
1320 | assertf(pcpu->qold_seq != SMR_SEQ_INVALID && |
1321 | __smr_pcpu_queued(pcpu), |
1322 | 		    "pcpu %p doesn't belong on %p old queue", pcpu, smrw);
1323 | pcpu->__check_cpu = num; |
1324 | pcpu->__check_reason = (uint8_t)smrw->detach_reason; |
1325 | pcpu->__check_list = 1; |
1326 | } |
1327 | |
1328 | for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) { |
1329 | __assert_only smr_t smr = pcpu->drain_smr; |
1330 | |
1331 | assertf(pcpu->qold_seq == SMR_SEQ_INVALID && |
1332 | pcpu->qage_seq != SMR_SEQ_INVALID && |
1333 | SMR_SEQ_CMP(pcpu->qage_seq, <=, smr->smr_clock.s_wr_seq) && |
1334 | __smr_pcpu_queued(pcpu), |
1335 | 		    "pcpu %p doesn't belong on %p aging queue", pcpu, smrw);
1336 | pcpu->__check_cpu = num; |
1337 | pcpu->__check_reason = (uint8_t)smrw->detach_reason; |
1338 | pcpu->__check_list = 2; |
1339 | } |
1340 | |
1341 | for (; pcpu != *smrw->wcur_tail; pcpu = pcpu->drain_next) { |
1342 | assertf(pcpu->qold_seq == SMR_SEQ_INVALID && |
1343 | pcpu->qage_seq != SMR_SEQ_INVALID && |
1344 | __smr_pcpu_queued(pcpu), |
1345 | 		    "pcpu %p doesn't belong on %p current queue", pcpu, smrw);
1346 | pcpu->__check_cpu = num; |
1347 | pcpu->__check_reason = (uint8_t)smrw->detach_reason; |
1348 | pcpu->__check_list = 3; |
1349 | } |
1350 | |
1351 | assert(pcpu == NULL); |
1352 | #else |
1353 | (void)smrw; |
1354 | #endif |
1355 | } |
1356 | |
1357 | __attribute__((noinline)) |
1358 | static void |
1359 | __smr_cpu_lazy_up(struct smr_worker *smrw) |
1360 | { |
1361 | spl_t spl; |
1362 | |
1363 | /* |
1364 | * calling smr_call/smr_barrier() from the context of a CPU |
1365 | * with a detached worker is illegal. |
1366 | * |
1367 | * However, bound threads might run on a derecommended (IGNORED) |
1368 | * cpu which we correct for here (and the CPU will go back to IGNORED |
1369 | * in smr_cpu_leave()). |
1370 | */ |
1371 | assert(smrw->detach_reason == SMR_CPU_REASON_IGNORED); |
1372 | |
1373 | spl = splsched(); |
1374 | __smrw_lock(smrw); |
1375 | smrw->detach_reason &= ~SMR_CPU_REASON_IGNORED; |
1376 | __smrw_unlock(smrw); |
1377 | splx(spl); |
1378 | } |
1379 | |
1380 | static void |
1381 | __smr_cpu_lazy_up_if_needed(struct smr_worker *smrw) |
1382 | { |
1383 | if (__improbable(smrw->detach_reason != SMR_CPU_REASON_NONE)) { |
1384 | __smr_cpu_lazy_up(smrw); |
1385 | } |
1386 | } |
1387 | |
1388 | static bool |
1389 | __smr_call_should_advance(smr_pcpu_t pcpu) |
1390 | { |
1391 | if (pcpu->qcur_cnt > smr_call_cnt_cap) { |
1392 | return true; |
1393 | } |
1394 | if (pcpu->qcur_size > smr_call_size_cap) { |
1395 | return true; |
1396 | } |
1397 | return false; |
1398 | } |
1399 | |
1400 | static void |
1401 | __smr_call_advance_qcur(smr_t smr, smr_pcpu_t pcpu, bool needs_commit) |
1402 | { |
1403 | smr_seq_t new_seq; |
1404 | |
1405 | if (needs_commit || pcpu->qage_seq) { |
1406 | new_seq = smr_advance(smr); |
1407 | } else { |
1408 | new_seq = smr_deferred_advance(smr); |
1409 | } |
1410 | __builtin_assume(new_seq != SMR_SEQ_INVALID); |
1411 | |
1412 | pcpu->qage_seq = new_seq; |
1413 | pcpu->qage_tail = pcpu->qcur_tail; |
1414 | |
1415 | pcpu->qcur_size = 0; |
1416 | pcpu->qcur_cnt = 0; |
1417 | } |
1418 | |
1419 | static void |
1420 | __smr_call_push(smr_pcpu_t pcpu, smr_node_t node, smr_cb_t cb) |
1421 | { |
1422 | assert(pcpu->c_rd_seq == SMR_SEQ_INVALID); |
1423 | |
1424 | node->smrn_next = NULL; |
1425 | node->smrn_cb = cb; |
1426 | |
1427 | *pcpu->qcur_tail = node; |
1428 | pcpu->qcur_tail = &node->smrn_next; |
1429 | pcpu->qcur_cnt += 1; |
1430 | } |
1431 | |
1432 | static void |
1433 | __smr_call_dispatch(struct smr_worker *smrw, smr_pcpu_t pcpu) |
1434 | { |
1435 | __smr_worker_check_invariants(smrw); |
1436 | |
1437 | if (!__smr_pcpu_queued(pcpu)) { |
1438 | assert(pcpu->qold_seq == SMR_SEQ_INVALID); |
1439 | assert(pcpu->qage_seq != SMR_SEQ_INVALID); |
1440 | |
1441 | pcpu->drain_next = NULL; |
1442 | *smrw->wcur_tail = pcpu; |
1443 | smrw->wcur_tail = &pcpu->drain_next; |
1444 | } |
1445 | } |
1446 | |
1447 | void |
1448 | smr_call(smr_t smr, smr_node_t node, vm_size_t size, smr_cb_t cb) |
1449 | { |
1450 | struct smr_worker *smrw; |
1451 | smr_pcpu_t pcpu; |
1452 | |
1453 | if (__improbable(startup_phase < STARTUP_SUB_EARLY_BOOT)) { |
1454 | return cb(node); |
1455 | } |
1456 | |
1457 | lock_disable_preemption_for_thread(current_thread()); |
1458 | assert(!ml_at_interrupt_context()); |
1459 | |
1460 | smrw = PERCPU_GET(smr_worker); |
1461 | __smr_cpu_lazy_up_if_needed(smrw); |
1462 | |
1463 | pcpu = __smr_pcpu(smr); |
1464 | assert(pcpu->c_rd_seq == SMR_SEQ_INVALID); |
1465 | |
1466 | if (os_add_overflow(pcpu->qcur_size, size, &pcpu->qcur_size)) { |
1467 | pcpu->qcur_size = UINT32_MAX; |
1468 | } |
1469 | |
1470 | __smr_call_push(pcpu, node, cb); |
1471 | if (__smr_call_should_advance(pcpu)) { |
1472 | if (pcpu->qage_seq == SMR_SEQ_INVALID) { |
1473 | __smr_call_advance_qcur(smr, pcpu, false); |
1474 | } |
1475 | __smr_call_dispatch(smrw, pcpu); |
1476 | } |
1477 | |
1478 | return lock_enable_preemption(); |
1479 | } |
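/*
 * Illustrative sketch (hypothetical code): deferred reclamation with
 * smr_call().  The smr_node is embedded in the object and the callback
 * recovers it with __container_of(); "widget" is a made-up type.
 *
 *	struct widget {
 *		struct smr_node w_node;
 *		// ...
 *	};
 *
 *	static void
 *	widget_free_cb(struct smr_node *node)
 *	{
 *		struct widget *w = __container_of(node, struct widget, w_node);
 *
 *		kfree_type(struct widget, w);
 *	}
 *
 *	// after unpublishing "w" from every data structure:
 *	smr_call(smr, &w->w_node, sizeof(*w), widget_free_cb);
 */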
1480 | |
1481 | static inline event_t |
1482 | __smrb_event(struct smr_barrier_ctx *ctx) |
1483 | { |
1484 | return ctx; |
1485 | } |
1486 | |
1487 | static void |
1488 | __smr_barrier_cb(struct smr_node *node) |
1489 | { |
1490 | struct smr_barrier_job *job; |
1491 | struct smr_barrier_ctx *ctx; |
1492 | |
1493 | job = __container_of(node, struct smr_barrier_job, smrj_node); |
1494 | ctx = job->smrj_context; |
1495 | |
1496 | if (os_atomic_dec(&ctx->smrb_pending, relaxed) == 0) { |
1497 | /* |
1498 | * It is permitted to still reach into the context |
1499 | * because smr_barrier() always blocks, which means |
1500 | * that the context will be valid until this wakeup |
1501 | * happens. |
1502 | */ |
1503 | 		thread_wakeup_thread(__smrb_event(ctx), ctx->smrb_waiter);
1504 | } |
1505 | } |
1506 | |
1507 | static bool |
1508 | __smr_barrier_drain(struct smr_worker *smrw, bool needs_commit) |
1509 | { |
1510 | mpsc_queue_chain_t head, tail, it; |
1511 | |
1512 | 	head = mpsc_queue_dequeue_batch(&smrw->barrier_queue, &tail,
1513 | OS_ATOMIC_DEPENDENCY_NONE); |
1514 | |
1515 | mpsc_queue_batch_foreach_safe(it, head, tail) { |
1516 | struct smr_barrier_job *job; |
1517 | struct smr_barrier_ctx *ctx; |
1518 | smr_pcpu_t pcpu; |
1519 | smr_t smr; |
1520 | |
1521 | job = __container_of(it, struct smr_barrier_job, smrj_link); |
1522 | ctx = job->smrj_context; |
1523 | smr = ctx->smrb_domain; |
1524 | pcpu = __smr_pcpu(smr, smrw->processor->cpu_id); |
1525 | |
1526 | pcpu->qcur_size = UINT32_MAX; |
1527 | __smr_call_push(pcpu, &job->smrj_node, __smr_barrier_cb); |
1528 | __smr_call_advance_qcur(smr, pcpu, needs_commit); |
1529 | __smr_call_dispatch(smrw, pcpu); |
1530 | } |
1531 | |
1532 | return head != NULL; |
1533 | } |
1534 | |
1535 | |
1536 | void |
1537 | smr_barrier(smr_t smr) |
1538 | { |
1539 | #if SMR_BARRIER_USE_STACK |
1540 | struct smr_barrier_job jobs[MAX_CPUS]; |
1541 | #else |
1542 | struct smr_barrier_job *jobs; |
1543 | #endif |
1544 | struct smr_barrier_job *job; |
1545 | struct smr_barrier_ctx ctx = { |
1546 | .smrb_domain = smr, |
1547 | .smrb_waiter = current_thread(), |
1548 | .smrb_pending = zpercpu_count(), |
1549 | .smrb_count = zpercpu_count(), |
1550 | }; |
1551 | spl_t spl; |
1552 | |
1553 | /* |
1554 | * First wait for all readers to observe whatever it is |
1555 | * that changed prior to this call. |
1556 | * |
1557 | * _then_ enqueue callbacks that push out anything ahead. |
1558 | */ |
1559 | smr_synchronize(smr); |
1560 | |
1561 | #if !SMR_BARRIER_USE_STACK |
1562 | jobs = kalloc_type(struct smr_barrier_job, ctx.smrb_count, |
1563 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
1564 | #endif |
1565 | job = jobs; |
1566 | spl = splsched(); |
1567 | |
1568 | __smr_cpu_lazy_up_if_needed(PERCPU_GET(smr_worker)); |
1569 | |
1570 | percpu_foreach(smrw, smr_worker) { |
1571 | job->smrj_context = &ctx; |
1572 | 		if (mpsc_queue_append(&smrw->barrier_queue, &job->smrj_link)) {
1573 | __smrw_lock(smrw); |
1574 | __smrw_wakeup_and_unlock(smrw); |
1575 | } |
1576 | job++; |
1577 | } |
1578 | |
1579 | /* |
1580 | * Because we disabled interrupts, our own CPU's callback |
1581 | * can't possibly have run, so just block. |
1582 | * |
1583 | * We must block in order to guarantee the lifetime of "ctx". |
1584 | * (See comment in __smr_barrier_cb). |
1585 | */ |
1586 | 	assert_wait(__smrb_event(&ctx), THREAD_UNINT);
1587 | assert(ctx.smrb_pending > 0); |
1588 | splx(spl); |
1589 | thread_block(THREAD_CONTINUE_NULL); |
1590 | |
1591 | #if !SMR_BARRIER_USE_STACK |
1592 | kfree_type(struct smr_barrier_job, ctx.smrb_count, jobs); |
1593 | #endif |
1594 | } |
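/*
 * Illustrative note (hypothetical code): unlike smr_synchronize(),
 * smr_barrier() also flushes every CPU's pending smr_call() queue for the
 * domain, which is what teardown paths want:
 *
 *	// no new readers or smr_call()s can target the domain at this point
 *	smr_barrier(smr);               // all previously queued callbacks have run
 *	my_subsystem_destroy_caches();  // made-up teardown step
 */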
1595 | |
1596 | |
1597 | #pragma mark SMR domains: smr_worker |
1598 | |
1599 | static void |
1600 | __smr_worker_drain_lock(struct smr_worker *smrw) |
1601 | { |
1602 | for (;;) { |
1603 | ml_set_interrupts_enabled(false); |
1604 | __smrw_lock(smrw); |
1605 | |
1606 | /* |
1607 | * Check we are on an appropriate processor |
1608 | * |
1609 | * Note that we might be running on the canonical |
1610 | * processor incorrectly: if the processor has been |
1611 | * de-recommended but isn't offline. |
1612 | */ |
1613 | if (__probable(current_processor() == smrw->processor)) { |
1614 | if (__probable(!smrw->detach_reason)) { |
1615 | break; |
1616 | } |
1617 | } else { |
1618 | if (__probable(smrw->detach_reason)) { |
1619 | break; |
1620 | } |
1621 | } |
1622 | |
1623 | /* go bind in the right place and retry */ |
1624 | 		thread_bind(__smrw_drain_bind_target(smrw));
1625 | __smrw_unlock(smrw); |
1626 | ml_set_interrupts_enabled(true); |
1627 | thread_block(THREAD_CONTINUE_NULL); |
1628 | } |
1629 | } |
1630 | |
1631 | static void |
1632 | __smr_worker_drain_unlock(struct smr_worker *smrw) |
1633 | { |
1634 | __smrw_unlock(smrw); |
1635 | ml_set_interrupts_enabled(true); |
1636 | } |
1637 | |
1638 | /*! |
1639 | * @function __smr_worker_tick |
1640 | * |
1641 | * @brief |
1642 | * Make gentle progress on the SMR worker queues.
1643 | * |
1644 | * @discussion |
1645 | * One round of progress will: |
1646 | *  - move entries whose sequence has aged to the "old" queue,
1647 | *  - commit entries that have a deferred sequence and let them age.
1648 | *
1649 | * If this results in any callbacks becoming "old",
1650 | * then the worker is woken up to start running callbacks.
1651 | * |
1652 | * This function must run either on the processor for this worker,
1653 | * or with the worker drain lock held.
1654 | */ |
1655 | static void |
1656 | __smr_worker_tick(struct smr_worker *smrw, uint64_t ctime, bool wakeup) |
1657 | { |
1658 | smr_pcpu_t pcpu = *smrw->wold_tail; |
1659 | |
1660 | __smr_worker_check_invariants(smrw); |
1661 | |
1662 | for (; pcpu != *smrw->wage_tail; pcpu = pcpu->drain_next) { |
1663 | assert(pcpu->qold_seq == SMR_SEQ_INVALID); |
1664 | assert(pcpu->qage_seq != SMR_SEQ_INVALID); |
1665 | |
1666 | pcpu->qold_seq = pcpu->qage_seq; |
1667 | pcpu->qold_tail = pcpu->qage_tail; |
1668 | |
1669 | pcpu->qage_seq = SMR_SEQ_INVALID; |
1670 | } |
1671 | |
1672 | for (; pcpu; pcpu = pcpu->drain_next) { |
1673 | assert(pcpu->qold_seq == SMR_SEQ_INVALID); |
1674 | assert(pcpu->qage_seq != SMR_SEQ_INVALID); |
1675 | |
1676 | 		smr_deferred_advance_commit(pcpu->drain_smr, pcpu->qage_seq);
1677 | } |
1678 | |
1679 | smrw->wold_tail = smrw->wage_tail; |
1680 | smrw->wage_tail = smrw->wcur_tail; |
1681 | smrw->drain_ctime = ctime; |
1682 | |
1683 | __smr_worker_check_invariants(smrw); |
1684 | |
1685 | if (wakeup && smrw->wold_tail != &smrw->whead) { |
1686 | __smrw_lock(smrw); |
1687 | __smrw_wakeup_and_unlock(smrw); |
1688 | } |
1689 | } |
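/*
 * Illustrative timeline (hypothetical sequence numbers) of the ageing
 * strategy implemented by __smr_worker_tick():
 *
 *	tick N:   a pcpu on the "cur" list has qage_seq == 42 from a deferred
 *	          advance; the tick commits 42 and moves the pcpu to "age".
 *	tick N+1: 42 has now aged; the pcpu moves to "old" and the worker is
 *	          woken up, so __smr_worker_drain_one() can poll for 42 and
 *	          run the callbacks behind it.
 */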
1690 | |
1691 | static void |
1692 | __smr_worker_update_wold_tail(struct smr_worker *smrw, smr_pcpu_t *new_tail) |
1693 | { |
1694 | smr_pcpu_t *old_tail = smrw->wold_tail; |
1695 | |
1696 | if (smrw->wcur_tail == old_tail) { |
1697 | smrw->wage_tail = new_tail; |
1698 | smrw->wcur_tail = new_tail; |
1699 | } else if (smrw->wage_tail == old_tail) { |
1700 | smrw->wage_tail = new_tail; |
1701 | } |
1702 | |
1703 | smrw->wold_tail = new_tail; |
1704 | } |
1705 | |
1706 | static void |
1707 | __smr_worker_drain_one(struct smr_worker *smrw, smr_pcpu_t pcpu) |
1708 | { |
1709 | smr_t smr = pcpu->drain_smr; |
1710 | smr_seq_t seq = pcpu->qold_seq; |
1711 | smr_node_t head; |
1712 | |
1713 | /* |
1714 | * Step 1: pop the "old" items, |
1715 | * (qold_tail/qold_seq left dangling) |
1716 | */ |
1717 | |
1718 | assert(seq != SMR_SEQ_INVALID); |
1719 | head = pcpu->qhead; |
1720 | pcpu->qhead = *pcpu->qold_tail; |
1721 | *pcpu->qold_tail = NULL; |
1722 | |
1723 | /* |
1724 | * Step 2: Reconstruct the queue |
1725 | * based on the sequence numbers and count fields. |
1726 | * |
1727 | * Do what __smr_worker_tick() would do on this queue: |
1728 | * - commit the aging queue |
1729 | * - advance the current queue if needed |
1730 | */ |
1731 | |
1732 | if (pcpu->qage_seq != SMR_SEQ_INVALID) { |
1733 | assert(pcpu->qage_tail != pcpu->qold_tail); |
1734 | |
		smr_deferred_advance_commit(smr, pcpu->qage_seq);
1736 | pcpu->qold_seq = pcpu->qage_seq; |
1737 | pcpu->qold_tail = pcpu->qage_tail; |
1738 | } else { |
1739 | assert(pcpu->qage_tail == pcpu->qold_tail); |
1740 | |
1741 | pcpu->qold_seq = SMR_SEQ_INVALID; |
1742 | pcpu->qold_tail = &pcpu->qhead; |
1743 | } |
1744 | |
1745 | if (__smr_call_should_advance(pcpu)) { |
1746 | __smr_call_advance_qcur(smr, pcpu, false); |
1747 | } else { |
1748 | pcpu->qage_seq = SMR_SEQ_INVALID; |
1749 | pcpu->qage_tail = pcpu->qold_tail; |
1750 | if (pcpu->qcur_cnt == 0) { |
1751 | pcpu->qcur_tail = pcpu->qage_tail; |
1752 | } |
1753 | } |
1754 | |
1755 | if (pcpu->qold_seq != SMR_SEQ_INVALID) { |
1756 | /* |
1757 | * The node has gained an "old seq" back, |
1758 | * it goes to the ready queue. |
1759 | */ |
1760 | pcpu->drain_next = *smrw->wold_tail; |
1761 | *smrw->wold_tail = pcpu; |
		__smr_worker_update_wold_tail(smrw, &pcpu->drain_next);
1764 | } else if (pcpu->qage_seq != SMR_SEQ_INVALID) { |
1765 | /* |
1766 | * The node has gained an "age seq" back, |
1767 | * it needs to age and wait for a tick |
		 * for its sequence number to be committed.
1769 | */ |
1770 | pcpu->drain_next = NULL; |
1771 | *smrw->wcur_tail = pcpu; |
1772 | smrw->wcur_tail = &pcpu->drain_next; |
1773 | } else { |
1774 | /* |
1775 | * The node is empty or with "current" |
1776 | * callbacks only, it can be dequeued. |
1777 | */ |
1778 | assert(!__smr_call_should_advance(pcpu)); |
1779 | pcpu->__check_cpu = (uint16_t)cpu_number(); |
1780 | pcpu->__check_reason = (uint8_t)smrw->detach_reason; |
1781 | pcpu->__check_list = 0; |
1782 | __smr_pcpu_set_not_queued(pcpu); |
1783 | } |
1784 | |
1785 | /* |
1786 | * Step 3: drain callbacks. |
1787 | */ |
1788 | __smr_worker_check_invariants(smrw); |
1789 | __smr_worker_drain_unlock(smrw); |
1790 | |
	__smr_poll(smr, seq, true);
1792 | __smr_call_drain(head); |
1793 | |
1794 | __smr_worker_drain_lock(smrw); |
1795 | } |
1796 | |
1797 | static void |
1798 | __smr_worker_continue(void *arg, wait_result_t wr __unused) |
1799 | { |
1800 | smr_pcpu_t pcpu = NULL, next = NULL; |
1801 | struct smr_worker *const smrw = arg; |
1802 | uint64_t deadline; |
1803 | |
1804 | __smr_worker_drain_lock(smrw); |
1805 | __smr_worker_check_invariants(smrw); |
1806 | |
1807 | if (smrw->wold_tail != &smrw->whead) { |
1808 | next = smrw->whead; |
1809 | smrw->whead = *smrw->wold_tail; |
1810 | *smrw->wold_tail = NULL; |
		__smr_worker_update_wold_tail(smrw, &smrw->whead);
1812 | } |
1813 | |
1814 | /* |
1815 | * The pipeline of per-cpu SMR data structures with pending |
1816 | * smr_call() callbacks has three stages: wcur -> wage -> wold. |
1817 | * |
1818 | * In order to guarantee forward progress, a tick happens |
1819 | * for each of them, either via __smr_worker_tick(), |
1820 | * or via __smr_worker_drain_one(). |
1821 | * |
	 * The second tick will happen either because the core stayed
1823 | * busy enough that a subsequent smr_cpu_tick() decided to |
1824 | * perform it, or because the CPU idled, and smr_cpu_leave() |
1825 | * will perform an unconditional __smr_worker_tick(). |
1826 | */ |
1827 | __smr_barrier_drain(smrw, false); |
	__smr_worker_tick(smrw, mach_absolute_time(), false);
1829 | |
1830 | while ((pcpu = next)) { |
1831 | next = next->drain_next; |
1832 | __smr_worker_drain_one(smrw, pcpu); |
1833 | } |
1834 | |
1835 | if (__improbable(smrw->whead && smrw->detach_reason)) { |
1836 | /* |
1837 | * If the thread isn't bound, we want to flush anything |
1838 | * that is pending without causing too much contention. |
1839 | * |
1840 | * Sleep for a bit in order to give the system time |
1841 | * to observe any advance commits we did. |
1842 | */ |
1843 | deadline = mach_absolute_time() + cpu_checkin_min_interval; |
1844 | } else { |
1845 | deadline = TIMEOUT_WAIT_FOREVER; |
1846 | } |
	waitq_assert_wait64_locked(&smrw->waitq, __smrw_drain_event(smrw),
	    THREAD_UNINT, TIMEOUT_URGENCY_SYS_NORMAL, deadline,
	    TIMEOUT_NO_LEEWAY, smrw->thread);
1850 | |
1851 | /* |
1852 | * Make sure there's no barrier left, after we called assert_wait() |
1853 | * in order to pair with __smr_barrier_cb(). If we do find some, |
1854 | * we must be careful about invariants and forward progress. |
1855 | * |
1856 | * For affected domains, the dequeued barriers have been added |
	 * to their "qage" queue. If their "qage" queue was non-empty,
	 * then its "qage_seq" was already committed, and we must preserve
1859 | * this invariant. |
1860 | * |
1861 | * Affected domains that were idle before will get enqueued on this |
1862 | * worker's "wcur" queue. In order to guarantee forward progress, |
1863 | * we must force a tick if both the "wage" and "wold" queues |
1864 | * of the worker are empty. |
1865 | */ |
1866 | if (__improbable(__smr_barrier_drain(smrw, true))) { |
1867 | if (smrw->wage_tail == &smrw->whead) { |
			__smr_worker_tick(smrw, mach_absolute_time(), false);
1869 | } |
1870 | } |
1871 | |
1872 | __smr_worker_check_invariants(smrw); |
1873 | __smr_worker_drain_unlock(smrw); |
1874 | |
	thread_block_parameter(__smr_worker_continue, smrw);
1876 | } |
1877 | |
1878 | |
1879 | #pragma mark SMR domains: scheduler integration |
1880 | |
1881 | #if CONFIG_QUIESCE_COUNTER |
1882 | __startup_data |
1883 | static uint64_t _Atomic quiesce_gen_startup; |
1884 | static uint64_t _Atomic *quiesce_genp = &quiesce_gen_startup; |
1885 | static uint64_t _Atomic quiesce_ctime; |
1886 | |
1887 | void |
1888 | cpu_quiescent_set_storage(uint64_t _Atomic *ptr) |
1889 | { |
1890 | /* |
1891 | * Transfer to the real location for the commpage. |
1892 | * |
	 * This is safe to do this way because the system
	 * is still single-threaded at this point.
1895 | */ |
1896 | uint64_t gen = os_atomic_load(&quiesce_gen_startup, relaxed); |
1897 | |
1898 | os_atomic_store(ptr, gen, relaxed); |
1899 | quiesce_genp = ptr; |
1900 | } |
1901 | |
1902 | static smr_seq_t |
1903 | cpu_quiescent_gen_to_seq(uint64_t gen) |
1904 | { |
1905 | return gen * SMR_SEQ_INC + SMR_SEQ_INIT; |
1906 | } |
1907 | |
1908 | static void |
1909 | cpu_quiescent_advance(uint64_t gen, uint64_t ctime __kdebug_only) |
1910 | { |
1911 | smr_seq_t seq = cpu_quiescent_gen_to_seq(gen); |
1912 | |
1913 | os_atomic_thread_fence(seq_cst); |
1914 | |
1915 | percpu_foreach(it, smr_worker) { |
1916 | smr_seq_t rd_seq = os_atomic_load(&it->rd_quiesce_seq, relaxed); |
1917 | |
1918 | if (rd_seq != SMR_SEQ_INVALID && SMR_SEQ_CMP(rd_seq, <, seq)) { |
1919 | return; |
1920 | } |
1921 | } |
1922 | |
1923 | os_atomic_thread_fence(seq_cst); |
1924 | |
1925 | if (lock_cmpxchg(quiesce_genp, gen, gen + 1, relaxed)) { |
1926 | KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), |
1927 | gen, 0, ctime, 0); |
1928 | } |
1929 | } |
1930 | |
1931 | static void |
1932 | cpu_quiescent_join(struct smr_worker *smrw) |
1933 | { |
1934 | uint64_t gen = os_atomic_load(quiesce_genp, relaxed); |
1935 | |
1936 | assert(smrw->rd_quiesce_seq == SMR_SEQ_INVALID); |
1937 | os_atomic_store(&smrw->rd_quiesce_seq, |
1938 | cpu_quiescent_gen_to_seq(gen), relaxed); |
1939 | os_atomic_thread_fence(seq_cst); |
1940 | } |
1941 | |
1942 | static void |
1943 | cpu_quiescent_tick(struct smr_worker *smrw, uint64_t ctime, uint64_t interval) |
1944 | { |
1945 | uint64_t gen = os_atomic_load(quiesce_genp, relaxed); |
1946 | smr_seq_t seq = cpu_quiescent_gen_to_seq(gen); |
1947 | |
1948 | if (smrw->rd_quiesce_seq == SMR_SEQ_INVALID) { |
1949 | /* |
1950 | * Likely called because of the scheduler tick, |
1951 | * smr_maintenance() will do the right thing. |
1952 | */ |
1953 | assert(current_processor()->state != PROCESSOR_RUNNING); |
1954 | } else if (seq != smrw->rd_quiesce_seq) { |
1955 | /* |
1956 | * Someone managed to update the sequence already, |
1957 | * learn it, update our ctime. |
1958 | */ |
1959 | os_atomic_store(&smrw->rd_quiesce_seq, seq, release); |
1960 | os_atomic_store(&quiesce_ctime, ctime, relaxed); |
1961 | os_atomic_thread_fence(seq_cst); |
1962 | } else if ((ctime - os_atomic_load(&quiesce_ctime, relaxed)) > interval) { |
1963 | /* |
1964 | * The system looks busy enough we want to update |
1965 | * the counter faster than every scheduler tick. |
1966 | */ |
1967 | os_atomic_store(&quiesce_ctime, ctime, relaxed); |
1968 | cpu_quiescent_advance(gen, ctime); |
1969 | } |
1970 | } |
1971 | |
1972 | static void |
1973 | cpu_quiescent_leave(struct smr_worker *smrw) |
1974 | { |
1975 | assert(smrw->rd_quiesce_seq != SMR_SEQ_INVALID); |
1976 | os_atomic_store(&smrw->rd_quiesce_seq, SMR_SEQ_INVALID, release); |
1977 | } |
1978 | #endif /* CONFIG_QUIESCE_COUNTER */ |
1979 | |
1980 | uint32_t |
1981 | smr_cpu_checkin_get_min_interval_us(void) |
1982 | { |
1983 | return cpu_checkin_min_interval_us; |
1984 | } |
1985 | |
1986 | void |
1987 | smr_cpu_checkin_set_min_interval_us(uint32_t new_value_us) |
1988 | { |
1989 | /* clamp to something vaguely sane */ |
1990 | if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US) { |
1991 | new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US; |
1992 | } |
1993 | |
1994 | cpu_checkin_min_interval_us = new_value_us; |
1995 | |
1996 | uint64_t abstime = 0; |
	clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
	    NSEC_PER_USEC, &abstime);
1999 | cpu_checkin_min_interval = abstime; |
2000 | } |
2001 | |
2002 | __startup_func |
2003 | static void |
2004 | smr_cpu_checkin_init_min_interval_us(void) |
2005 | { |
2006 | smr_cpu_checkin_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US); |
2007 | } |
2008 | STARTUP(TUNABLES, STARTUP_RANK_FIRST, smr_cpu_checkin_init_min_interval_us); |
2009 | |
2010 | static void |
2011 | __smr_cpu_init_thread(struct smr_worker *smrw) |
2012 | { |
2013 | char name[MAXTHREADNAMESIZE]; |
2014 | thread_t th = THREAD_NULL; |
2015 | |
	kernel_thread_create(__smr_worker_continue, smrw, MINPRI_KERNEL, &th);
2017 | smrw->thread = th; |
2018 | |
	snprintf(name, sizeof(name), "smr.reclaim:%d", smrw->processor->cpu_id);
	thread_set_thread_name(th, name);
	thread_start_in_assert_wait(th,
	    &smrw->waitq, __smrw_drain_event(smrw), THREAD_UNINT);
2023 | } |
2024 | |
2025 | void |
2026 | smr_cpu_init(struct processor *processor) |
2027 | { |
2028 | struct smr_worker *smrw; |
2029 | |
2030 | smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor); |
2031 | smrw->processor = processor; |
2032 | |
	waitq_init(&smrw->waitq, WQT_QUEUE, SYNC_POLICY_FIFO);
2034 | smrw->detach_reason = SMR_CPU_REASON_OFFLINE; |
2035 | |
2036 | smrq_init(&smrw->sect_queue); |
2037 | smrw->wold_tail = &smrw->whead; |
2038 | smrw->wage_tail = &smrw->whead; |
2039 | smrw->wcur_tail = &smrw->whead; |
	mpsc_queue_init(&smrw->barrier_queue);
2041 | |
2042 | if (processor != master_processor) { |
2043 | __smr_cpu_init_thread(smrw); |
2044 | } |
2045 | } |
2046 | STARTUP_ARG(LOCKS, STARTUP_RANK_LAST, smr_cpu_init, master_processor); |
2047 | STARTUP_ARG(THREAD_CALL, STARTUP_RANK_LAST, |
2048 | __smr_cpu_init_thread, PERCPU_GET_MASTER(smr_worker)); |
2049 | |
2050 | /*! |
2051 | * @function smr_cpu_up() |
2052 | * |
2053 | * @brief |
2054 | * Scheduler callback to notify this processor is going up. |
2055 | * |
2056 | * @discussion |
2057 | * Called at splsched() under the sched_available_cores_lock. |
2058 | */ |
2059 | void |
2060 | smr_cpu_up(struct processor *processor, smr_cpu_reason_t reason) |
2061 | { |
2062 | struct smr_worker *smrw; |
2063 | |
2064 | smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor); |
2065 | |
2066 | __smrw_lock(smrw); |
2067 | if (reason != SMR_CPU_REASON_IGNORED) { |
2068 | assert((smrw->detach_reason & reason) == reason); |
2069 | } |
2070 | smrw->detach_reason &= ~reason; |
2071 | __smrw_unlock(smrw); |
2072 | } |
2073 | |
2074 | static void |
2075 | __smr_cpu_down_and_unlock( |
2076 | struct processor *processor, |
2077 | struct smr_worker *smrw, |
2078 | smr_cpu_reason_t reason) |
2079 | { |
2080 | bool detach = !smrw->detach_reason; |
2081 | |
2082 | /* |
2083 | * When reason is SMR_CPU_REASON_IGNORED, |
2084 | * this is called from smr_cpu_leave() on the way to idle. |
2085 | * |
	 * However this isn't synchronized with the recommendation
2087 | * lock, hence it is possible that the CPU might actually |
2088 | * be recommended again while we're on the way to idle. |
2089 | * |
2090 | * By re-checking processor recommendation under |
2091 | * the __smrw_lock, we serialize with smr_cpu_up(). |
2092 | */ |
2093 | if (reason != SMR_CPU_REASON_IGNORED) { |
2094 | assert((smrw->detach_reason & reason) == 0); |
2095 | } else if (processor->is_recommended) { |
2096 | /* |
2097 | * The race we try to detect happened, |
2098 | * do nothing. |
2099 | */ |
2100 | reason = SMR_CPU_REASON_NONE; |
2101 | detach = false; |
2102 | } |
2103 | smrw->detach_reason |= reason; |
2104 | reason = smrw->detach_reason; |
2105 | |
2106 | if (detach && smrw->whead) { |
2107 | detach = !__smrw_wakeup_and_unlock(smrw); |
2108 | } else { |
2109 | __smrw_unlock(smrw); |
2110 | } |
2111 | |
2112 | if (detach) { |
		thread_unbind_after_queue_shutdown(smrw->thread, processor);
2114 | } |
2115 | } |
2116 | |
2117 | /*! |
2118 | * @function smr_cpu_down() |
2119 | * |
2120 | * @brief |
2121 | * Scheduler callback to notify this processor is going down. |
2122 | * |
2123 | * @discussion |
2124 | * Called at splsched() when the processor run queue is being shut down. |
2125 | */ |
2126 | void |
2127 | smr_cpu_down(struct processor *processor, smr_cpu_reason_t reason) |
2128 | { |
2129 | struct smr_worker *smrw; |
2130 | |
2131 | smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor); |
2132 | |
2133 | __smrw_lock(smrw); |
2134 | __smr_cpu_down_and_unlock(processor, smrw, reason); |
2135 | } |
2136 | |
2137 | |
2138 | /*! |
2139 | * @function smr_cpu_join() |
2140 | * |
2141 | * @brief |
2142 | * Scheduler callback to notify this processor is going out of idle. |
2143 | * |
2144 | * @discussion |
2145 | * Called at splsched(). |
2146 | */ |
2147 | void |
2148 | smr_cpu_join(struct processor *processor, uint64_t ctime __unused) |
2149 | { |
2150 | #if CONFIG_QUIESCE_COUNTER |
2151 | struct smr_worker *smrw; |
2152 | |
2153 | smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor); |
2154 | cpu_quiescent_join(smrw); |
2155 | #else |
2156 | (void)processor; |
2157 | #endif /* CONFIG_QUIESCE_COUNTER */ |
2158 | } |
2159 | |
2160 | /*! |
2161 | * @function smr_cpu_tick() |
2162 | * |
2163 | * @brief |
2164 | * Scheduler callback invoked during the scheduler maintenance routine. |
2165 | * |
2166 | * @discussion |
2167 | * Called at splsched(). |
2168 | */ |
2169 | void |
2170 | smr_cpu_tick(uint64_t ctime, bool safe_point) |
2171 | { |
2172 | struct smr_worker *smrw = PERCPU_GET(smr_worker); |
2173 | uint64_t interval = cpu_checkin_min_interval; |
2174 | |
2175 | #if CONFIG_QUIESCE_COUNTER |
2176 | cpu_quiescent_tick(smrw, ctime, interval); |
2177 | #endif /* CONFIG_QUIESCE_COUNTER */ |
2178 | |
2179 | /* |
2180 | * if a bound thread was woken up on a derecommended core, |
2181 | * our detach_reason might be "IGNORED" and we want to leave |
2182 | * it alone in that case |
2183 | */ |
2184 | if (safe_point && !smrw->detach_reason && smrw->whead && |
2185 | current_processor()->state == PROCESSOR_RUNNING && |
2186 | (ctime - smrw->drain_ctime) > interval) { |
2187 | __smr_worker_tick(smrw, ctime, true); |
2188 | } |
2189 | } |
2190 | |
2191 | /*! |
2192 | * @function smr_cpu_leave() |
2193 | * |
2194 | * @brief |
2195 | * Scheduler callback to notify this processor is going idle. |
2196 | * |
2197 | * @discussion |
2198 | * Called at splsched(). |
2199 | */ |
2200 | void |
2201 | smr_cpu_leave(struct processor *processor, uint64_t ctime) |
2202 | { |
2203 | struct smr_worker *smrw; |
2204 | |
2205 | smrw = PERCPU_GET_RELATIVE(smr_worker, processor, processor); |
2206 | |
2207 | /* |
2208 | * if a bound thread was woken up on a derecommended core, |
2209 | * our detach_reason might be "IGNORED" and we want to leave |
2210 | * it alone in that case |
2211 | * |
2212 | * See comment in __smr_worker_continue for why this must be |
2213 | * done unconditionally otherwise. |
2214 | */ |
2215 | if (!smrw->detach_reason && smrw->whead) { |
2216 | __smr_worker_tick(smrw, ctime, true); |
2217 | } |
2218 | |
2219 | if (__improbable(!processor->is_recommended)) { |
2220 | __smrw_lock(smrw); |
		__smr_cpu_down_and_unlock(processor, smrw, SMR_CPU_REASON_IGNORED);
2222 | } |
2223 | |
2224 | #if CONFIG_QUIESCE_COUNTER |
2225 | cpu_quiescent_leave(smrw); |
2226 | #endif /* CONFIG_QUIESCE_COUNTER */ |
2227 | } |
2228 | |
2229 | /*! |
2230 | * @function smr_maintenance() |
2231 | * |
2232 | * @brief |
2233 | * Scheduler callback called at the scheduler tick. |
2234 | * |
2235 | * @discussion |
2236 | * Called at splsched(). |
2237 | */ |
2238 | void |
2239 | smr_maintenance(uint64_t ctime) |
2240 | { |
2241 | #if CONFIG_QUIESCE_COUNTER |
2242 | cpu_quiescent_advance(os_atomic_load(quiesce_genp, relaxed), ctime); |
2243 | #else |
2244 | (void)ctime; |
2245 | #endif /* CONFIG_QUIESCE_COUNTER */ |
2246 | } |
2247 | |
2248 | |
2249 | #pragma mark - SMR hash tables |
2250 | |
2251 | static struct smrq_slist_head * |
2252 | smr_hash_alloc_array(size_t size) |
2253 | { |
2254 | return kalloc_type(struct smrq_slist_head, size, |
2255 | Z_WAITOK | Z_ZERO | Z_SPRAYQTN); |
2256 | } |
2257 | |
2258 | static void |
2259 | smr_hash_free_array(struct smrq_slist_head *array, size_t size) |
2260 | { |
2261 | kfree_type(struct smrq_slist_head, size, array); |
2262 | } |
2263 | |
2264 | static inline uintptr_t |
2265 | smr_hash_array_encode(struct smrq_slist_head *array, uint16_t order) |
2266 | { |
2267 | uintptr_t ptr; |
2268 | |
2269 | ptr = (uintptr_t)array; |
2270 | ptr &= ~SMRH_ARRAY_ORDER_MASK; |
2271 | ptr |= (uintptr_t)order << SMRH_ARRAY_ORDER_SHIFT; |
2272 | |
2273 | return ptr; |
2274 | } |
2275 | |
2276 | #pragma mark SMR simple hash tables |
2277 | |
2278 | void |
2279 | smr_hash_init(struct smr_hash *smrh, size_t size) |
2280 | { |
2281 | struct smrq_slist_head *array; |
2282 | uint16_t shift; |
2283 | |
2284 | assert(size); |
	shift = (uint16_t)flsll(size - 1);
2286 | size = 1UL << shift; |
2287 | if (startup_phase >= STARTUP_SUB_LOCKDOWN) { |
2288 | assert(size * sizeof(struct smrq_slist_head) <= |
2289 | KALLOC_SAFE_ALLOC_SIZE); |
2290 | } |
2291 | array = smr_hash_alloc_array(size); |
2292 | |
2293 | *smrh = (struct smr_hash){ |
		.smrh_array = smr_hash_array_encode(array, 64 - shift),
2295 | }; |
2296 | } |
2297 | |
2298 | void |
2299 | smr_hash_destroy(struct smr_hash *smrh) |
2300 | { |
2301 | struct smr_hash_array array = smr_hash_array_decode(smrh); |
2302 | |
	smr_hash_free_array(array.smrh_array, smr_hash_size(array));
2304 | *smrh = (struct smr_hash){ }; |
2305 | } |
2306 | |
2307 | void |
2308 | __smr_hash_serialized_clear( |
2309 | struct smr_hash *smrh, |
2310 | smrh_traits_t smrht, |
2311 | void (^free)(void *obj)) |
2312 | { |
2313 | struct smr_hash_array array = smr_hash_array_decode(smrh); |
2314 | |
2315 | for (size_t i = 0; i < smr_hash_size(array); i++) { |
2316 | struct smrq_slink *link; |
2317 | __smrq_slink_t *prev; |
2318 | |
2319 | prev = &array.smrh_array[i].first; |
2320 | while ((link = smr_serialized_load(prev))) { |
2321 | prev = &link->next; |
			free(__smrht_link_to_obj(smrht, link));
2323 | } |
2324 | |
2325 | smr_clear_store(&array.smrh_array[i].first); |
2326 | } |
2327 | |
2328 | smrh->smrh_count = 0; |
2329 | } |
2330 | |
2331 | kern_return_t |
2332 | __smr_hash_shrink_and_unlock( |
2333 | struct smr_hash *smrh, |
2334 | lck_mtx_t *lock, |
2335 | smrh_traits_t smrht) |
2336 | { |
2337 | struct smr_hash_array decptr = smr_hash_array_decode(smrh); |
2338 | struct smrq_slist_head *newarray, *oldarray; |
2339 | uint16_t neworder = decptr.smrh_order + 1; |
	size_t oldsize = smr_hash_size(decptr);
2341 | size_t newsize = oldsize / 2; |
2342 | |
2343 | assert(newsize); |
2344 | |
2345 | if (os_atomic_load(&smrh->smrh_resizing, relaxed)) { |
		lck_mtx_unlock(lock);
2347 | return KERN_FAILURE; |
2348 | } |
2349 | |
2350 | os_atomic_store(&smrh->smrh_resizing, true, relaxed); |
	lck_mtx_unlock(lock);
2352 | |
	newarray = smr_hash_alloc_array(newsize);
2354 | if (newarray == NULL) { |
2355 | os_atomic_store(&smrh->smrh_resizing, false, relaxed); |
2356 | return KERN_RESOURCE_SHORTAGE; |
2357 | } |
2358 | |
	lck_mtx_lock(lock);
2360 | |
2361 | /* |
2362 | * Step 1: collapse all the chains in pairs. |
2363 | */ |
2364 | oldarray = decptr.smrh_array; |
2365 | |
2366 | for (size_t i = 0; i < newsize; i++) { |
2367 | newarray[i] = oldarray[i]; |
2368 | smrq_serialized_append(&newarray[i], &oldarray[i + newsize]); |
2369 | } |
2370 | |
2371 | /* |
2372 | * Step 2: publish the new array. |
2373 | */ |
2374 | os_atomic_store(&smrh->smrh_array, |
2375 | smr_hash_array_encode(newarray, neworder), release); |
2376 | |
2377 | os_atomic_store(&smrh->smrh_resizing, false, relaxed); |
2378 | |
	lck_mtx_unlock(lock);
2380 | |
2381 | /* |
2382 | * Step 3: free the old array once readers can't observe the old values. |
2383 | */ |
	smr_synchronize(smrht->domain);
2385 | |
	smr_hash_free_array(oldarray, oldsize);
2387 | return KERN_SUCCESS; |
2388 | } |
2389 | |
2390 | kern_return_t |
2391 | __smr_hash_grow_and_unlock( |
2392 | struct smr_hash *smrh, |
2393 | lck_mtx_t *lock, |
2394 | smrh_traits_t smrht) |
2395 | { |
2396 | struct smr_hash_array decptr = smr_hash_array_decode(smrh); |
2397 | struct smrq_slist_head *newarray, *oldarray; |
2398 | __smrq_slink_t **prevarray; |
2399 | uint16_t neworder = decptr.smrh_order - 1; |
	size_t oldsize = smr_hash_size(decptr);
2401 | size_t newsize = 2 * oldsize; |
2402 | bool needs_another_round = false; |
2403 | |
2404 | if (smrh->smrh_resizing) { |
		lck_mtx_unlock(lock);
2406 | return KERN_FAILURE; |
2407 | } |
2408 | |
2409 | smrh->smrh_resizing = true; |
	lck_mtx_unlock(lock);
2411 | |
	newarray = smr_hash_alloc_array(newsize);
2413 | if (newarray == NULL) { |
2414 | os_atomic_store(&smrh->smrh_resizing, false, relaxed); |
2415 | return KERN_RESOURCE_SHORTAGE; |
2416 | } |
2417 | |
2418 | prevarray = kalloc_type(__smrq_slink_t *, newsize, |
2419 | Z_WAITOK | Z_ZERO | Z_SPRAYQTN); |
2420 | if (prevarray == NULL) { |
		smr_hash_free_array(newarray, newsize);
2422 | os_atomic_store(&smrh->smrh_resizing, false, relaxed); |
2423 | return KERN_RESOURCE_SHORTAGE; |
2424 | } |
2425 | |
2426 | |
	lck_mtx_lock(lock);
2428 | |
2429 | /* |
2430 | * Step 1: create a duplicated array with twice as many heads. |
2431 | */ |
2432 | oldarray = decptr.smrh_array; |
2433 | |
	memcpy(newarray, oldarray, oldsize * sizeof(newarray[0]));
	memcpy(newarray + oldsize, oldarray, oldsize * sizeof(newarray[0]));
2436 | |
2437 | /* |
2438 | * Step 2: Publish the new array, and wait for readers to observe it |
2439 | * before we do any change. |
2440 | */ |
2441 | os_atomic_store(&smrh->smrh_array, |
2442 | smr_hash_array_encode(newarray, neworder), release); |
2443 | |
	smr_synchronize(smrht->domain);
2445 | |
2446 | |
2447 | /* |
2448 | * Step 3: split the lists. |
2449 | */ |
2450 | |
2451 | /* |
2452 | * If the list we are trying to split looked like this, |
2453 | * where L elements will go to the "left" bucket and "R" |
2454 | * to the right one: |
2455 | * |
2456 | * old_head --> L1 --> L2 -> L5 |
2457 | * \ / \ |
2458 | * -> R3 --> R4 -> R6 --> NULL |
2459 | * |
2460 | * Then make sure the new heads point to their legitimate first element, |
2461 | * leading to this state: |
2462 | * |
2463 | * l_head --> L1 --> L2 -> L5 |
2464 | * \ / \ |
2465 | * r_head ----------------> R3 --> R4 -> R6 --> NULL |
2466 | * |
2467 | * |
2468 | * prevarray[left] = &L2->next |
2469 | * prevarray[right] = &r_head |
2470 | * oldarray[old] = L2 |
2471 | */ |
2472 | |
2473 | for (size_t i = 0; i < oldsize; i++) { |
2474 | struct smrq_slink *link, *next; |
2475 | uint32_t want_mask; |
2476 | |
2477 | link = smr_serialized_load(&oldarray[i].first); |
2478 | if (link == NULL) { |
2479 | continue; |
2480 | } |
2481 | |
2482 | want_mask = smrht->obj_hash(link, 0) & oldsize; |
2483 | while ((next = smr_serialized_load(&link->next)) && |
2484 | (smrht->obj_hash(next, 0) & oldsize) == want_mask) { |
2485 | link = next; |
2486 | } |
2487 | |
2488 | if (want_mask == 0) { |
2489 | /* elements seen go to the "left" bucket */ |
2490 | prevarray[i] = &link->next; |
2491 | prevarray[i + oldsize] = &newarray[i + oldsize].first; |
2492 | smr_serialized_store_relaxed(prevarray[i + oldsize], next); |
2493 | } else { |
2494 | /* elements seen go to the "right" bucket */ |
2495 | prevarray[i] = &newarray[i].first; |
2496 | prevarray[i + oldsize] = &link->next; |
2497 | smr_serialized_store_relaxed(prevarray[i], next); |
2498 | } |
2499 | |
2500 | smr_serialized_store_relaxed(&oldarray[i].first, |
2501 | next ? link : NULL); |
2502 | |
2503 | needs_another_round |= (next != NULL); |
2504 | } |
2505 | |
2506 | /* |
2507 | * At this point, when we split further, we must wait for |
2508 | * readers to observe the previous state before we split |
2509 | * further. Indeed, reusing the example above, the next |
2510 | * round of splitting would end up with this: |
2511 | * |
2512 | * l_head --> L1 --> L2 ----------------> L5 |
2513 | * / \ |
2514 | * r_head ----------------> R3 --> R4 -> R6 --> NULL |
2515 | * |
2516 | * |
2517 | * prevarray[left] = &L2->next |
2518 | * prevarray[right] = &R4->next |
2519 | * oldarray[old] = R4 |
2520 | * |
2521 | * But we must be sure that no readers can observe r_head |
2522 | * having been L1, otherwise a stale reader might skip over |
2523 | * R3/R4. |
2524 | * |
2525 | * Generally speaking we need to do that each time we do a round |
2526 | * of splitting that isn't terminating the list with NULL. |
2527 | */ |
2528 | |
2529 | while (needs_another_round) { |
		smr_synchronize(smrht->domain);
2531 | |
2532 | needs_another_round = false; |
2533 | |
2534 | for (size_t i = 0; i < oldsize; i++) { |
2535 | struct smrq_slink *link, *next; |
2536 | uint32_t want_mask; |
2537 | |
2538 | link = smr_serialized_load(&oldarray[i].first); |
2539 | if (link == NULL) { |
2540 | continue; |
2541 | } |
2542 | |
2543 | /* |
2544 | * If `prevarray[i]` (left) points to the linkage |
2545 | * we stopped at, then it means the next element |
2546 | * will be "to the right" and vice versa. |
2547 | * |
2548 | * We also already know "next" exists, so only probe |
2549 | * after it. |
2550 | */ |
2551 | if (prevarray[i] == &link->next) { |
2552 | want_mask = (uint32_t)oldsize; |
2553 | } else { |
2554 | want_mask = 0; |
2555 | } |
2556 | |
2557 | link = smr_serialized_load(&link->next); |
2558 | |
2559 | while ((next = smr_serialized_load(&link->next)) && |
2560 | (smrht->obj_hash(next, 0) & oldsize) == want_mask) { |
2561 | link = next; |
2562 | } |
2563 | |
2564 | if (want_mask == 0) { |
2565 | /* elements seen go to the "left" bucket */ |
2566 | prevarray[i] = &link->next; |
2567 | smr_serialized_store_relaxed(prevarray[i + oldsize], next); |
2568 | } else { |
2569 | /* elements seen go to the "right" bucket */ |
2570 | smr_serialized_store_relaxed(prevarray[i], next); |
2571 | prevarray[i + oldsize] = &link->next; |
2572 | } |
2573 | |
2574 | smr_serialized_store_relaxed(&oldarray[i].first, |
2575 | next ? link : NULL); |
2576 | |
2577 | needs_another_round |= (next != NULL); |
2578 | } |
2579 | } |
2580 | |
2581 | smrh->smrh_resizing = false; |
	lck_mtx_unlock(lock);
2583 | |
2584 | /* |
2585 | * Step 4: cleanup, no need to wait for readers, this happened already |
2586 | * at least once for splitting reasons. |
2587 | */ |
	smr_hash_free_array(oldarray, oldsize);
2589 | kfree_type(__smrq_slink_t *, newsize, prevarray); |
2590 | return KERN_SUCCESS; |
2591 | } |
2592 | |
2593 | #pragma mark SMR scalable hash tables |
2594 | |
2595 | #define SMRSH_MIGRATED ((struct smrq_slink *)SMRSH_BUCKET_STOP_BIT) |
static LCK_GRP_DECLARE(smr_shash_grp, "smr_shash");
2597 | |
2598 | static inline size_t |
2599 | __smr_shash_min_size(struct smr_shash *smrh) |
2600 | { |
2601 | return 1ul << smrh->smrsh_min_shift; |
2602 | } |
2603 | |
2604 | static inline size_t |
2605 | __smr_shash_size_for_shift(uint8_t shift) |
2606 | { |
2607 | return (~0u >> shift) + 1; |
2608 | } |
2609 | |
2610 | static inline size_t |
2611 | __smr_shash_cursize(smrsh_state_t state) |
2612 | { |
	return __smr_shash_size_for_shift(state.curshift);
2614 | } |
2615 | |
2616 | static void |
2617 | __smr_shash_bucket_init(hw_lck_ptr_t *head) |
2618 | { |
2619 | hw_lck_ptr_init(head, __smr_shash_bucket_stop(head), &smr_shash_grp); |
2620 | } |
2621 | |
2622 | static void |
2623 | __smr_shash_bucket_destroy(hw_lck_ptr_t *head) |
2624 | { |
2625 | hw_lck_ptr_destroy(head, &smr_shash_grp); |
2626 | } |
2627 | |
2628 | __attribute__((noinline)) |
2629 | void * |
2630 | __smr_shash_entered_find_slow( |
2631 | const struct smr_shash *smrh, |
2632 | smrh_key_t key, |
2633 | hw_lck_ptr_t *head, |
2634 | smrh_traits_t traits) |
2635 | { |
2636 | struct smrq_slink *link; |
2637 | smrsh_state_t state; |
2638 | uint32_t hash; |
2639 | |
2640 | /* wait for the rehashing to be done into their target buckets */ |
2641 | hw_lck_ptr_wait_for_value(head, SMRSH_MIGRATED, &smr_shash_grp); |
2642 | |
2643 | state = os_atomic_load(&smrh->smrsh_state, dependency); |
	hash = __smr_shash_hash(smrh, state.newidx, key, traits);
	head = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);

	link = hw_lck_ptr_value(head);
2648 | while (!__smr_shash_is_stop(link)) { |
2649 | if (traits->obj_equ(link, key)) { |
2650 | return __smrht_link_to_obj(traits, link); |
2651 | } |
2652 | link = smr_entered_load(&link->next); |
2653 | } |
2654 | |
2655 | assert(link == __smr_shash_bucket_stop(head)); |
2656 | return NULL; |
2657 | } |
2658 | |
2659 | static const uint8_t __smr_shash_grow_ratio[] = { |
2660 | [SMRSH_COMPACT] = 6, |
2661 | [SMRSH_BALANCED] = 4, |
2662 | [SMRSH_BALANCED_NOSHRINK] = 4, |
2663 | [SMRSH_FASTEST] = 2, |
2664 | }; |
2665 | |
2666 | static inline uint64_t |
2667 | __smr_shash_count(struct smr_shash *smrh) |
2668 | { |
2669 | int64_t count = (int64_t)counter_load(&smrh->smrsh_count); |
2670 | |
2671 | /* |
	 * negative values make no sense and are likely due to some
2673 | * stale values being read. |
2674 | */ |
2675 | return count < 0 ? 0ull : (uint64_t)count; |
2676 | } |
2677 | |
2678 | static inline bool |
2679 | __smr_shash_should_grow( |
2680 | struct smr_shash *smrh, |
2681 | smrsh_state_t state, |
2682 | uint64_t count) |
2683 | { |
2684 | size_t size = __smr_shash_cursize(state); |
2685 | |
2686 | /* grow if elem:bucket ratio is worse than grow_ratio:1 */ |
2687 | return count > __smr_shash_grow_ratio[smrh->smrsh_policy] * size; |
2688 | } |
2689 | |
2690 | static inline bool |
2691 | __smr_shash_should_reseed( |
2692 | struct smr_shash *smrh, |
2693 | size_t observed_depth) |
2694 | { |
2695 | return observed_depth > 10 * __smr_shash_grow_ratio[smrh->smrsh_policy]; |
2696 | } |
2697 | |
2698 | static inline bool |
2699 | __smr_shash_should_shrink( |
2700 | struct smr_shash *smrh, |
2701 | smrsh_state_t state, |
2702 | uint64_t count) |
2703 | { |
2704 | size_t size = __smr_shash_cursize(state); |
2705 | |
2706 | switch (smrh->smrsh_policy) { |
2707 | case SMRSH_COMPACT: |
2708 | /* shrink if bucket:elem ratio is worse than 1:1 */ |
2709 | return size > count && size > __smr_shash_min_size(smrh); |
2710 | case SMRSH_BALANCED: |
2711 | /* shrink if bucket:elem ratio is worse than 2:1 */ |
2712 | return size > 2 * count && size > __smr_shash_min_size(smrh); |
2713 | case SMRSH_BALANCED_NOSHRINK: |
2714 | case SMRSH_FASTEST: |
2715 | return false; |
2716 | } |
2717 | } |
2718 | |
2719 | static inline void |
2720 | __smr_shash_schedule_rehash( |
2721 | struct smr_shash *smrh, |
2722 | smrh_traits_t traits, |
2723 | smrsh_rehash_t reason) |
2724 | { |
2725 | smrsh_rehash_t rehash; |
2726 | |
2727 | rehash = os_atomic_load(&smrh->smrsh_rehashing, relaxed); |
2728 | if (rehash & reason) { |
2729 | return; |
2730 | } |
2731 | |
2732 | rehash = os_atomic_or_orig(&smrh->smrsh_rehashing, reason, relaxed); |
2733 | if (!rehash) { |
		thread_call_enter1(smrh->smrsh_callout,
		    __DECONST(void *, traits));
2736 | } |
2737 | } |
2738 | |
2739 | void * |
2740 | __smr_shash_entered_get_or_insert( |
2741 | struct smr_shash *smrh, |
2742 | smrh_key_t key, |
2743 | struct smrq_slink *link, |
2744 | smrh_traits_t traits) |
2745 | { |
2746 | struct smrq_slink *first; |
2747 | struct smrq_slink *other; |
2748 | uint32_t hash, depth; |
2749 | smrsh_state_t state; |
2750 | hw_lck_ptr_t *head; |
2751 | void *obj; |
2752 | |
2753 | state = os_atomic_load(&smrh->smrsh_state, dependency); |
	hash = __smr_shash_hash(smrh, state.curidx, key, traits);
	head = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2756 | first = hw_lck_ptr_lock(head, &smr_shash_grp); |
2757 | |
2758 | if (__improbable(first == SMRSH_MIGRATED)) { |
2759 | hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp); |
2760 | |
2761 | state = os_atomic_load(&smrh->smrsh_state, dependency); |
		hash = __smr_shash_hash(smrh, state.newidx, key, traits);
		head = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2764 | first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp); |
2765 | } |
2766 | |
2767 | depth = 0; |
2768 | other = first; |
	while (!__smr_shash_is_stop(other)) {
2770 | depth++; |
2771 | if (traits->obj_equ(other, key)) { |
			obj = __smrht_link_to_obj(traits, other);
2773 | if (traits->obj_try_get(obj)) { |
2774 | hw_lck_ptr_unlock(head, first, |
2775 | &smr_shash_grp); |
2776 | return obj; |
2777 | } |
2778 | break; |
2779 | } |
2780 | other = smr_serialized_load(&other->next); |
2781 | } |
2782 | |
2783 | counter_inc_preemption_disabled(&smrh->smrsh_count); |
2784 | smr_serialized_store_relaxed(&link->next, first); |
2785 | hw_lck_ptr_unlock(head, link, &smr_shash_grp); |
2786 | |
	if (__smr_shash_should_reseed(smrh, depth)) {
		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_RESEED);
	} else if (depth * 2 >= __smr_shash_grow_ratio[smrh->smrsh_policy] &&
	    __smr_shash_should_grow(smrh, state, __smr_shash_count(smrh))) {
		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_GROW);
2792 | } |
2793 | return NULL; |
2794 | } |
2795 | |
2796 | __abortlike |
2797 | static void |
2798 | __smr_shash_missing_elt_panic( |
2799 | struct smr_shash *smrh, |
2800 | struct smrq_slink *link, |
2801 | smrh_traits_t traits) |
2802 | { |
	panic("Unable to find item %p (linkage %p) in %p (traits %p)",
2804 | __smrht_link_to_obj(traits, link), link, smrh, traits); |
2805 | } |
2806 | |
2807 | smr_shash_mut_cursor_t |
2808 | __smr_shash_entered_mut_begin( |
2809 | struct smr_shash *smrh, |
2810 | struct smrq_slink *link, |
2811 | smrh_traits_t traits) |
2812 | { |
2813 | struct smrq_slink *first, *next; |
2814 | __smrq_slink_t *prev; |
2815 | smrsh_state_t state; |
2816 | hw_lck_ptr_t *head; |
2817 | uint32_t hash; |
2818 | |
2819 | state = os_atomic_load(&smrh->smrsh_state, dependency); |
	hash = __smr_shash_hash(smrh, state.curidx, link, traits);
	head = __smr_shash_bucket(smrh, state, SMRSH_CUR, hash);
2822 | first = hw_lck_ptr_lock(head, &smr_shash_grp); |
2823 | |
2824 | if (__improbable(first == SMRSH_MIGRATED)) { |
2825 | hw_lck_ptr_unlock_nopreempt(head, first, &smr_shash_grp); |
2826 | |
2827 | state = os_atomic_load(&smrh->smrsh_state, dependency); |
		hash = __smr_shash_hash(smrh, state.newidx, link, traits);
		head = __smr_shash_bucket(smrh, state, SMRSH_NEW, hash);
2830 | first = hw_lck_ptr_lock_nopreempt(head, &smr_shash_grp); |
2831 | } |
2832 | |
2833 | next = first; |
2834 | while (next != link) { |
		if (__smr_shash_is_stop(next)) {
2836 | __smr_shash_missing_elt_panic(smrh, link, traits); |
2837 | } |
2838 | prev = &next->next; |
2839 | next = smr_serialized_load(prev); |
2840 | } |
2841 | |
2842 | return (smr_shash_mut_cursor_t){ .head = head, .prev = prev }; |
2843 | } |
2844 | |
2845 | void |
2846 | __smr_shash_entered_mut_erase( |
2847 | struct smr_shash *smrh, |
2848 | smr_shash_mut_cursor_t cursor, |
2849 | struct smrq_slink *link, |
2850 | smrh_traits_t traits) |
2851 | { |
2852 | struct smrq_slink *next, *first; |
2853 | smrsh_state_t state; |
2854 | |
	first = hw_lck_ptr_value(cursor.head);
2856 | |
2857 | next = smr_serialized_load(&link->next); |
2858 | if (first == link) { |
2859 | counter_dec_preemption_disabled(&smrh->smrsh_count); |
2860 | hw_lck_ptr_unlock(cursor.head, next, &smr_shash_grp); |
2861 | } else { |
2862 | smr_serialized_store_relaxed(cursor.prev, next); |
2863 | counter_dec_preemption_disabled(&smrh->smrsh_count); |
2864 | hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp); |
2865 | } |
2866 | |
2867 | state = atomic_load_explicit(&smrh->smrsh_state, memory_order_relaxed); |
	if (first == link && __smr_shash_is_stop(next) &&
	    __smr_shash_should_shrink(smrh, state, __smr_shash_count(smrh))) {
		__smr_shash_schedule_rehash(smrh, traits, SMRSH_REHASH_SHRINK);
2871 | } |
2872 | } |
2873 | |
2874 | void |
2875 | __smr_shash_entered_mut_replace( |
2876 | smr_shash_mut_cursor_t cursor, |
2877 | struct smrq_slink *old_link, |
2878 | struct smrq_slink *new_link) |
2879 | { |
2880 | struct smrq_slink *first, *next; |
2881 | |
	first = hw_lck_ptr_value(cursor.head);
2883 | |
2884 | next = smr_serialized_load(&old_link->next); |
2885 | smr_serialized_store_relaxed(&new_link->next, next); |
2886 | if (first == old_link) { |
2887 | hw_lck_ptr_unlock(cursor.head, new_link, &smr_shash_grp); |
2888 | } else { |
2889 | smr_serialized_store_relaxed(cursor.prev, new_link); |
2890 | hw_lck_ptr_unlock(cursor.head, first, &smr_shash_grp); |
2891 | } |
2892 | } |
2893 | |
2894 | void |
2895 | __smr_shash_entered_mut_abort(smr_shash_mut_cursor_t cursor) |
2896 | { |
2897 | hw_lck_ptr_unlock(cursor.head, |
2898 | hw_lck_ptr_value(cursor.head), &smr_shash_grp); |
2899 | } |
2900 | |
2901 | static kern_return_t |
2902 | __smr_shash_rehash_with_target( |
2903 | struct smr_shash *smrh, |
2904 | smrsh_state_t state, |
2905 | uint8_t newshift, |
2906 | smrh_traits_t traits) |
2907 | { |
2908 | const size_t FLAT_SIZE = 256; |
2909 | struct smrq_slink *flat_queue[FLAT_SIZE]; |
2910 | |
2911 | size_t oldsize, newsize; |
2912 | hw_lck_ptr_t *oldarray; |
2913 | hw_lck_ptr_t *newarray; |
2914 | uint32_t newseed; |
2915 | uint8_t oldidx; |
2916 | |
2917 | /* |
2918 | * This function resizes a scalable hash table. |
2919 | * |
2920 | * It doesn't require a lock because it is the callout |
2921 | * of a THREAD_CALL_ONCE thread call. |
2922 | */ |
2923 | |
2924 | oldidx = state.curidx; |
2925 | state.newidx = 1 - state.curidx; |
2926 | state.newshift = newshift; |
2927 | assert(__smr_shash_load_array(smrh, state.newidx) == NULL); |
2928 | |
2929 | oldsize = __smr_shash_cursize(state); |
	newsize = __smr_shash_size_for_shift(newshift);

	oldarray = __smr_shash_load_array(smrh, state.curidx);
	newarray = (hw_lck_ptr_t *)smr_hash_alloc_array(newsize);
2934 | newseed = (uint32_t)early_random(); |
2935 | |
2936 | if (newarray == NULL) { |
2937 | return KERN_RESOURCE_SHORTAGE; |
2938 | } |
2939 | |
2940 | /* |
2941 | * Step 1: initialize the new array and seed, |
2942 | * and then publish the state referencing it. |
2943 | * |
2944 | * We do not need to synchronize explicitly with SMR, |
2945 | * because readers/writers will notice rehashing when |
2946 | * the bucket they interact with has a SMRSH_MIGRATED |
2947 | * value. |
2948 | */ |
2949 | |
2950 | for (size_t i = 0; i < newsize; i++) { |
		__smr_shash_bucket_init(&newarray[i]);
2952 | } |
2953 | os_atomic_store(&smrh->smrsh_array[state.newidx], newarray, relaxed); |
2954 | os_atomic_store(&smrh->smrsh_seed[state.newidx], newseed, relaxed); |
2955 | os_atomic_store(&smrh->smrsh_state, state, release); |
2956 | |
2957 | /* |
2958 | * Step 2: migrate buckets "atomically" under the old bucket lock. |
2959 | * |
2960 | * This migration is atomic for writers because |
2961 | * they take the old bucket lock first, and if |
2962 | * they observe SMRSH_MIGRATED as the value, |
2963 | * go look in the new bucket instead. |
2964 | * |
2965 | * This migration is atomic for readers, because |
2966 | * as we move elements to their new buckets, |
2967 | * the hash chains will not circle back to their |
2968 | * bucket head (the "stop" value won't match), |
2969 | * or the bucket head will be SMRSH_MIGRATED. |
2970 | * |
2971 | * This causes a slowpath which spins waiting |
2972 | * for SMRSH_MIGRATED to appear and then looks |
2973 | * in the new bucket. |
2974 | */ |
2975 | for (size_t i = 0; i < oldsize; i++) { |
2976 | struct smrq_slink *first, *link, *next; |
2977 | hw_lck_ptr_t *head; |
2978 | uint32_t hash; |
2979 | size_t n = 0; |
2980 | |
2981 | link = first = hw_lck_ptr_lock(&oldarray[i], &smr_shash_grp); |
2982 | |
2983 | while (!__smr_shash_is_stop(link)) { |
2984 | flat_queue[n++ % FLAT_SIZE] = link; |
2985 | link = smr_serialized_load(&link->next); |
2986 | } |
2987 | |
2988 | while (n-- > 0) { |
2989 | for (size_t j = (n % FLAT_SIZE) + 1; j-- > 0;) { |
2990 | link = flat_queue[j]; |
2991 | hash = traits->obj_hash(link, newseed); |
2992 | head = &newarray[hash >> newshift]; |
2993 | next = hw_lck_ptr_lock_nopreempt(head, |
2994 | &smr_shash_grp); |
2995 | smr_serialized_store_relaxed(&link->next, next); |
2996 | hw_lck_ptr_unlock_nopreempt(head, link, |
2997 | &smr_shash_grp); |
2998 | } |
2999 | n &= ~(FLAT_SIZE - 1); |
3000 | |
3001 | /* |
3002 | * If there were more than FLAT_SIZE elements in the |
			 * chain (which is super unlikely and in many ways
			 * worrisome), then we need to repopulate
3005 | * the flattened queue array for each run. |
3006 | * |
3007 | * This is O(n^2) but we have worse problems anyway |
3008 | * if we ever hit this path. |
3009 | */ |
3010 | if (__improbable(n > 0)) { |
3011 | link = first; |
3012 | for (size_t j = 0; j < n - FLAT_SIZE; j++) { |
3013 | link = smr_serialized_load(&link->next); |
3014 | } |
3015 | |
3016 | flat_queue[0] = link; |
3017 | for (size_t j = 1; j < FLAT_SIZE; j++) { |
3018 | link = smr_serialized_load(&link->next); |
3019 | flat_queue[j] = link; |
3020 | } |
3021 | } |
3022 | } |
3023 | |
3024 | hw_lck_ptr_unlock(&oldarray[i], SMRSH_MIGRATED, &smr_shash_grp); |
3025 | } |
3026 | |
3027 | /* |
3028 | * Step 3: deallocate the old array of buckets, |
3029 | * making sure to hide it from readers. |
3030 | */ |
3031 | |
3032 | state.curshift = state.newshift; |
3033 | state.curidx = state.newidx; |
3034 | os_atomic_store(&smrh->smrsh_state, state, release); |
3035 | |
	smr_synchronize(traits->domain);
3037 | |
3038 | os_atomic_store(&smrh->smrsh_array[oldidx], NULL, relaxed); |
3039 | for (size_t i = 0; i < oldsize; i++) { |
		__smr_shash_bucket_destroy(&oldarray[i]);
	}
	smr_hash_free_array((struct smrq_slist_head *)oldarray, oldsize);
3043 | |
3044 | return KERN_SUCCESS; |
3045 | } |
3046 | |
3047 | static void |
3048 | __smr_shash_rehash(thread_call_param_t arg0, thread_call_param_t arg1) |
3049 | { |
3050 | struct smr_shash *smrh = arg0; |
3051 | smrh_traits_t traits = arg1; |
3052 | smrsh_rehash_t reason; |
3053 | smrsh_state_t state; |
3054 | uint64_t count; |
3055 | kern_return_t kr; |
3056 | |
3057 | do { |
3058 | reason = os_atomic_xchg(&smrh->smrsh_rehashing, |
3059 | SMRSH_REHASH_RUNNING, relaxed); |
3060 | |
3061 | state = os_atomic_load(&smrh->smrsh_state, relaxed); |
3062 | count = __smr_shash_count(smrh); |
3063 | |
3064 | if (__smr_shash_should_grow(smrh, state, count)) { |
			kr = __smr_shash_rehash_with_target(smrh, state,
			    state.curshift - 1, traits);
		} else if (__smr_shash_should_shrink(smrh, state, count)) {
			kr = __smr_shash_rehash_with_target(smrh, state,
			    state.curshift + 1, traits);
		} else if (reason & SMRSH_REHASH_RESEED) {
			kr = __smr_shash_rehash_with_target(smrh, state,
			    state.curshift, traits);
3073 | } else { |
3074 | kr = KERN_SUCCESS; |
3075 | } |
3076 | |
3077 | if (kr == KERN_RESOURCE_SHORTAGE) { |
3078 | uint64_t deadline; |
3079 | |
3080 | os_atomic_or(&smrh->smrsh_rehashing, reason, relaxed); |
			nanoseconds_to_deadline(NSEC_PER_MSEC, &deadline);
			thread_call_enter1_delayed(smrh->smrsh_callout,
			    arg1, deadline);
3084 | break; |
3085 | } |
3086 | } while (!os_atomic_cmpxchg(&smrh->smrsh_rehashing, |
3087 | SMRSH_REHASH_RUNNING, SMRSH_REHASH_NONE, relaxed)); |
3088 | } |
3089 | |
3090 | void |
3091 | smr_shash_init(struct smr_shash *smrh, smrsh_policy_t policy, size_t min_size) |
3092 | { |
3093 | smrsh_state_t state; |
3094 | hw_lck_ptr_t *array; |
3095 | uint8_t shift; |
3096 | size_t size; |
3097 | |
3098 | switch (policy) { |
3099 | case SMRSH_COMPACT: |
3100 | if (min_size < 2) { |
3101 | min_size = 2; |
3102 | } |
3103 | break; |
3104 | default: |
3105 | if (min_size < 16) { |
3106 | min_size = 16; |
3107 | } |
3108 | break; |
3109 | } |
3110 | |
3111 | switch (policy) { |
3112 | case SMRSH_COMPACT: |
3113 | size = MIN(2, min_size); |
3114 | break; |
3115 | case SMRSH_BALANCED: |
3116 | case SMRSH_BALANCED_NOSHRINK: |
3117 | size = MIN(16, min_size); |
3118 | break; |
3119 | case SMRSH_FASTEST: |
3120 | size = min_size; |
3121 | break; |
3122 | } |
3123 | |
3124 | if (size > KALLOC_SAFE_ALLOC_SIZE / sizeof(*array)) { |
3125 | size = KALLOC_SAFE_ALLOC_SIZE / sizeof(*array); |
3126 | } |
3127 | shift = (uint8_t)__builtin_clz((uint32_t)(size - 1)); |
3128 | size = (~0u >> shift) + 1; |
3129 | array = (hw_lck_ptr_t *)smr_hash_alloc_array(size); |
3130 | for (size_t i = 0; i < size; i++) { |
		__smr_shash_bucket_init(&array[i]);
3132 | } |
3133 | |
3134 | state = (smrsh_state_t){ |
3135 | .curshift = shift, |
3136 | .newshift = shift, |
3137 | }; |
3138 | *smrh = (struct smr_shash){ |
3139 | .smrsh_array[0] = array, |
3140 | .smrsh_seed[0] = (uint32_t)early_random(), |
3141 | .smrsh_state = state, |
3142 | .smrsh_policy = policy, |
		.smrsh_min_shift = (uint8_t)flsll(min_size - 1),
3144 | }; |
3145 | counter_alloc(&smrh->smrsh_count); |
	smrh->smrsh_callout = thread_call_allocate_with_options(__smr_shash_rehash,
	    smrh, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
3148 | } |
3149 | |
3150 | void |
3151 | __smr_shash_destroy( |
3152 | struct smr_shash *smrh, |
3153 | smrh_traits_t traits, |
3154 | void (^free)(void *)) |
3155 | { |
3156 | smrsh_state_t state; |
3157 | hw_lck_ptr_t *array; |
3158 | size_t size; |
3159 | |
	thread_call_cancel_wait(smrh->smrsh_callout);
3161 | |
3162 | state = os_atomic_load(&smrh->smrsh_state, dependency); |
3163 | assert(state.curidx == state.newidx); |
3164 | assert(__smr_shash_load_array(smrh, 1 - state.curidx) == NULL); |
3165 | size = __smr_shash_cursize(state); |
	array = __smr_shash_load_array(smrh, state.curidx);
3167 | |
3168 | if (free) { |
3169 | for (size_t i = 0; i < size; i++) { |
3170 | struct smrq_slink *link, *next; |
3171 | |
			next = hw_lck_ptr_value(&array[i]);
			while (!__smr_shash_is_stop(next)) {
3174 | link = next; |
3175 | next = smr_serialized_load(&link->next); |
3176 | free(__smrht_link_to_obj(traits, link)); |
3177 | } |
3178 | } |
3179 | } |
3180 | for (size_t i = 0; i < size; i++) { |
		__smr_shash_bucket_destroy(&array[i]);
3182 | } |
3183 | |
	thread_call_free(smrh->smrsh_callout);
	counter_free(&smrh->smrsh_count);
	smr_hash_free_array((struct smrq_slist_head *)array, size);
	bzero(smrh, sizeof(*smrh));
3188 | } |
3189 | |
3190 | |
3191 | #pragma mark misc |
3192 | |
3193 | void |
3194 | __smr_linkage_invalid(__smrq_link_t *link) |
3195 | { |
3196 | struct smrq_link *elem = __container_of(link, struct smrq_link, next); |
3197 | struct smrq_link *next = smr_serialized_load(&elem->next); |
3198 | |
	panic("Invalid queue linkage: elt:%p next:%p next->prev:%p",
3200 | elem, next, __container_of(next->prev, struct smrq_link, next)); |
3201 | } |
3202 | |
3203 | void |
3204 | __smr_stail_invalid(__smrq_slink_t *link, __smrq_slink_t *last) |
3205 | { |
3206 | struct smrq_slink *elem = __container_of(link, struct smrq_slink, next); |
3207 | struct smrq_slink *next = smr_serialized_load(&elem->next); |
3208 | |
3209 | if (next) { |
		panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3211 | elem, next); |
3212 | } else { |
		panic("Invalid queue tail (early end): elt:%p tail:%p",
3214 | elem, __container_of(last, struct smrq_slink, next)); |
3215 | } |
3216 | } |
3217 | |
3218 | void |
3219 | __smr_tail_invalid(__smrq_link_t *link, __smrq_link_t *last) |
3220 | { |
3221 | struct smrq_link *elem = __container_of(link, struct smrq_link, next); |
3222 | struct smrq_link *next = smr_serialized_load(&elem->next); |
3223 | |
3224 | if (next) { |
		panic("Invalid queue tail (element past end): elt:%p elt->next:%p",
3226 | elem, next); |
3227 | } else { |
		panic("Invalid queue tail (early end): elt:%p tail:%p",
3229 | elem, __container_of(last, struct smrq_link, next)); |
3230 | } |
3231 | } |
3232 | |