flow_track.c source code [xnu/bsd/skywalk/nexus/flowswitch/flow/flow_track.c]

1	/*
2	* Copyright (c) 2017-2020 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	#include <skywalk/os_skywalk_private.h>
30	#include <skywalk/nexus/flowswitch/fsw_var.h>
31	#include <skywalk/nexus/flowswitch/flow/flow_var.h>
32	#include <netinet/tcp.h>
33	#include <netinet/tcp_fsm.h>
34	#include <netinet/tcp_seq.h>
35	#include <netinet/tcp_timer.h>
36	#include <netinet/tcp_var.h>
37	#include <netinet/udp.h>
38	#include <netinet/in_stat.h>
39	#include <netinet/ip.h>
40	#include <netinet/ip6.h>
41	#include <sys/kdebug.h>
42
/* min/max linger time (in seconds) */
44	#define FLOWTRACK_LINGER_MIN 1
45	#define FLOWTRACK_LINGER_MAX 120
46
/* maximum allowed rate of SYNs per second */
48	#define FLOWTRACK_SYN_RATE 20
49
50	static int flow_track_tcp(struct flow_entry , struct* flow_track *,
51	struct flow_track , struct* __kern_packet *, bool);
52	static int flow_track_udp(struct flow_entry , struct* flow_track *,
53	struct flow_track , struct* __kern_packet *, bool);
54
55	static void
56	flow_track_tcp_get_wscale(struct flow_track s, struct* __kern_packet *pkt)
57	{
58	const uint8_t hdr = (uint8_t )(void *)pkt->pkt_flow_tcp_hdr;
59	int hlen = pkt->pkt_flow_tcp_hlen;
60	uint8_t optlen, wscale = `0`;
61	const uint8_t *opt;
62
63	_CASSERT(sizeof(s->fse_flags) == sizeof(uint16_t));
64	ASSERT(hlen >= (int)sizeof(struct tcphdr));
65
66	opt = hdr + sizeof(struct tcphdr);
67	hlen -= sizeof(struct tcphdr);
68	while (hlen >= `3`) {
69	switch (*opt) {
70	case TCPOPT_EOL:
71	case TCPOPT_NOP:
72	++opt;
73	--hlen;
74	break;
75	case TCPOPT_WINDOW:
76	wscale = opt[`2`];
77	if (wscale > TCP_MAX_WINSHIFT) {
78	wscale = TCP_MAX_WINSHIFT;
79	}
80	os_atomic_or(&s->fse_flags, FLOWSTATEF_WSCALE, relaxed);
81	OS_FALLTHROUGH;
82	default:
83	optlen = opt[`1`];
84	if (optlen < `2`) {
85	optlen = `2`;
86	}
87	hlen -= optlen;
88	opt += optlen;
89	break;
90	}
91	}
92	s->fse_wscale = wscale;
93	}
94
95	static void
96	flow_track_tcp_init(struct flow_entry fe, struct* flow_track *src,
97	struct flow_track dst, struct* __kern_packet *pkt)
98	{
99	#pragma unused(dst)
100	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
101
102	/*
103	* Source state initialization.
104	*/
105	src->fse_state = TCPS_SYN_SENT;
106	src->fse_seqlo = ntohl(pkt->pkt_flow_tcp_seq);
107	src->fse_seqhi = (src->fse_seqlo + pkt->pkt_flow_ulen + `1`);
108	if (tcp_flags & TH_SYN) {
109	src->fse_seqhi++;
110	flow_track_tcp_get_wscale(s: src, pkt);
111	}
112	if (tcp_flags & TH_FIN) {
113	src->fse_seqhi++;
114	}
115
116	src->fse_max_win = MAX(ntohs(pkt->pkt_flow_tcp_win), `1`);
117	if (src->fse_flags & FLOWSTATEF_WSCALE) {
118	/ remove scale factor from initial window /
119	int win = src->fse_max_win;
120	ASSERT(src->fse_wscale <= TCP_MAX_WINSHIFT);
121	win += (`1` << src->fse_wscale);
122	src->fse_max_win = (uint16_t)((win - `1`) >> src->fse_wscale);
123	}
124
125	/*
126	* Destination state initialization.
127	*/
128	dst->fse_state = TCPS_CLOSED;
129	dst->fse_seqhi = `1`;
130	dst->fse_max_win = `1`;
131
132	/*
133	* Linger time (in seconds).
134	*/
135	fe->fe_linger_wait = (`2` * tcp_msl) / TCP_RETRANSHZ;
136	if (fe->fe_linger_wait < FLOWTRACK_LINGER_MIN) {
137	fe->fe_linger_wait = FLOWTRACK_LINGER_MIN;
138	} else if (fe->fe_linger_wait > FLOWTRACK_LINGER_MAX) {
139	fe->fe_linger_wait = FLOWTRACK_LINGER_MAX;
140	}
141
142	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
143	}
144
145	/*
146	* The TCP ACK RTT tracking is a coarse grain measurement of the time it takes
147	* for a endpoint to process incoming segment and generate ACK, at the point of
148	* observation. For flowswitch, it means that:
149	*
150	* local end RTT = local stack processing time
151	* remote end RTT = driver + network + remote endpoint's processing time
152	*
153	* Since the measurement is lightweight and sampling based, it won't learn and
154	* distinguish lost segment's ACK. So we could occasionally get large RTT
155	* sample from an ACK to a retransmitted segment. Thus rtt_max is not any
156	* meaningful to us.
157	*/
158	__attribute__((always_inline))
159	static inline void
160	flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input,
161	struct flow_track src, struct* flow_track *dst, uint8_t tcp_flags,
162	uint32_t seq, uint32_t ack, uint32_t ulen)
163	{
164	#pragma unused(fe, input) /* KDBG defined as noop in release build */
165	uint64_t dst_last, src_last;
166	uint64_t now, time_diff;
167	uint32_t curval, oldval;
168	clock_sec_t tv_sec;
169	clock_usec_t tv_usec;
170
171	src_last = src->fse_rtt.frtt_last;
172	dst_last = dst->fse_rtt.frtt_last;
173
174	/ start a new RTT tracking session under sampling rate limit /
175	if (dst_last == `0` \|\|
176	_net_uptime - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) {
177	if (ulen > `0` &&
178	dst->fse_rtt.frtt_timestamp == `0`) {
179	dst->fse_rtt.frtt_timestamp = mach_absolute_time();
180	dst->fse_rtt.frtt_last = _net_uptime;
181	dst->fse_rtt.frtt_seg_begin = seq;
182	dst->fse_rtt.frtt_seg_end = seq + ulen;
183	KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT \| DBG_FUNC_START),
184	SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
185	input ? `1` : `0`);
186	}
187	}
188
189	/ we have an ACK, see if current tracking session matches it /
190	if (tcp_flags & TH_ACK) {
191	if (src->fse_rtt.frtt_timestamp != `0` &&
192	src->fse_rtt.frtt_seg_begin <= ack) {
193	now = mach_absolute_time();
194	time_diff = now - src->fse_rtt.frtt_timestamp;
195
196	absolutetime_to_microtime(abstime: time_diff, secs: &tv_sec, microsecs: &tv_usec);
197	curval = (uint32_t)(tv_usec + tv_sec * `1000` * `1000`);
198	oldval = src->fse_rtt.frtt_usec;
199	if (oldval == `0`) {
200	src->fse_rtt.frtt_usec = curval;
201	} else {
202	/ same EWMA decay as TCP RTT /
203	src->fse_rtt.frtt_usec =
204	((oldval << `4`) - oldval + curval) >> `4`;
205	}
206
207	/ reset RTT tracking session /
208	src->fse_rtt.frtt_timestamp = `0`;
209	src->fse_rtt.frtt_last = `0`;
210	KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT \| DBG_FUNC_END),
211	SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
212	input ? `0` : `1`);
213
214	/ publish rtt stats into flow_stats object /
215	/ just store both to avoid branch prediction etc. /
216	fe->fe_stats->fs_lrtt = fe->fe_ltrack.fse_rtt_usec;
217	fe->fe_stats->fs_rrtt = fe->fe_rtrack.fse_rtt_usec;
218	}
219	}
220	}
221
222	/*
223	* The TCP connection tracking logic is based on Guido van Rooij's paper:
224	* http://www.sane.nl/events/sane2000/papers/rooij.pdf
225	*
226	* In some ways, we act as a middlebox that passively tracks the TCP windows
227	* of each connection on flows marked with FLOWENTF_TRACK. We never modify
228	* the packet or generate any response (e.g. RST) to the sender; thus we are
229	* simply a silent observer. The information we gather here is used later
230	* if we need to generate a valid {FIN\|RST} segment when the flow is nonviable.
231	*
232	* The implementation is borrowed from Packet Filter, and is further
233	* simplified to cater for our use cases.
234	*/
235	#define FTF_HALFCLOSED 0x1 /* want flow to be marked as half closed */
236	#define FTF_WAITCLOSE 0x2 /* want flow to linger after close */
237	#define FTF_CLOSENOTIFY 0x4 /* want to notify NECP upon torn down */
238	#define FTF_WITHDRAWN 0x8 /* want flow to be torn down */
239	#define FTF_SYN_RLIM 0x10 /* want flow to rate limit SYN */
240	#define FTF_RST_RLIM 0x20 /* want flow to rate limit RST */
241	__attribute__((always_inline))
242	static inline int
243	flow_track_tcp(struct flow_entry fe, struct* flow_track *src,
244	struct flow_track dst, struct* __kern_packet *pkt, bool input)
245	{
246	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
247	uint16_t win = ntohs(pkt->pkt_flow_tcp_win);
248	uint32_t ack, end, seq, orig_seq;
249	uint32_t ftflags = `0`;
250	uint8_t sws, dws;
251	int ackskew, err = `0`;
252
253	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == `0`)) {
254	flow_track_tcp_init(fe, src, dst, pkt);
255	}
256
257	flow_track_tcp_rtt(fe, input, src, dst, tcp_flags,
258	ntohl(pkt->pkt_flow_tcp_seq), ntohl(pkt->pkt_flow_tcp_ack),
259	ulen: pkt->pkt_flow_ulen);
260
261	if (__improbable(dst->fse_state >= TCPS_FIN_WAIT_2 &&
262	src->fse_state >= TCPS_FIN_WAIT_2)) {
263	if ((tcp_flags & (TH_SYN \| TH_ACK)) == TH_SYN) {
264	src->fse_state = dst->fse_state = TCPS_CLOSED;
265	ftflags \|= FTF_SYN_RLIM;
266	}
267	if (tcp_flags & TH_RST) {
268	ftflags \|= FTF_RST_RLIM;
269	}
270	if (input) {
271	err = ENETRESET;
272	}
273	goto done;
274	}
275
276	if (__probable((tcp_flags & TH_SYN) == `0` &&
277	src->fse_wscale != `0` && dst->fse_wscale != `0`)) {
278	sws = src->fse_wscale;
279	dws = dst->fse_wscale;
280	} else {
281	sws = dws = `0`;
282	}
283
284	orig_seq = seq = ntohl(pkt->pkt_flow_tcp_seq);
285	if (__probable(src->fse_seqlo != `0`)) {
286	ack = ntohl(pkt->pkt_flow_tcp_ack);
287	end = seq + pkt->pkt_flow_ulen;
288	if (tcp_flags & TH_SYN) {
289	if ((tcp_flags & (TH_SYN \| TH_ACK)) == TH_SYN) {
290	ftflags \|= FTF_SYN_RLIM;
291	}
292	end++;
293	}
294	if (tcp_flags & TH_FIN) {
295	end++;
296	}
297	if (tcp_flags & TH_RST) {
298	ftflags \|= FTF_RST_RLIM;
299	}
300	} else {
301	/ first packet from this end; set its state /
302	ack = ntohl(pkt->pkt_flow_tcp_ack);
303
304	/ We saw the first SYN, but stack does not reply with a SYN /
305	if (dst->fse_state == TCPS_SYN_SENT && ((tcp_flags & TH_SYN) == `0`)) {
306	/ Act as if no sequence number is set /
307	seq = `0`;
308	/ Pretend the outgoing SYN was not ACK'ed /
309	ack = dst->fse_seqlo;
310	}
311
312	end = seq + pkt->pkt_flow_ulen;
313	if (tcp_flags & TH_SYN) {
314	if ((tcp_flags & (TH_SYN \| TH_ACK)) == TH_SYN) {
315	ftflags \|= FTF_SYN_RLIM;
316	}
317	end++;
318	if (dst->fse_flags & FLOWSTATEF_WSCALE) {
319	flow_track_tcp_get_wscale(s: src, pkt);
320	if (src->fse_flags & FLOWSTATEF_WSCALE) {
321	/*
322	* Remove scale factor from
323	* initial window.
324	*/
325	sws = src->fse_wscale;
326	win = (uint16_t)(((u_int32_t)win + (`1` << sws) - `1`)
327	>> sws);
328	dws = dst->fse_wscale;
329	} else {
330	/ fixup other window /
331	dst->fse_max_win = (uint16_t)(dst->fse_max_win << dst->fse_wscale);
332	/ in case of a retrans SYN\|ACK /
333	dst->fse_wscale = `0`;
334	}
335	}
336	}
337	if (tcp_flags & TH_FIN) {
338	end++;
339	}
340	if (tcp_flags & TH_RST) {
341	ftflags \|= FTF_RST_RLIM;
342	}
343
344	src->fse_seqlo = seq;
345	if (src->fse_state < TCPS_SYN_SENT) {
346	if (tcp_flags & TH_SYN) {
347	src->fse_state = TCPS_SYN_SENT;
348	} else {
349	/ Picking up the connection in the middle /
350	src->fse_state = TCPS_ESTABLISHED;
351	}
352	}
353
354	/*
355	* May need to slide the window (seqhi may have been set by
356	* the crappy stack check or if we picked up the connection
357	* after establishment).
358	*/
359	if (src->fse_seqhi == `1` \|\| SEQ_GEQ(end +
360	MAX(`1`, dst->fse_max_win << dws), src->fse_seqhi)) {
361	src->fse_seqhi = end + MAX(`1`, dst->fse_max_win << dws);
362	}
363	if (win > src->fse_max_win) {
364	src->fse_max_win = win;
365	}
366	}
367
368	if (!(tcp_flags & TH_ACK)) {
369	/ let it pass through the ack skew check /
370	ack = dst->fse_seqlo;
371	} else if ((ack == `0` &&
372	(tcp_flags & (TH_ACK \| TH_RST)) == (TH_ACK \| TH_RST)) \|\|
373	/ broken tcp stacks do not set ack /
374	(dst->fse_state < TCPS_SYN_SENT)) {
375	/*
376	* Many stacks (ours included) will set the ACK number in an
377	* FIN\|ACK if the SYN times out -- no sequence to ACK.
378	*/
379	ack = dst->fse_seqlo;
380	}
381
382	if (seq == end) {
383	/ ease sequencing restrictions on no data packets /
384	seq = src->fse_seqlo;
385	end = seq;
386	}
387
388	ackskew = dst->fse_seqlo - ack;
389
390	#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
391	if (SEQ_GEQ(src->fse_seqhi, end) &&
392	/ last octet inside other's window space /
393	SEQ_GEQ(seq, src->fse_seqlo - (dst->fse_max_win << dws)) &&
394	/ retrans: not more than one window back /
395	(ackskew >= -MAXACKWINDOW) &&
396	/ acking not more than one reassembled fragment backwards /
397	(ackskew <= (MAXACKWINDOW << sws)) &&
398	/ acking not more than one window forward /
399	(!(tcp_flags & TH_RST) \|\| orig_seq == src->fse_seqlo \|\|
400	(orig_seq == src->fse_seqlo + `1`) \|\|
401	(orig_seq + `1` == src->fse_seqlo))) {
402	/ require an exact/+1 sequence match on resets when possible /
403
404	/ update max window /
405	if (src->fse_max_win < win) {
406	src->fse_max_win = win;
407	}
408	/ synchronize sequencing /
409	if (SEQ_GT(end, src->fse_seqlo)) {
410	src->fse_seqlo = end;
411	}
412	/ slide the window of what the other end can send /
413	if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
414	dst->fse_seqhi = ack + MAX((win << sws), `1`);
415	}
416
417	/ update states /
418	if (tcp_flags & TH_SYN) {
419	if (src->fse_state < TCPS_SYN_SENT) {
420	src->fse_state = TCPS_SYN_SENT;
421	}
422	}
423	if (tcp_flags & TH_FIN) {
424	if (src->fse_state < TCPS_CLOSING) {
425	src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
426	src->fse_state = TCPS_CLOSING;
427	}
428	}
429	if (tcp_flags & TH_ACK) {
430	/*
431	* Avoid transitioning to ESTABLISHED when our SYN
432	* is ACK'd along with a RST. The sending TCP may
433	* still retransmit the SYN (after dropping some
434	* options like ECN, etc.)
435	*/
436	if (dst->fse_state == TCPS_SYN_SENT &&
437	!(tcp_flags & TH_RST)) {
438	dst->fse_state = TCPS_ESTABLISHED;
439	ftflags \|= (FTF_WAITCLOSE \| FTF_CLOSENOTIFY);
440	} else if (dst->fse_state == TCPS_CLOSING &&
441	ack == dst->fse_seqlast + `1`) {
442	dst->fse_state = TCPS_FIN_WAIT_2;
443	ftflags \|= FTF_WAITCLOSE;
444	if (src->fse_state >= TCPS_FIN_WAIT_2) {
445	ftflags \|= FTF_WITHDRAWN;
446	} else {
447	ftflags \|= FTF_HALFCLOSED;
448	}
449	}
450	}
451	if ((tcp_flags & TH_RST) &&
452	(src->fse_state == TCPS_ESTABLISHED \|\|
453	dst->fse_state == TCPS_ESTABLISHED)) {
454	/*
455	* If either endpoint is in ESTABLISHED, transition
456	* both to TIME_WAIT. Otherwise, keep the existing
457	* state as is, e.g. SYN_SENT.
458	*/
459	src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
460	ftflags \|= (FTF_WITHDRAWN \| FTF_WAITCLOSE);
461	}
462	} else if ((dst->fse_state < TCPS_SYN_SENT \|\|
463	dst->fse_state >= TCPS_FIN_WAIT_2 \|\|
464	src->fse_state >= TCPS_FIN_WAIT_2) &&
465	SEQ_GEQ(src->fse_seqhi + MAXACKWINDOW, end) &&
466	/ within a window forward of the originating packet /
467	SEQ_GEQ(seq, src->fse_seqlo - MAXACKWINDOW)) {
468	/ within a window backward of the originating packet /
469
470	/ BEGIN CSTYLED /
471	/*
472	* This currently handles three situations:
473	* 1) Stupid stacks will shotgun SYNs before their peer
474	* replies.
475	* 2) When flow tracking catches an already established
476	* stream (the flow states are cleared, etc.)
477	* 3) Packets get funky immediately after the connection
478	* closes (this should catch spurious ACK\|FINs that
479	* web servers like to spew after a close).
480	*
481	* This must be a little more careful than the above code
482	* since packet floods will also be caught here.
483	*/
484	/ END CSTYLED /
485
486	/ update max window /
487	if (src->fse_max_win < win) {
488	src->fse_max_win = win;
489	}
490	/ synchronize sequencing /
491	if (SEQ_GT(end, src->fse_seqlo)) {
492	src->fse_seqlo = end;
493	}
494	/ slide the window of what the other end can send /
495	if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
496	dst->fse_seqhi = ack + MAX((win << sws), `1`);
497	}
498
499	/*
500	* Cannot set dst->fse_seqhi here since this could be a
501	* shotgunned SYN and not an already established connection.
502	*/
503
504	if (tcp_flags & TH_FIN) {
505	if (src->fse_state < TCPS_CLOSING) {
506	src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
507	src->fse_state = TCPS_CLOSING;
508	}
509	}
510	if (tcp_flags & TH_RST) {
511	src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
512	ftflags \|= FTF_WAITCLOSE;
513	}
514	} else {
515	if (dst->fse_state == TCPS_SYN_SENT &&
516	src->fse_state == TCPS_SYN_SENT) {
517	src->fse_seqlo = `0`;
518	src->fse_seqhi = `1`;
519	src->fse_max_win = `1`;
520	}
521	}
522
523	done:
524	if (__improbable((ftflags & FTF_HALFCLOSED) != `0`)) {
525	os_atomic_or(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
526	ftflags &= ~FTF_HALFCLOSED;
527	}
528
529	/*
530	* Hold on to namespace for a while after the flow is closed.
531	*/
532	if (__improbable((ftflags & FTF_WAITCLOSE) != `0` &&
533	(fe->fe_flags & FLOWENTF_WAIT_CLOSE) == `0`)) {
534	os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed);
535	ftflags &= ~FTF_WAITCLOSE;
536	}
537
538	/*
539	* Notify NECP upon tear down (for established flows).
540	*/
541	if (__improbable((ftflags & FTF_CLOSENOTIFY) != `0` &&
542	(fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) == `0`)) {
543	os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY, relaxed);
544	ftflags &= ~FTF_CLOSENOTIFY;
545	}
546
547	/*
548	* Flow is withdrawn; the port we have should not be included in
549	* the list of offloaded ports, as the connection is no longer
550	* usable (we're not expecting any more data).
551	* Also clear FLOWENTF_HALF_CLOSED flag here. It's fine if reaper
552	* thread hadn't pickedup FLOWENTF_HALF_CLOSED, as it will pick up
553	* FLOWENTF_WITHDRAWN and notify netns of full withdrawn.
554	*/
555	if (__improbable((ftflags & FTF_WITHDRAWN) != `0`)) {
556	ftflags &= ~FTF_WITHDRAWN;
557	if (fe->fe_flags & FLOWENTF_HALF_CLOSED) {
558	os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
559	}
560	fe->fe_want_withdraw = `1`;
561	}
562
563	/*
564	* If no other work is needed, we're done.
565	*/
566	if (ftflags == `0` \|\| input) {
567	return err;
568	}
569
570	/*
571	* If we're over the rate limit for outbound SYNs, drop packet.
572	*/
573	if (__improbable((ftflags & FTF_SYN_RLIM) != `0`)) {
574	uint32_t now = (uint32_t)_net_uptime;
575	if ((now - src->fse_syn_ts) > `1`) {
576	src->fse_syn_ts = now;
577	src->fse_syn_cnt = `0`;
578	}
579	if (++src->fse_syn_cnt > FLOWTRACK_SYN_RATE) {
580	err = EPROTO;
581	}
582	}
583
584	return err;
585	}
586	#undef FTF_WAITCLOSE
587	#undef FTF_CLOSENOTIFY
588	#undef FTF_WITHDRAWN
589	#undef FTF_SYN_RLIM
590	#undef FTF_RST_RLIM
591
592	boolean_t
593	flow_track_tcp_want_abort(struct flow_entry *fe)
594	{
595	struct flow_track *src = &fe->fe_ltrack;
596	struct flow_track *dst = &fe->fe_rtrack;
597
598	if (fe->fe_key.fk_proto != IPPROTO_TCP \|\|
599	(fe->fe_flags & FLOWENTF_ABORTED)) {
600	goto done;
601	}
602
603	/ this can be enhanced; for now rely on established state /
604	if (src->fse_state == TCPS_ESTABLISHED \|\|
605	dst->fse_state == TCPS_ESTABLISHED) {
606	src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
607	/ don't process more than once /
608	os_atomic_or(&fe->fe_flags, FLOWENTF_ABORTED, relaxed);
609	return TRUE;
610	}
611	done:
612	return FALSE;
613	}
614
615	static void
616	flow_track_udp_init(struct flow_entry fe, struct* flow_track *src,
617	struct flow_track dst, struct* __kern_packet *pkt)
618	{
619	#pragma unused(pkt)
620	/*
621	* Source state initialization.
622	*/
623	src->fse_state = FT_STATE_NO_TRAFFIC;
624
625	/*
626	* Destination state initialization.
627	*/
628	dst->fse_state = FT_STATE_NO_TRAFFIC;
629
630	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
631	}
632
633	__attribute__((always_inline))
634	static inline int
635	flow_track_udp(struct flow_entry fe, struct* flow_track *src,
636	struct flow_track dst, struct* __kern_packet *pkt, bool input)
637	{
638	#pragma unused(input)
639	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == `0`)) {
640	flow_track_udp_init(fe, src, dst, pkt);
641	}
642
643	if (__improbable(src->fse_state == FT_STATE_NO_TRAFFIC)) {
644	src->fse_state = FT_STATE_SINGLE;
645	}
646	if (__improbable(dst->fse_state == FT_STATE_SINGLE)) {
647	dst->fse_state = FT_STATE_MULTIPLE;
648	}
649
650	return `0`;
651	}
652
653	void
654	flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets,
655	bool active, bool in)
656	{
657	volatile struct sk_stats_flow_track *fst;
658
659	if (in) {
660	fst = &fe->fe_stats->fs_rtrack;
661	} else {
662	fst = &fe->fe_stats->fs_ltrack;
663	}
664
665	fst->sft_bytes += bytes;
666	fst->sft_packets += packets;
667
668	if (__probable(active)) {
669	in_stat_set_activity_bitmap(activity: &fe->fe_stats->fs_activity,
670	now: _net_uptime);
671	}
672	}
673
674	int
675	flow_pkt_track(struct flow_entry fe, struct* __kern_packet *pkt, bool in)
676	{
677	struct flow_track src, dst;
678	int ret = `0`;
679
680	_CASSERT(SFT_STATE_CLOSED == FT_STATE_CLOSED);
681	_CASSERT(SFT_STATE_LISTEN == FT_STATE_LISTEN);
682	_CASSERT(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT);
683	_CASSERT(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED);
684	_CASSERT(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED);
685	_CASSERT(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT);
686	_CASSERT(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1);
687	_CASSERT(SFT_STATE_CLOSING == FT_STATE_CLOSING);
688	_CASSERT(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK);
689	_CASSERT(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2);
690	_CASSERT(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT);
691	_CASSERT(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC);
692	_CASSERT(SFT_STATE_SINGLE == FT_STATE_SINGLE);
693	_CASSERT(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE);
694	_CASSERT(SFT_STATE_MAX == FT_STATE_MAX);
695
696	_CASSERT(FT_STATE_CLOSED == TCPS_CLOSED);
697	_CASSERT(FT_STATE_LISTEN == TCPS_LISTEN);
698	_CASSERT(FT_STATE_SYN_SENT == TCPS_SYN_SENT);
699	_CASSERT(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED);
700	_CASSERT(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED);
701	_CASSERT(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT);
702	_CASSERT(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1);
703	_CASSERT(FT_STATE_CLOSING == TCPS_CLOSING);
704	_CASSERT(FT_STATE_LAST_ACK == TCPS_LAST_ACK);
705	_CASSERT(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2);
706	_CASSERT(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT);
707
708	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
709
710	if (in) {
711	src = &fe->fe_rtrack;
712	dst = &fe->fe_ltrack;
713	} else {
714	src = &fe->fe_ltrack;
715	dst = &fe->fe_rtrack;
716	}
717
718	flow_track_stats(fe, bytes: (pkt->pkt_length - pkt->pkt_l2_len), packets: `1`,
719	active: (pkt->pkt_flow_ulen != `0`), in);
720
721	/ skip flow state tracking on non-initial fragments /
722	if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) {
723	return `0`;
724	}
725
726	switch (pkt->pkt_flow_ip_proto) {
727	case IPPROTO_TCP:
728	if (__probable((fe->fe_flags & FLOWENTF_TRACK) != `0`)) {
729	ret = flow_track_tcp(fe, src, dst, pkt, input: in);
730	}
731	break;
732
733	case IPPROTO_UDP:
734	if (__probable((fe->fe_flags & FLOWENTF_TRACK) != `0`)) {
735	ret = flow_track_udp(fe, src, dst, pkt, input: in);
736	}
737	break;
738	}
739
740	return ret;
741	}
742
743	/*
744	* @function flow_track_abort_tcp
745	* @abstract send RST for a given TCP flow.
746	* @param in_pkt incoming packet that triggers RST.
747	* @param rst_pkt use as RST template for SEQ/ACK information.
748	*/
749	void
750	flow_track_abort_tcp(struct flow_entry fe, struct* __kern_packet *in_pkt,
751	struct __kern_packet *rst_pkt)
752	{
753	struct nx_flowswitch *fsw = fe->fe_fsw;
754	struct flow_track src, dst;
755	struct ip *ip;
756	struct ip6_hdr *ip6;
757	struct tcphdr *th;
758	uint16_t len, tlen;
759	struct mbuf *m;
760
761	/ guaranteed by caller /
762	ASSERT(fsw->fsw_ifp != NULL);
763	ASSERT(in_pkt == NULL \|\| rst_pkt == NULL);
764
765	src = &fe->fe_ltrack;
766	dst = &fe->fe_rtrack;
767
768	tlen = sizeof(struct tcphdr);
769	if (fe->fe_key.fk_ipver == IPVERSION) {
770	len = sizeof(struct ip) + tlen;
771	} else {
772	ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
773	len = sizeof(struct ip6_hdr) + tlen;
774	}
775
776	m = m_gethdr(M_NOWAIT, MT_HEADER);
777	if (__improbable(m == NULL)) {
778	return;
779	}
780
781	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
782	m->m_data += max_linkhdr; / 32-bit aligned /
783	m->m_pkthdr.len = m->m_len = len;
784
785	/ zero out for checksum /
786	bzero(s: m_mtod_current(m), n: len);
787
788	if (fe->fe_key.fk_ipver == IPVERSION) {
789	ip = mtod(m, struct ip *);
790
791	/ IP header fields included in the TCP checksum /
792	ip->ip_p = IPPROTO_TCP;
793	ip->ip_len = htons(tlen);
794	if (rst_pkt == NULL) {
795	ip->ip_src = fe->fe_key.fk_src4;
796	ip->ip_dst = fe->fe_key.fk_dst4;
797	} else {
798	ip->ip_src = rst_pkt->pkt_flow_ipv4_src;
799	ip->ip_dst = rst_pkt->pkt_flow_ipv4_dst;
800	}
801
802	th = (struct tcphdr )(void* )((char* )ip + sizeof(ip));
803	} else {
804	ip6 = mtod(m, struct ip6_hdr *);
805
806	/ IP header fields included in the TCP checksum /
807	ip6->ip6_nxt = IPPROTO_TCP;
808	ip6->ip6_plen = htons(tlen);
809	if (rst_pkt == NULL) {
810	ip6->ip6_src = fe->fe_key.fk_src6;
811	ip6->ip6_dst = fe->fe_key.fk_dst6;
812	} else {
813	ip6->ip6_src = rst_pkt->pkt_flow_ipv6_src;
814	ip6->ip6_dst = rst_pkt->pkt_flow_ipv6_dst;
815	}
816
817	th = (struct tcphdr )(void* )((char* )ip6 + sizeof(ip6));
818	}
819
820	/*
821	* TCP header (fabricate a pure RST).
822	*/
823	if (in_pkt != NULL) {
824	th->th_sport = in_pkt->pkt_flow_tcp_dst;
825	th->th_dport = in_pkt->pkt_flow_tcp_src;
826	if (__probable(in_pkt->pkt_flow_tcp_flags \| TH_ACK)) {
827	/ <SEQ=SEG.ACK><CTL=RST> /
828	th->th_seq = in_pkt->pkt_flow_tcp_ack;
829	th->th_ack = `0`;
830	th->th_flags = TH_RST;
831	} else {
832	/ <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> /
833	th->th_seq = `0`;
834	th->th_ack = in_pkt->pkt_flow_tcp_seq +
835	in_pkt->pkt_flow_ulen;
836	th->th_flags = TH_RST \| TH_ACK;
837	}
838	} else if (rst_pkt != NULL) {
839	th->th_sport = rst_pkt->pkt_flow_tcp_src;
840	th->th_dport = rst_pkt->pkt_flow_tcp_dst;
841	th->th_seq = rst_pkt->pkt_flow_tcp_seq;
842	th->th_ack = rst_pkt->pkt_flow_tcp_ack;
843	th->th_flags = rst_pkt->pkt_flow_tcp_flags;
844	} else {
845	th->th_sport = fe->fe_key.fk_sport;
846	th->th_dport = fe->fe_key.fk_dport;
847	th->th_seq = htonl(src->fse_seqlo); / peer's last ACK /
848	th->th_ack = `0`;
849	th->th_flags = TH_RST;
850	}
851	th->th_off = (tlen >> `2`);
852	th->th_win = `0`;
853
854	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);
855
856	if (fe->fe_key.fk_ipver == IPVERSION) {
857	struct ip_out_args ipoa;
858	struct route ro;
859
860	bzero(s: &ipoa, n: sizeof(ipoa));
861	ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
862	ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF \| IPOAF_BOUND_IF \|
863	IPOAF_BOUND_SRCADDR);
864	ipoa.ipoa_sotc = SO_TC_UNSPEC;
865	ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
866
867	/ TCP checksum /
868	th->th_sum = in_cksum(m, len);
869
870	ip->ip_v = IPVERSION;
871	ip->ip_hl = sizeof(*ip) >> `2`;
872	ip->ip_tos = `0`;
873	/*
874	* ip_output() expects ip_len and ip_off to be in host order.
875	*/
876	ip->ip_len = len;
877	ip->ip_off = IP_DF;
878	ip->ip_ttl = (uint8_t)ip_defttl;
879	ip->ip_sum = `0`;
880
881	bzero(s: &ro, n: sizeof(ro));
882	(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
883	ROUTE_RELEASE(&ro);
884	} else {
885	struct ip6_out_args ip6oa;
886	struct route_in6 ro6;
887
888	bzero(s: &ip6oa, n: sizeof(ip6oa));
889	ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
890	ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF \| IP6OAF_BOUND_IF \|
891	IP6OAF_BOUND_SRCADDR);
892	ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
893	ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
894
895	/ TCP checksum /
896	th->th_sum = in6_cksum(m, IPPROTO_TCP,
897	sizeof(struct ip6_hdr), tlen);
898
899	ip6->ip6_vfc \|= IPV6_VERSION;
900	ip6->ip6_hlim = IPV6_DEFHLIM;
901
902	bzero(s: &ro6, n: sizeof(ro6));
903	(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
904	NULL, NULL, &ip6oa);
905	ROUTE_RELEASE(&ro6);
906	}
907	}
908
909	void
910	flow_track_abort_quic(struct flow_entry fe, uint8_t token)
911	{
912	struct quic_stateless_reset {
913	uint8_t ssr_header[`30`];
914	uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
915	};
916	struct nx_flowswitch *fsw = fe->fe_fsw;
917	struct ip *ip;
918	struct ip6_hdr *ip6;
919	struct udphdr *uh;
920	struct quic_stateless_reset *qssr;
921	uint16_t len, l3hlen, ulen;
922	struct mbuf *m;
923	unsigned int one = `1`;
924	int error;
925
926	/ guaranteed by caller /
927	ASSERT(fsw->fsw_ifp != NULL);
928
929	/ skip zero token /
930	bool is_zero_token = true;
931	for (size_t i = `0`; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
932	if (token[i] != `0`) {
933	is_zero_token = false;
934	break;
935	}
936	}
937	if (is_zero_token) {
938	return;
939	}
940
941	ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
942	if (fe->fe_key.fk_ipver == IPVERSION) {
943	l3hlen = sizeof(struct ip);
944	} else {
945	ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
946	l3hlen = sizeof(struct ip6_hdr);
947	}
948
949	len = l3hlen + ulen;
950
951	error = mbuf_allocpacket(how: MBUF_DONTWAIT, packetlen: max_linkhdr + len, maxchunks: &one, mbuf: &m);
952	if (__improbable(error != `0`)) {
953	return;
954	}
955	VERIFY(m != `0`);
956
957	m->m_pkthdr.pkt_proto = IPPROTO_UDP;
958	m->m_data += max_linkhdr; / 32-bit aligned /
959	m->m_pkthdr.len = m->m_len = len;
960
961	/ zero out for checksum /
962	bzero(s: m_mtod_current(m), n: len);
963
964	if (fe->fe_key.fk_ipver == IPVERSION) {
965	ip = mtod(m, struct ip *);
966	ip->ip_p = IPPROTO_UDP;
967	ip->ip_len = htons(ulen);
968	ip->ip_src = fe->fe_key.fk_src4;
969	ip->ip_dst = fe->fe_key.fk_dst4;
970	uh = (struct udphdr )(void* )((char* )ip + sizeof(ip));
971	} else {
972	ip6 = mtod(m, struct ip6_hdr *);
973	ip6->ip6_nxt = IPPROTO_UDP;
974	ip6->ip6_plen = htons(ulen);
975	ip6->ip6_src = fe->fe_key.fk_src6;
976	ip6->ip6_dst = fe->fe_key.fk_dst6;
977	uh = (struct udphdr )(void* )((char* )ip6 + sizeof(ip6));
978	}
979
980	/ UDP header /
981	uh->uh_sport = fe->fe_key.fk_sport;
982	uh->uh_dport = fe->fe_key.fk_dport;
983	uh->uh_ulen = htons(ulen);
984
985	/ QUIC stateless reset /
986	qssr = (struct quic_stateless_reset *)(uh + `1`);
987	read_frandom(buffer: &qssr->ssr_header, numBytes: sizeof(qssr->ssr_header));
988	qssr->ssr_header[`0`] = (qssr->ssr_header[`0`] & `0x3f`) \| `0x40`;
989	memcpy(dst: qssr->ssr_token, src: token, QUIC_STATELESS_RESET_TOKEN_SIZE);
990
991	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);
992
993	if (fe->fe_key.fk_ipver == IPVERSION) {
994	struct ip_out_args ipoa;
995	struct route ro;
996
997	bzero(s: &ipoa, n: sizeof(ipoa));
998	ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
999	ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF \| IPOAF_BOUND_IF \|
1000	IPOAF_BOUND_SRCADDR);
1001	ipoa.ipoa_sotc = SO_TC_UNSPEC;
1002	ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1003
1004	uh->uh_sum = in_cksum(m, len);
1005	if (uh->uh_sum == `0`) {
1006	uh->uh_sum = `0xffff`;
1007	}
1008
1009	ip->ip_v = IPVERSION;
1010	ip->ip_hl = sizeof(*ip) >> `2`;
1011	ip->ip_tos = `0`;
1012	/*
1013	* ip_output() expects ip_len and ip_off to be in host order.
1014	*/
1015	ip->ip_len = len;
1016	ip->ip_off = IP_DF;
1017	ip->ip_ttl = (uint8_t)ip_defttl;
1018	ip->ip_sum = `0`;
1019
1020	bzero(s: &ro, n: sizeof(ro));
1021	(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
1022	ROUTE_RELEASE(&ro);
1023	} else {
1024	struct ip6_out_args ip6oa;
1025	struct route_in6 ro6;
1026
1027	bzero(s: &ip6oa, n: sizeof(ip6oa));
1028	ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
1029	ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF \| IP6OAF_BOUND_IF \|
1030	IP6OAF_BOUND_SRCADDR);
1031	ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
1032	ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1033
1034	uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
1035	ulen);
1036	if (uh->uh_sum == `0`) {
1037	uh->uh_sum = `0xffff`;
1038	}
1039
1040	ip6->ip6_vfc \|= IPV6_VERSION;
1041	ip6->ip6_hlim = IPV6_DEFHLIM;
1042
1043	bzero(s: &ro6, n: sizeof(ro6));
1044	(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
1045	NULL, NULL, &ip6oa);
1046	ROUTE_RELEASE(&ro6);
1047	}
1048	}
1049

Browse the source code of xnu/bsd/skywalk/nexus/flowswitch/flow/flow_track.c