/*
 * Copyright (c) 2017-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/in_stat.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <sys/kdebug.h>

/* min/max linger time (in seconds) */
#define FLOWTRACK_LINGER_MIN	1
#define FLOWTRACK_LINGER_MAX	120

/* maximum allowed rate of SYNs per second */
#define FLOWTRACK_SYN_RATE	20

static int flow_track_tcp(struct flow_entry *, struct flow_track *,
    struct flow_track *, struct __kern_packet *, bool);
static int flow_track_udp(struct flow_entry *, struct flow_track *,
    struct flow_track *, struct __kern_packet *, bool);

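/*
 * Parse the TCP options of a segment (normally the SYN) for the
 * window scale option, and record the advertised shift count in
 * the sender's flow_track state.
 */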
static void
flow_track_tcp_get_wscale(struct flow_track *s, struct __kern_packet *pkt)
{
	const uint8_t *hdr = (uint8_t *)(void *)pkt->pkt_flow_tcp_hdr;
	int hlen = pkt->pkt_flow_tcp_hlen;
	uint8_t optlen, wscale = 0;
	const uint8_t *opt;

	_CASSERT(sizeof(s->fse_flags) == sizeof(uint16_t));
	ASSERT(hlen >= (int)sizeof(struct tcphdr));

	opt = hdr + sizeof(struct tcphdr);
	hlen -= sizeof(struct tcphdr);
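	/*
	 * Walk the options list; EOL and NOP are single-byte options,
	 * everything else is a {kind, length, value} triple.
	 */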
	while (hlen >= 3) {
		switch (*opt) {
		case TCPOPT_EOL:
		case TCPOPT_NOP:
			++opt;
			--hlen;
			break;
		case TCPOPT_WINDOW:
			wscale = opt[2];
			if (wscale > TCP_MAX_WINSHIFT) {
				wscale = TCP_MAX_WINSHIFT;
			}
			os_atomic_or(&s->fse_flags, FLOWSTATEF_WSCALE, relaxed);
			OS_FALLTHROUGH;
		default:
			optlen = opt[1];
			if (optlen < 2) {
				optlen = 2;
			}
			hlen -= optlen;
			opt += optlen;
			break;
		}
	}
	s->fse_wscale = wscale;
}

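/*
 * Set up initial tracking state for both endpoints of a flow,
 * based on the first segment seen (normally the initial SYN).
 */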
static void
flow_track_tcp_init(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt)
{
	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;

	/*
	 * Source state initialization.
	 */
	src->fse_state = TCPS_SYN_SENT;
	src->fse_seqlo = ntohl(pkt->pkt_flow_tcp_seq);
	src->fse_seqhi = (src->fse_seqlo + pkt->pkt_flow_ulen + 1);
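	/* SYN and FIN each occupy one unit of sequence space */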
	if (tcp_flags & TH_SYN) {
		src->fse_seqhi++;
		flow_track_tcp_get_wscale(src, pkt);
	}
	if (tcp_flags & TH_FIN) {
		src->fse_seqhi++;
	}

	src->fse_max_win = MAX(ntohs(pkt->pkt_flow_tcp_win), 1);
	if (src->fse_flags & FLOWSTATEF_WSCALE) {
		/* remove scale factor from initial window */
		int win = src->fse_max_win;
		ASSERT(src->fse_wscale <= TCP_MAX_WINSHIFT);
		win += (1 << src->fse_wscale);
		src->fse_max_win = (uint16_t)((win - 1) >> src->fse_wscale);
	}

	/*
	 * Destination state initialization.
	 */
	dst->fse_state = TCPS_CLOSED;
	dst->fse_seqhi = 1;
	dst->fse_max_win = 1;

	/*
	 * Linger time (in seconds): twice the MSL, clamped to a sane range.
	 */
	fe->fe_linger_wait = (2 * tcp_msl) / TCP_RETRANSHZ;
	if (fe->fe_linger_wait < FLOWTRACK_LINGER_MIN) {
		fe->fe_linger_wait = FLOWTRACK_LINGER_MIN;
	} else if (fe->fe_linger_wait > FLOWTRACK_LINGER_MAX) {
		fe->fe_linger_wait = FLOWTRACK_LINGER_MAX;
	}

	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
}

/*
 * The TCP ACK RTT tracking is a coarse-grained measurement of the time it
 * takes for an endpoint to process an incoming segment and generate an ACK,
 * at the point of observation. For the flowswitch, this means:
 *
 *    local end RTT = local stack processing time
 *    remote end RTT = driver + network + remote endpoint's processing time
 *
 * Since the measurement is lightweight and sampling based, it doesn't learn
 * to distinguish the ACK of a lost segment, so we could occasionally get a
 * large RTT sample from an ACK to a retransmitted segment. Thus rtt_max is
 * not meaningful to us.
 */
__attribute__((always_inline))
static inline void
flow_track_tcp_rtt(struct flow_entry *fe, boolean_t input,
    struct flow_track *src, struct flow_track *dst, uint8_t tcp_flags,
    uint32_t seq, uint32_t ack, uint32_t ulen)
{
#pragma unused(fe, input)       /* KDBG defined as noop in release build */
	uint64_t dst_last, src_last;
	uint64_t now, time_diff;
	uint32_t curval, oldval;
	clock_sec_t tv_sec;
	clock_usec_t tv_usec;

	src_last = src->fse_rtt.frtt_last;
	dst_last = dst->fse_rtt.frtt_last;

	/* start a new RTT tracking session under sampling rate limit */
	if (dst_last == 0 ||
	    _net_uptime - dst_last > FLOWTRACK_RTT_SAMPLE_INTERVAL) {
		if (ulen > 0 &&
		    dst->fse_rtt.frtt_timestamp == 0) {
			dst->fse_rtt.frtt_timestamp = mach_absolute_time();
			dst->fse_rtt.frtt_last = _net_uptime;
			dst->fse_rtt.frtt_seg_begin = seq;
			dst->fse_rtt.frtt_seg_end = seq + ulen;
			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_START),
			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
			    input ? 1 : 0);
		}
	}

	/* we have an ACK, see if current tracking session matches it */
	if (tcp_flags & TH_ACK) {
		if (src->fse_rtt.frtt_timestamp != 0 &&
		    src->fse_rtt.frtt_seg_begin <= ack) {
			now = mach_absolute_time();
			time_diff = now - src->fse_rtt.frtt_timestamp;

			absolutetime_to_microtime(time_diff, &tv_sec, &tv_usec);
			curval = (uint32_t)(tv_usec + tv_sec * 1000 * 1000);
			oldval = src->fse_rtt.frtt_usec;
			if (oldval == 0) {
				src->fse_rtt.frtt_usec = curval;
			} else {
				/* same EWMA decay as TCP RTT: (15 * old + new) / 16 */
				src->fse_rtt.frtt_usec =
				    ((oldval << 4) - oldval + curval) >> 4;
			}

			/* reset RTT tracking session */
			src->fse_rtt.frtt_timestamp = 0;
			src->fse_rtt.frtt_last = 0;
			KDBG((SK_KTRACE_FSW_FLOW_TRACK_RTT | DBG_FUNC_END),
			    SK_KVA(fe), fe->fe_pid, ntohs(fe->fe_key.fk_sport),
			    input ? 0 : 1);

			/* publish RTT stats into the flow_stats object; */
			/* store both unconditionally to keep this branch-free */
			fe->fe_stats->fs_lrtt = fe->fe_ltrack.fse_rtt.frtt_usec;
			fe->fe_stats->fs_rrtt = fe->fe_rtrack.fse_rtt.frtt_usec;
		}
	}
}

/*
 * The TCP connection tracking logic is based on Guido van Rooij's paper:
 * http://www.sane.nl/events/sane2000/papers/rooij.pdf
 *
 * In some ways, we act as a middlebox that passively tracks the TCP windows
 * of each connection on flows marked with FLOWENTF_TRACK. We never modify
 * the packet or generate any response (e.g. RST) to the sender; thus we are
 * simply a silent observer. The information we gather here is used later
 * if we need to generate a valid {FIN|RST} segment when the flow is nonviable.
 *
 * The implementation is borrowed from Packet Filter, and is further
 * simplified to cater for our use cases.
 */
#define FTF_HALFCLOSED	0x1     /* want flow to be marked as half closed */
#define FTF_WAITCLOSE	0x2     /* want flow to linger after close */
#define FTF_CLOSENOTIFY	0x4     /* want to notify NECP upon teardown */
#define FTF_WITHDRAWN	0x8     /* want flow to be torn down */
#define FTF_SYN_RLIM	0x10    /* want flow to rate limit SYN */
#define FTF_RST_RLIM	0x20    /* want flow to rate limit RST */
__attribute__((always_inline))
static inline int
flow_track_tcp(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt, bool input)
{
	const uint8_t tcp_flags = pkt->pkt_flow_tcp_flags;
	uint16_t win = ntohs(pkt->pkt_flow_tcp_win);
	uint32_t ack, end, seq, orig_seq;
	uint32_t ftflags = 0;
	uint8_t sws, dws;
	int ackskew, err = 0;

	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
		flow_track_tcp_init(fe, src, dst, pkt);
	}

	flow_track_tcp_rtt(fe, input, src, dst, tcp_flags,
	    ntohl(pkt->pkt_flow_tcp_seq), ntohl(pkt->pkt_flow_tcp_ack),
	    pkt->pkt_flow_ulen);

	if (__improbable(dst->fse_state >= TCPS_FIN_WAIT_2 &&
	    src->fse_state >= TCPS_FIN_WAIT_2)) {
		if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
			src->fse_state = dst->fse_state = TCPS_CLOSED;
			ftflags |= FTF_SYN_RLIM;
		}
		if (tcp_flags & TH_RST) {
			ftflags |= FTF_RST_RLIM;
		}
		if (input) {
			err = ENETRESET;
		}
		goto done;
	}

	if (__probable((tcp_flags & TH_SYN) == 0 &&
	    src->fse_wscale != 0 && dst->fse_wscale != 0)) {
		sws = src->fse_wscale;
		dws = dst->fse_wscale;
	} else {
		sws = dws = 0;
	}

	orig_seq = seq = ntohl(pkt->pkt_flow_tcp_seq);
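	/* a nonzero fse_seqlo means this end has been seen before */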
	if (__probable(src->fse_seqlo != 0)) {
		ack = ntohl(pkt->pkt_flow_tcp_ack);
		end = seq + pkt->pkt_flow_ulen;
		if (tcp_flags & TH_SYN) {
			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
				ftflags |= FTF_SYN_RLIM;
			}
			end++;
		}
		if (tcp_flags & TH_FIN) {
			end++;
		}
		if (tcp_flags & TH_RST) {
			ftflags |= FTF_RST_RLIM;
		}
	} else {
		/* first packet from this end; set its state */
		ack = ntohl(pkt->pkt_flow_tcp_ack);

		/* We saw the first SYN, but the stack did not reply with a SYN */
		if (dst->fse_state == TCPS_SYN_SENT && ((tcp_flags & TH_SYN) == 0)) {
			/* Act as if no sequence number is set */
			seq = 0;
			/* Pretend the outgoing SYN was not ACK'ed */
			ack = dst->fse_seqlo;
		}

		end = seq + pkt->pkt_flow_ulen;
		if (tcp_flags & TH_SYN) {
			if ((tcp_flags & (TH_SYN | TH_ACK)) == TH_SYN) {
				ftflags |= FTF_SYN_RLIM;
			}
			end++;
			if (dst->fse_flags & FLOWSTATEF_WSCALE) {
				flow_track_tcp_get_wscale(src, pkt);
				if (src->fse_flags & FLOWSTATEF_WSCALE) {
					/*
					 * Remove scale factor from
					 * initial window.
					 */
					sws = src->fse_wscale;
					win = (uint16_t)(((u_int32_t)win + (1 << sws) - 1)
					    >> sws);
					dws = dst->fse_wscale;
				} else {
					/* fixup other window */
					dst->fse_max_win = (uint16_t)(dst->fse_max_win << dst->fse_wscale);
					/* in case of a retrans SYN|ACK */
					dst->fse_wscale = 0;
				}
			}
		}
		if (tcp_flags & TH_FIN) {
			end++;
		}
		if (tcp_flags & TH_RST) {
			ftflags |= FTF_RST_RLIM;
		}

		src->fse_seqlo = seq;
		if (src->fse_state < TCPS_SYN_SENT) {
			if (tcp_flags & TH_SYN) {
				src->fse_state = TCPS_SYN_SENT;
			} else {
				/* Picking up the connection in the middle */
				src->fse_state = TCPS_ESTABLISHED;
			}
		}

		/*
		 * May need to slide the window (seqhi may have been set by
		 * the crappy stack check or if we picked up the connection
		 * after establishment).
		 */
		if (src->fse_seqhi == 1 || SEQ_GEQ(end +
		    MAX(1, dst->fse_max_win << dws), src->fse_seqhi)) {
			src->fse_seqhi = end + MAX(1, dst->fse_max_win << dws);
		}
		if (win > src->fse_max_win) {
			src->fse_max_win = win;
		}
	}

	if (!(tcp_flags & TH_ACK)) {
		/* let it pass through the ack skew check */
		ack = dst->fse_seqlo;
	} else if ((ack == 0 &&
	    (tcp_flags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) ||
	    /* broken tcp stacks do not set ack */
	    (dst->fse_state < TCPS_SYN_SENT)) {
		/*
		 * Many stacks (ours included) will set the ACK number in an
		 * FIN|ACK if the SYN times out -- no sequence to ACK.
		 */
		ack = dst->fse_seqlo;
	}

	if (seq == end) {
		/* ease sequencing restrictions on no data packets */
		seq = src->fse_seqlo;
		end = seq;
	}

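	/* how far this ACK trails (positive) or leads (negative) the peer's tracked sequence */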
	ackskew = dst->fse_seqlo - ack;

#define MAXACKWINDOW (0xffff + 1500)    /* 1500 is an arbitrary fudge factor */
	if (SEQ_GEQ(src->fse_seqhi, end) &&
	    /* last octet inside other's window space */
	    SEQ_GEQ(seq, src->fse_seqlo - (dst->fse_max_win << dws)) &&
	    /* retrans: not more than one window back */
	    (ackskew >= -MAXACKWINDOW) &&
	    /* acking not more than one reassembled fragment backwards */
	    (ackskew <= (MAXACKWINDOW << sws)) &&
	    /* acking not more than one window forward */
	    (!(tcp_flags & TH_RST) || orig_seq == src->fse_seqlo ||
	    (orig_seq == src->fse_seqlo + 1) ||
	    (orig_seq + 1 == src->fse_seqlo))) {
		/* require an exact/+1 sequence match on resets when possible */

		/* update max window */
		if (src->fse_max_win < win) {
			src->fse_max_win = win;
		}
		/* synchronize sequencing */
		if (SEQ_GT(end, src->fse_seqlo)) {
			src->fse_seqlo = end;
		}
		/* slide the window of what the other end can send */
		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
			dst->fse_seqhi = ack + MAX((win << sws), 1);
		}

		/* update states */
		if (tcp_flags & TH_SYN) {
			if (src->fse_state < TCPS_SYN_SENT) {
				src->fse_state = TCPS_SYN_SENT;
			}
		}
		if (tcp_flags & TH_FIN) {
			if (src->fse_state < TCPS_CLOSING) {
				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
				src->fse_state = TCPS_CLOSING;
			}
		}
		if (tcp_flags & TH_ACK) {
			/*
			 * Avoid transitioning to ESTABLISHED when our SYN
			 * is ACK'd along with a RST. The sending TCP may
			 * still retransmit the SYN (after dropping some
			 * options like ECN, etc.)
			 */
			if (dst->fse_state == TCPS_SYN_SENT &&
			    !(tcp_flags & TH_RST)) {
				dst->fse_state = TCPS_ESTABLISHED;
				ftflags |= (FTF_WAITCLOSE | FTF_CLOSENOTIFY);
			} else if (dst->fse_state == TCPS_CLOSING &&
			    ack == dst->fse_seqlast + 1) {
				dst->fse_state = TCPS_FIN_WAIT_2;
				ftflags |= FTF_WAITCLOSE;
				if (src->fse_state >= TCPS_FIN_WAIT_2) {
					ftflags |= FTF_WITHDRAWN;
				} else {
					ftflags |= FTF_HALFCLOSED;
				}
			}
		}
		if ((tcp_flags & TH_RST) &&
		    (src->fse_state == TCPS_ESTABLISHED ||
		    dst->fse_state == TCPS_ESTABLISHED)) {
			/*
			 * If either endpoint is in ESTABLISHED, transition
			 * both to TIME_WAIT. Otherwise, keep the existing
			 * state as is, e.g. SYN_SENT.
			 */
			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
			ftflags |= (FTF_WITHDRAWN | FTF_WAITCLOSE);
		}
	} else if ((dst->fse_state < TCPS_SYN_SENT ||
	    dst->fse_state >= TCPS_FIN_WAIT_2 ||
	    src->fse_state >= TCPS_FIN_WAIT_2) &&
	    SEQ_GEQ(src->fse_seqhi + MAXACKWINDOW, end) &&
	    /* within a window forward of the originating packet */
	    SEQ_GEQ(seq, src->fse_seqlo - MAXACKWINDOW)) {
		/* within a window backward of the originating packet */

		/* BEGIN CSTYLED */
		/*
		 * This currently handles three situations:
		 *  1) Stupid stacks will shotgun SYNs before their peer
		 *     replies.
		 *  2) When flow tracking catches an already established
		 *     stream (the flow states are cleared, etc.)
		 *  3) Packets get funky immediately after the connection
		 *     closes (this should catch spurious ACK|FINs that
		 *     web servers like to spew after a close).
		 *
		 * This must be a little more careful than the above code
		 * since packet floods will also be caught here.
		 */
		/* END CSTYLED */

		/* update max window */
		if (src->fse_max_win < win) {
			src->fse_max_win = win;
		}
		/* synchronize sequencing */
		if (SEQ_GT(end, src->fse_seqlo)) {
			src->fse_seqlo = end;
		}
		/* slide the window of what the other end can send */
		if (SEQ_GEQ(ack + (win << sws), dst->fse_seqhi)) {
			dst->fse_seqhi = ack + MAX((win << sws), 1);
		}

		/*
		 * Cannot set dst->fse_seqhi here since this could be a
		 * shotgunned SYN and not an already established connection.
		 */

		if (tcp_flags & TH_FIN) {
			if (src->fse_state < TCPS_CLOSING) {
				src->fse_seqlast = orig_seq + pkt->pkt_flow_ulen;
				src->fse_state = TCPS_CLOSING;
			}
		}
		if (tcp_flags & TH_RST) {
			src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
			ftflags |= FTF_WAITCLOSE;
		}
	} else {
		if (dst->fse_state == TCPS_SYN_SENT &&
		    src->fse_state == TCPS_SYN_SENT) {
			src->fse_seqlo = 0;
			src->fse_seqhi = 1;
			src->fse_max_win = 1;
		}
	}

done:
	if (__improbable((ftflags & FTF_HALFCLOSED) != 0)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
		ftflags &= ~FTF_HALFCLOSED;
	}

	/*
	 * Hold on to namespace for a while after the flow is closed.
	 */
	if (__improbable((ftflags & FTF_WAITCLOSE) != 0 &&
	    (fe->fe_flags & FLOWENTF_WAIT_CLOSE) == 0)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_WAIT_CLOSE, relaxed);
		ftflags &= ~FTF_WAITCLOSE;
	}

	/*
	 * Notify NECP upon teardown (for established flows).
	 */
	if (__improbable((ftflags & FTF_CLOSENOTIFY) != 0 &&
	    (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) == 0)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_CLOSE_NOTIFY, relaxed);
		ftflags &= ~FTF_CLOSENOTIFY;
	}

	/*
	 * Flow is withdrawn; the port we have should not be included in
	 * the list of offloaded ports, as the connection is no longer
	 * usable (we're not expecting any more data).
	 * Also clear the FLOWENTF_HALF_CLOSED flag here. It's fine if the
	 * reaper thread hasn't picked up FLOWENTF_HALF_CLOSED yet, as it
	 * will pick up FLOWENTF_WITHDRAWN and notify netns of the full
	 * withdrawal.
	 */
	if (__improbable((ftflags & FTF_WITHDRAWN) != 0)) {
		ftflags &= ~FTF_WITHDRAWN;
		if (fe->fe_flags & FLOWENTF_HALF_CLOSED) {
			os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
		}
		fe->fe_want_withdraw = 1;
	}

	/*
	 * If no other work is needed, we're done.
	 */
	if (ftflags == 0 || input) {
		return err;
	}

	/*
	 * If we're over the rate limit for outbound SYNs, drop packet.
	 */
	if (__improbable((ftflags & FTF_SYN_RLIM) != 0)) {
		uint32_t now = (uint32_t)_net_uptime;
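		/* restart the one-second window if the last SYN was over a second ago */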
		if ((now - src->fse_syn_ts) > 1) {
			src->fse_syn_ts = now;
			src->fse_syn_cnt = 0;
		}
		if (++src->fse_syn_cnt > FLOWTRACK_SYN_RATE) {
			err = EPROTO;
		}
	}

	return err;
}
#undef FTF_HALFCLOSED
#undef FTF_WAITCLOSE
#undef FTF_CLOSENOTIFY
#undef FTF_WITHDRAWN
#undef FTF_SYN_RLIM
#undef FTF_RST_RLIM

boolean_t
flow_track_tcp_want_abort(struct flow_entry *fe)
{
	struct flow_track *src = &fe->fe_ltrack;
	struct flow_track *dst = &fe->fe_rtrack;

	if (fe->fe_key.fk_proto != IPPROTO_TCP ||
	    (fe->fe_flags & FLOWENTF_ABORTED)) {
		goto done;
	}

	/* this can be enhanced; for now rely on established state */
	if (src->fse_state == TCPS_ESTABLISHED ||
	    dst->fse_state == TCPS_ESTABLISHED) {
		src->fse_state = dst->fse_state = TCPS_TIME_WAIT;
		/* don't process more than once */
		os_atomic_or(&fe->fe_flags, FLOWENTF_ABORTED, relaxed);
		return TRUE;
	}
done:
	return FALSE;
}

static void
flow_track_udp_init(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt)
{
#pragma unused(pkt)
	/*
	 * Source state initialization.
	 */
	src->fse_state = FT_STATE_NO_TRAFFIC;

	/*
	 * Destination state initialization.
	 */
	dst->fse_state = FT_STATE_NO_TRAFFIC;

	os_atomic_or(&fe->fe_flags, FLOWENTF_INITED, relaxed);
}

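/*
 * UDP "state" is a simple liveness ladder: an endpoint moves from
 * NO_TRAFFIC to SINGLE when it first sends, and its peer moves from
 * SINGLE to MULTIPLE once traffic has been seen in both directions.
 */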
__attribute__((always_inline))
static inline int
flow_track_udp(struct flow_entry *fe, struct flow_track *src,
    struct flow_track *dst, struct __kern_packet *pkt, bool input)
{
#pragma unused(input)
	if (__improbable((fe->fe_flags & FLOWENTF_INITED) == 0)) {
		flow_track_udp_init(fe, src, dst, pkt);
	}

	if (__improbable(src->fse_state == FT_STATE_NO_TRAFFIC)) {
		src->fse_state = FT_STATE_SINGLE;
	}
	if (__improbable(dst->fse_state == FT_STATE_SINGLE)) {
		dst->fse_state = FT_STATE_MULTIPLE;
	}

	return 0;
}

void
flow_track_stats(struct flow_entry *fe, uint64_t bytes, uint64_t packets,
    bool active, bool in)
{
	volatile struct sk_stats_flow_track *fst;

	if (in) {
		fst = &fe->fe_stats->fs_rtrack;
	} else {
		fst = &fe->fe_stats->fs_ltrack;
	}

	fst->sft_bytes += bytes;
	fst->sft_packets += packets;

	if (__probable(active)) {
		in_stat_set_activity_bitmap(&fe->fe_stats->fs_activity,
		    _net_uptime);
	}
}

int
flow_pkt_track(struct flow_entry *fe, struct __kern_packet *pkt, bool in)
{
	struct flow_track *src, *dst;
	int ret = 0;

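	/* the FT_STATE_* values must mirror both the SFT_STATE_* and TCPS_* enums */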
	_CASSERT(SFT_STATE_CLOSED == FT_STATE_CLOSED);
	_CASSERT(SFT_STATE_LISTEN == FT_STATE_LISTEN);
	_CASSERT(SFT_STATE_SYN_SENT == FT_STATE_SYN_SENT);
	_CASSERT(SFT_STATE_SYN_RECEIVED == FT_STATE_SYN_RECEIVED);
	_CASSERT(SFT_STATE_ESTABLISHED == FT_STATE_ESTABLISHED);
	_CASSERT(SFT_STATE_CLOSE_WAIT == FT_STATE_CLOSE_WAIT);
	_CASSERT(SFT_STATE_FIN_WAIT_1 == FT_STATE_FIN_WAIT_1);
	_CASSERT(SFT_STATE_CLOSING == FT_STATE_CLOSING);
	_CASSERT(SFT_STATE_LAST_ACK == FT_STATE_LAST_ACK);
	_CASSERT(SFT_STATE_FIN_WAIT_2 == FT_STATE_FIN_WAIT_2);
	_CASSERT(SFT_STATE_TIME_WAIT == FT_STATE_TIME_WAIT);
	_CASSERT(SFT_STATE_NO_TRAFFIC == FT_STATE_NO_TRAFFIC);
	_CASSERT(SFT_STATE_SINGLE == FT_STATE_SINGLE);
	_CASSERT(SFT_STATE_MULTIPLE == FT_STATE_MULTIPLE);
	_CASSERT(SFT_STATE_MAX == FT_STATE_MAX);

	_CASSERT(FT_STATE_CLOSED == TCPS_CLOSED);
	_CASSERT(FT_STATE_LISTEN == TCPS_LISTEN);
	_CASSERT(FT_STATE_SYN_SENT == TCPS_SYN_SENT);
	_CASSERT(FT_STATE_SYN_RECEIVED == TCPS_SYN_RECEIVED);
	_CASSERT(FT_STATE_ESTABLISHED == TCPS_ESTABLISHED);
	_CASSERT(FT_STATE_CLOSE_WAIT == TCPS_CLOSE_WAIT);
	_CASSERT(FT_STATE_FIN_WAIT_1 == TCPS_FIN_WAIT_1);
	_CASSERT(FT_STATE_CLOSING == TCPS_CLOSING);
	_CASSERT(FT_STATE_LAST_ACK == TCPS_LAST_ACK);
	_CASSERT(FT_STATE_FIN_WAIT_2 == TCPS_FIN_WAIT_2);
	_CASSERT(FT_STATE_TIME_WAIT == TCPS_TIME_WAIT);

	ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);

	if (in) {
		src = &fe->fe_rtrack;
		dst = &fe->fe_ltrack;
	} else {
		src = &fe->fe_ltrack;
		dst = &fe->fe_rtrack;
	}

	flow_track_stats(fe, (pkt->pkt_length - pkt->pkt_l2_len), 1,
	    (pkt->pkt_flow_ulen != 0), in);

	/* skip flow state tracking on non-initial fragments */
	if (pkt->pkt_flow_ip_is_frag && !pkt->pkt_flow_ip_is_first_frag) {
		return 0;
	}

	switch (pkt->pkt_flow_ip_proto) {
	case IPPROTO_TCP:
		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
			ret = flow_track_tcp(fe, src, dst, pkt, in);
		}
		break;

	case IPPROTO_UDP:
		if (__probable((fe->fe_flags & FLOWENTF_TRACK) != 0)) {
			ret = flow_track_udp(fe, src, dst, pkt, in);
		}
		break;
	}

	return ret;
}

/*
 * @function flow_track_abort_tcp
 * @abstract send RST for a given TCP flow.
 * @param fe flow entry for the flow being aborted.
 * @param in_pkt incoming packet that triggers the RST.
 * @param rst_pkt used as the RST template for SEQ/ACK information.
 */
void
flow_track_abort_tcp(struct flow_entry *fe, struct __kern_packet *in_pkt,
    struct __kern_packet *rst_pkt)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct flow_track *src, *dst;
	struct ip *ip;
	struct ip6_hdr *ip6;
	struct tcphdr *th;
	uint16_t len, tlen;
	struct mbuf *m;

	/* guaranteed by caller */
	ASSERT(fsw->fsw_ifp != NULL);
	ASSERT(in_pkt == NULL || rst_pkt == NULL);

	src = &fe->fe_ltrack;
	dst = &fe->fe_rtrack;

	tlen = sizeof(struct tcphdr);
	if (fe->fe_key.fk_ipver == IPVERSION) {
		len = sizeof(struct ip) + tlen;
	} else {
		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
		len = sizeof(struct ip6_hdr) + tlen;
	}

	m = m_gethdr(M_NOWAIT, MT_HEADER);
	if (__improbable(m == NULL)) {
		return;
	}

	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
	m->m_data += max_linkhdr;               /* 32-bit aligned */
	m->m_pkthdr.len = m->m_len = len;

	/* zero out for checksum */
	bzero(m_mtod_current(m), len);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		ip = mtod(m, struct ip *);

		/* IP header fields included in the TCP checksum */
		ip->ip_p = IPPROTO_TCP;
		ip->ip_len = htons(tlen);
		if (rst_pkt == NULL) {
			ip->ip_src = fe->fe_key.fk_src4;
			ip->ip_dst = fe->fe_key.fk_dst4;
		} else {
			ip->ip_src = rst_pkt->pkt_flow_ipv4_src;
			ip->ip_dst = rst_pkt->pkt_flow_ipv4_dst;
		}

		th = (struct tcphdr *)(void *)((char *)ip + sizeof(*ip));
	} else {
		ip6 = mtod(m, struct ip6_hdr *);

		/* IP header fields included in the TCP checksum */
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons(tlen);
		if (rst_pkt == NULL) {
			ip6->ip6_src = fe->fe_key.fk_src6;
			ip6->ip6_dst = fe->fe_key.fk_dst6;
		} else {
			ip6->ip6_src = rst_pkt->pkt_flow_ipv6_src;
			ip6->ip6_dst = rst_pkt->pkt_flow_ipv6_dst;
		}

		th = (struct tcphdr *)(void *)((char *)ip6 + sizeof(*ip6));
	}

	/*
	 * TCP header (fabricate a pure RST).
	 */
	if (in_pkt != NULL) {
		th->th_sport = in_pkt->pkt_flow_tcp_dst;
		th->th_dport = in_pkt->pkt_flow_tcp_src;
		if (__probable(in_pkt->pkt_flow_tcp_flags & TH_ACK)) {
			/* <SEQ=SEG.ACK><CTL=RST> */
			th->th_seq = in_pkt->pkt_flow_tcp_ack;
			th->th_ack = 0;
			th->th_flags = TH_RST;
		} else {
			/* <SEQ=0><ACK=SEG.SEQ+SEG.LEN><CTL=RST,ACK> */
			th->th_seq = 0;
			th->th_ack = in_pkt->pkt_flow_tcp_seq +
			    in_pkt->pkt_flow_ulen;
			th->th_flags = TH_RST | TH_ACK;
		}
	} else if (rst_pkt != NULL) {
		th->th_sport = rst_pkt->pkt_flow_tcp_src;
		th->th_dport = rst_pkt->pkt_flow_tcp_dst;
		th->th_seq = rst_pkt->pkt_flow_tcp_seq;
		th->th_ack = rst_pkt->pkt_flow_tcp_ack;
		th->th_flags = rst_pkt->pkt_flow_tcp_flags;
	} else {
		th->th_sport = fe->fe_key.fk_sport;
		th->th_dport = fe->fe_key.fk_dport;
		th->th_seq = htonl(src->fse_seqlo);     /* peer's last ACK */
		th->th_ack = 0;
		th->th_flags = TH_RST;
	}
	th->th_off = (tlen >> 2);
	th->th_win = 0;

	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		struct ip_out_args ipoa;
		struct route ro;

		bzero(&ipoa, sizeof(ipoa));
		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
		    IPOAF_BOUND_SRCADDR);
		ipoa.ipoa_sotc = SO_TC_UNSPEC;
		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* TCP checksum */
		th->th_sum = in_cksum(m, len);

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = 0;
		/*
		 * ip_output() expects ip_len and ip_off to be in host order.
		 */
		ip->ip_len = len;
		ip->ip_off = IP_DF;
		ip->ip_ttl = (uint8_t)ip_defttl;
		ip->ip_sum = 0;

		bzero(&ro, sizeof(ro));
		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
		ROUTE_RELEASE(&ro);
	} else {
		struct ip6_out_args ip6oa;
		struct route_in6 ro6;

		bzero(&ip6oa, sizeof(ip6oa));
		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
		    IP6OAF_BOUND_SRCADDR);
		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		/* TCP checksum */
		th->th_sum = in6_cksum(m, IPPROTO_TCP,
		    sizeof(struct ip6_hdr), tlen);

		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = IPV6_DEFHLIM;

		bzero(&ro6, sizeof(ro6));
		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
		    NULL, NULL, &ip6oa);
		ROUTE_RELEASE(&ro6);
	}
}

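/*
 * @function flow_track_abort_quic
 * @abstract send a QUIC stateless reset for a given flow.
 * @param token stateless reset token to embed in the reset datagram.
 */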
void
flow_track_abort_quic(struct flow_entry *fe, uint8_t *token)
{
	struct quic_stateless_reset {
		uint8_t ssr_header[30];
		uint8_t ssr_token[QUIC_STATELESS_RESET_TOKEN_SIZE];
	};
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct ip *ip;
	struct ip6_hdr *ip6;
	struct udphdr *uh;
	struct quic_stateless_reset *qssr;
	uint16_t len, l3hlen, ulen;
	struct mbuf *m;
	unsigned int one = 1;
	int error;

	/* guaranteed by caller */
	ASSERT(fsw->fsw_ifp != NULL);

	/* skip zero token */
	bool is_zero_token = true;
	for (size_t i = 0; i < QUIC_STATELESS_RESET_TOKEN_SIZE; i++) {
		if (token[i] != 0) {
			is_zero_token = false;
			break;
		}
	}
	if (is_zero_token) {
		return;
	}

	ulen = sizeof(struct udphdr) + sizeof(struct quic_stateless_reset);
	if (fe->fe_key.fk_ipver == IPVERSION) {
		l3hlen = sizeof(struct ip);
	} else {
		ASSERT(fe->fe_key.fk_ipver == IPV6_VERSION);
		l3hlen = sizeof(struct ip6_hdr);
	}

	len = l3hlen + ulen;

	error = mbuf_allocpacket(MBUF_DONTWAIT, max_linkhdr + len, &one, &m);
	if (__improbable(error != 0)) {
		return;
	}
	VERIFY(m != NULL);

	m->m_pkthdr.pkt_proto = IPPROTO_UDP;
	m->m_data += max_linkhdr;               /* 32-bit aligned */
	m->m_pkthdr.len = m->m_len = len;

	/* zero out for checksum */
	bzero(m_mtod_current(m), len);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		ip = mtod(m, struct ip *);
		ip->ip_p = IPPROTO_UDP;
		ip->ip_len = htons(ulen);
		ip->ip_src = fe->fe_key.fk_src4;
		ip->ip_dst = fe->fe_key.fk_dst4;
		uh = (struct udphdr *)(void *)((char *)ip + sizeof(*ip));
	} else {
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_nxt = IPPROTO_UDP;
		ip6->ip6_plen = htons(ulen);
		ip6->ip6_src = fe->fe_key.fk_src6;
		ip6->ip6_dst = fe->fe_key.fk_dst6;
		uh = (struct udphdr *)(void *)((char *)ip6 + sizeof(*ip6));
	}

	/* UDP header */
	uh->uh_sport = fe->fe_key.fk_sport;
	uh->uh_dport = fe->fe_key.fk_dport;
	uh->uh_ulen = htons(ulen);

	/* QUIC stateless reset */
	qssr = (struct quic_stateless_reset *)(uh + 1);
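	/*
	 * Per RFC 9000, a stateless reset must be indistinguishable from an
	 * ordinary short-header packet: random bytes, with the first byte
	 * set to 0b01xxxxxx (long-header bit clear, fixed bit set), and the
	 * reset token occupying the final bytes of the datagram.
	 */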
	read_frandom(&qssr->ssr_header, sizeof(qssr->ssr_header));
	qssr->ssr_header[0] = (qssr->ssr_header[0] & 0x3f) | 0x40;
	memcpy(qssr->ssr_token, token, QUIC_STATELESS_RESET_TOKEN_SIZE);

	FSW_STATS_INC(FSW_STATS_FLOWS_ABORTED);

	if (fe->fe_key.fk_ipver == IPVERSION) {
		struct ip_out_args ipoa;
		struct route ro;

		bzero(&ipoa, sizeof(ipoa));
		ipoa.ipoa_boundif = fsw->fsw_ifp->if_index;
		ipoa.ipoa_flags = (IPOAF_SELECT_SRCIF | IPOAF_BOUND_IF |
		    IPOAF_BOUND_SRCADDR);
		ipoa.ipoa_sotc = SO_TC_UNSPEC;
		ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

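		/* RFC 768: a computed UDP checksum of zero is sent as all ones */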
		uh->uh_sum = in_cksum(m, len);
		if (uh->uh_sum == 0) {
			uh->uh_sum = 0xffff;
		}

		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(*ip) >> 2;
		ip->ip_tos = 0;
		/*
		 * ip_output() expects ip_len and ip_off to be in host order.
		 */
		ip->ip_len = len;
		ip->ip_off = IP_DF;
		ip->ip_ttl = (uint8_t)ip_defttl;
		ip->ip_sum = 0;

		bzero(&ro, sizeof(ro));
		(void) ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
		ROUTE_RELEASE(&ro);
	} else {
		struct ip6_out_args ip6oa;
		struct route_in6 ro6;

		bzero(&ip6oa, sizeof(ip6oa));
		ip6oa.ip6oa_boundif = fsw->fsw_ifp->if_index;
		ip6oa.ip6oa_flags = (IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_IF |
		    IP6OAF_BOUND_SRCADDR);
		ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
		ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;

		uh->uh_sum = in6_cksum(m, IPPROTO_UDP, sizeof(struct ip6_hdr),
		    ulen);
		if (uh->uh_sum == 0) {
			uh->uh_sum = 0xffff;
		}

		ip6->ip6_vfc |= IPV6_VERSION;
		ip6->ip6_hlim = IPV6_DEFHLIM;

		bzero(&ro6, sizeof(ro6));
		(void) ip6_output(m, NULL, &ro6, IPV6_OUTARGS,
		    NULL, NULL, &ip6oa);
		ROUTE_RELEASE(&ro6);
	}
}