1/*
2 * Copyright (c) 2019-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <skywalk/os_skywalk_private.h>
30#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
31#include <skywalk/nexus/flowswitch/fsw_var.h>
32#include <skywalk/nexus/flowswitch/flow/flow_var.h>
33#include <skywalk/nexus/netif/nx_netif.h>
34#include <skywalk/nexus/netif/nx_netif_compat.h>
35#include <netinet/tcp.h>
36#include <netinet/ip.h>
37#include <netinet/ip6.h>
38#include <net/pktap.h>
39#include <sys/sdt.h>
40
41#define MAX_AGG_IP_LEN() MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET)
42#define MAX_BUFLET_COUNT (32)
43#define TCP_FLAGS_IGNORE (TH_FIN|TH_SYN|TH_RST|TH_URG)
44#define PKT_IS_MBUF(_pkt) (_pkt->pkt_pflags & PKT_F_MBUF_DATA)
45#define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) && \
46 (_pkt->pkt_pflags & PKT_F_TRUNCATED))
47#define PKT_IS_WAKE_PKT(_pkt) ((PKT_IS_MBUF(_pkt) && \
48 (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
49 (!PKT_IS_MBUF(_pkt) && \
50 (_pkt->pkt_pflags & PKT_F_WAKE_PKT)))
51
52
53typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t);
54
55static uint16_t
56flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new);
57
58static uint16_t
59flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new);
60
61/*
62 * This structure holds per-super object (mbuf/packet) flow aggregation.
63 */
64struct flow_agg {
65 union {
66 struct {
67 union {
68 void * _fa_sobj;
69 struct mbuf * _fa_smbuf; /* super mbuf */
70 struct __kern_packet *_fa_spkt; /* super pkt */
71 };
72 uint8_t *_fa_sptr; /* ptr to super IP header */
73 bool _fa_sobj_is_pkt; /* super obj is pkt or mbuf */
74 /*
75 * super obj is not large enough to hold the IP & TCP
76 * header in a contiguous buffer.
77 */
78 bool _fa_sobj_is_short;
79 uint32_t _fa_tcp_seq; /* expected next sequence # */
80 uint32_t _fa_ulen; /* expected next ulen */
81 uint32_t _fa_total; /* total aggregated bytes */
82 /* function that fix packet checksum */
83 flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum;
84 } __flow_agg;
85 uint64_t __flow_agg_data[5];
86 };
87#define fa_sobj __flow_agg._fa_sobj
88#define fa_smbuf __flow_agg._fa_smbuf
89#define fa_spkt __flow_agg._fa_spkt
90#define fa_sptr __flow_agg._fa_sptr
91#define fa_sobj_is_pkt __flow_agg._fa_sobj_is_pkt
92#define fa_sobj_is_short __flow_agg._fa_sobj_is_short
93#define fa_tcp_seq __flow_agg._fa_tcp_seq
94#define fa_ulen __flow_agg._fa_ulen
95#define fa_total __flow_agg._fa_total
96#define fa_fix_pkt_sum __flow_agg._fa_fix_pkt_sum
97};
98
99#define FLOW_AGG_CLEAR(_fa) do { \
100 _CASSERT(sizeof(struct flow_agg) == 40); \
101 _CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32); \
102 sk_zero_32(_fa); \
103 (_fa)->fa_fix_pkt_sum = 0; \
104} while (0)
105
106#define MASK_SIZE 80 /* size of struct {ip,ip6}_tcp_mask */
107
108struct ip_tcp_mask {
109 struct ip ip_m;
110 struct tcphdr tcp_m;
111 uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)];
112};
113
114static const struct ip_tcp_mask ip_tcp_mask
115__sk_aligned(16) =
116{
117 .ip_m = {
118 .ip_hl = 0xf,
119 .ip_v = 0xf,
120 .ip_tos = 0xff,
121 /* Not checked; aggregated packet's ip_len is increasing */
122 .ip_len = 0,
123 .ip_id = 0,
124 .ip_off = 0xffff,
125 .ip_ttl = 0xff,
126 .ip_p = 0xff,
127 .ip_sum = 0,
128 .ip_src.s_addr = 0xffffffff,
129 .ip_dst.s_addr = 0xffffffff,
130 },
131 .tcp_m = {
132 .th_sport = 0xffff,
133 .th_dport = 0xffff,
134 .th_seq = 0,
135 .th_ack = 0xffffffff,
136 .th_x2 = 0xf,
137 .th_off = 0xf,
138 .th_flags = ~TH_PUSH,
139 .th_win = 0xffff,
140 .th_sum = 0,
141 .th_urp = 0xffff,
142 },
143 .tcp_option_m = {
144 /* Max 40 bytes of TCP options */
145 0xffffffff,
146 0xffffffff,
147 0xffffffff,
148 0, /* Filling up to MASK_SIZE */
149 0, /* Filling up to MASK_SIZE */
150 0, /* Filling up to MASK_SIZE */
151 0, /* Filling up to MASK_SIZE */
152 0, /* Filling up to MASK_SIZE */
153 0, /* Filling up to MASK_SIZE */
154 0, /* Filling up to MASK_SIZE */
155 },
156};
157
158struct ip6_tcp_mask {
159 struct ip6_hdr ip6_m;
160 struct tcphdr tcp_m;
161 uint32_t tcp_option_m[5]; /* 5 bytes to fill up to MASK_SIZE */
162};
163
164static const struct ip6_tcp_mask ip6_tcp_mask
165__sk_aligned(16) =
166{
167 .ip6_m = {
168 .ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff,
169 /* Not checked; aggregated packet's ip_len is increasing */
170 .ip6_ctlun.ip6_un1.ip6_un1_plen = 0,
171 .ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff,
172 .ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff,
173 .ip6_src.__u6_addr.__u6_addr32[0] = 0xffffff,
174 .ip6_src.__u6_addr.__u6_addr32[1] = 0xffffff,
175 .ip6_src.__u6_addr.__u6_addr32[2] = 0xffffff,
176 .ip6_src.__u6_addr.__u6_addr32[3] = 0xffffff,
177 .ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffff,
178 .ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffff,
179 .ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffff,
180 .ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffff,
181 },
182 .tcp_m = {
183 .th_sport = 0xffff,
184 .th_dport = 0xffff,
185 .th_seq = 0,
186 .th_ack = 0xffffffff,
187 .th_x2 = 0xf,
188 .th_off = 0xf,
189 .th_flags = ~TH_PUSH,
190 .th_win = 0xffff,
191 .th_sum = 0,
192 .th_urp = 0xffff,
193 },
194 .tcp_option_m = {
195 /* Max 40 bytes of TCP options */
196 0xffffffff,
197 0xffffffff,
198 0xffffffff,
199 0, /* Filling up to MASK_SIZE */
200 0, /* Filling up to MASK_SIZE */
201 },
202};
203
204#if SK_LOG
205SK_LOG_ATTRIBUTE
206static void
207_pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input)
208{
209 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
210 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
211
212 kern_packet_t ph = SK_PKT2PH(pkt);
213 uint64_t bufcnt = 1;
214 if (!is_input) {
215 bufcnt = kern_packet_get_buflet_count(ph);
216 }
217
218 SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u",
219 sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s":"d",
220 SK_KVA(pkt), pkt->pkt_length);
221
222 SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x",
223 is_input ? "s":"d", pkt->pkt_csum_flags,
224 (uint32_t)pkt->pkt_csum_rx_start_off,
225 (uint32_t)pkt->pkt_csum_rx_value);
226
227 if (!is_input) {
228 kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL);
229
230 /* Individual buflets */
231 for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) {
232 SK_DF(logflags | SK_VERB_DUMP, "%s",
233 sk_dump("buf", kern_buflet_get_data_address(buf),
234 pkt->pkt_length, 128, NULL, 0));
235 buf = kern_packet_get_next_buflet(ph, buf);
236 }
237 }
238}
239
240#define pkt_agg_log(_pkt, _p, _is_input) do { \
241 if (__improbable(sk_verbose != 0)) { \
242 _pkt_agg_log(_pkt, _p, _is_input); \
243 } \
244} while (0)
245
246SK_LOG_ATTRIBUTE
247static void
248_mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
249{
250 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
251 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
252
253 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
254 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
255 m->m_pkthdr.len);
256
257 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
258 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
259 (uint32_t)m->m_pkthdr.csum_rx_val);
260
261 /* Dump the first mbuf */
262 ASSERT(m_mtod_current(m) != NULL);
263 SK_DF(logflags | SK_VERB_DUMP, "%s", sk_dump("buf",
264 (uint8_t *)m_mtod_current(m), m->m_len, 128, NULL, 0));
265}
266
267#define mbuf_agg_log(_m, _p, _is_mbuf) do { \
268 if (__improbable(sk_verbose != 0)) { \
269 _mbuf_agg_log(_m, _p, _is_mbuf); \
270 } \
271} while (0)
272
273SK_LOG_ATTRIBUTE
274static void
275_mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf)
276{
277 SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
278 (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
279
280 while (m != NULL) {
281 SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u",
282 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m),
283 m->m_pkthdr.len);
284
285 SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x",
286 m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start,
287 (uint32_t)m->m_pkthdr.csum_rx_val);
288
289 m = m->m_nextpkt;
290 }
291}
292
293#define mchain_agg_log(_m, _p, _is_mbuf) do { \
294 if (__improbable(sk_verbose != 0)) { \
295 _mchain_agg_log(_m, _p, _is_mbuf); \
296 } \
297} while (0)
298#else
299#define pkt_agg_log(...)
300#define mbuf_agg_log(...)
301#define mchain_agg_log(...)
302#endif /* SK_LOG */
303
304/*
305 * Checksum only for packet with mbuf.
306 */
307static bool
308mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3,
309 uint16_t *data_csum)
310{
311 ASSERT(data_csum != NULL);
312
313 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
314 uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen +
315 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
316 uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen;
317 uint16_t start = pkt->pkt_l2_len;
318 uint32_t partial = 0;
319 uint16_t csum = 0;
320
321 ASSERT(plen == m_pktlen(m));
322
323 /* Some compat drivers compute full checksum */
324 if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) ==
325 CSUM_RX_FULL_FLAGS) {
326 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
327 m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start,
328 m->m_pkthdr.csum_rx_val);
329
330 /* Compute the data_csum */
331 struct tcphdr *tcp =
332 (struct tcphdr *)(void *)(mtod(m, uint8_t *) +
333 pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen);
334 /* 16-bit alignment is sufficient */
335 ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));
336
337 uint16_t th_sum = tcp->th_sum;
338 tcp->th_sum = 0;
339
340 partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
341 pkt->pkt_flow_tcp_hlen);
342 partial += htons(l4len + IPPROTO_TCP);
343 if (pkt->pkt_flow_ip_ver == IPVERSION) {
344 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
345 pkt->pkt_flow_ipv4_dst.s_addr, partial);
346 } else {
347 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
348 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
349 &pkt->pkt_flow_ipv6_dst, partial);
350 }
351 /* Restore the original checksum */
352 tcp->th_sum = th_sum;
353 th_sum = __packet_fix_sum(csum: th_sum, old: csum, new: 0);
354 *data_csum = ~th_sum & 0xffff;
355
356 /* pkt metadata will be transfer to super packet */
357 __packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
358 start: 0, stuff_val: m->m_pkthdr.csum_rx_val, false);
359
360 if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) {
361 return true;
362 } else {
363 return false;
364 }
365 }
366 /* Reset the csum RX flags */
367 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
368 if (verify_l3) {
369 csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen);
370 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
371 start, pkt->pkt_flow_ip_hlen, csum);
372 m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
373 if ((csum ^ 0xffff) != 0) {
374 return false;
375 } else {
376 m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
377 }
378 }
379 /* Compute L4 header checksum */
380 partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen,
381 pkt->pkt_flow_tcp_hlen);
382 /* Compute payload checksum */
383 start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
384 *data_csum = m_sum16(m, start, (plen - start));
385
386 /* Fold in the data checksum to TCP checksum */
387 partial += *data_csum;
388 partial += htons(l4len + IPPROTO_TCP);
389 if (pkt->pkt_flow_ip_ver == IPVERSION) {
390 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
391 pkt->pkt_flow_ipv4_dst.s_addr, partial);
392 } else {
393 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
394 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
395 &pkt->pkt_flow_ipv6_dst, partial);
396 }
397 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
398 start - pkt->pkt_flow_tcp_hlen, l4len, csum);
399 // Set start to 0 for full checksum
400 m->m_pkthdr.csum_rx_start = 0;
401 m->m_pkthdr.csum_rx_val = csum;
402 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
403
404 /* pkt metadata will be transfer to super packet */
405 __packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
406 start: 0, stuff_val: csum, false);
407
408 if ((csum ^ 0xffff) != 0) {
409 return false;
410 }
411
412 return true;
413}
414
415/* structure to pass an array of data buffers */
416typedef struct _dbuf_array {
417 union {
418 struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT];
419 struct mbuf *dba_mbuf[MAX_BUFLET_COUNT];
420 };
421 uint8_t dba_num_dbufs;
422 bool dba_is_buflet;
423} _dbuf_array_t;
424
425static inline void
426_copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen,
427 uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf,
428 boolean_t do_csum)
429{
430 uint8_t i = 0;
431 uint32_t buflet_dlim, buflet_dlen, buf_off = 0;
432
433 ASSERT(plen > 0);
434 while (plen > 0) {
435 ASSERT(i < dbuf->dba_num_dbufs);
436 uint32_t dbuf_lim, tmplen;
437 uint8_t *dbuf_addr;
438
439 if (dbuf->dba_is_buflet) {
440 ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0);
441 dbuf_addr = kern_buflet_get_data_address(dbuf->dba_buflet[i]);
442
443 buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]);
444 buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]);
445 buf_off = buflet_dlen;
446 dbuf_lim = buflet_dlim - buf_off;
447 dbuf_addr += buf_off;
448 } else {
449 dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]);
450 dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *);
451 buf_off = dbuf->dba_mbuf[i]->m_len;
452 dbuf_addr += buf_off;
453 }
454 tmplen = min(a: plen, b: dbuf_lim);
455 if (PKT_IS_TRUNC_MBUF(spkt)) {
456 if (do_csum) {
457 *partial_sum = m_copydata_sum(m: spkt->pkt_mbuf,
458 off: soff, len: tmplen, vp: dbuf_addr, initial_sum: *partial_sum,
459 odd_start);
460 } else {
461 m_copydata(spkt->pkt_mbuf, soff, tmplen,
462 dbuf_addr);
463 }
464 } else {
465 *partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
466 soff, dbaddr: dbuf_addr, len: tmplen, do_csum, initial_sum: *partial_sum,
467 odd_start);
468 }
469 if (dbuf->dba_is_buflet) {
470 VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i],
471 tmplen + buf_off) == 0);
472 } else {
473 dbuf->dba_mbuf[i]->m_len += tmplen;
474 dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen;
475 }
476 soff += tmplen;
477 plen -= tmplen;
478 buf_off = 0;
479 i++;
480 }
481 ASSERT(plen == 0);
482}
483
484/*
485 * Copy (fill) and checksum for packet.
486 * spkt: source IP packet.
487 * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload).
488 * verify_l3: verify IPv4 header checksum.
489 * currm: destination mbuf.
490 * currp: destination skywalk packet.
491 * dbuf: additional destination data buffer(s), used when current destination
492 * packet is out of space.
493 * added: amount of data copied from spkt to the additional buffer.
494 * data_sum: 16-bit folded partial checksum of the copied TCP payload.
495 */
496static bool
497copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen,
498 _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm,
499 struct __kern_buflet *currp, uint16_t *data_csum, int *added)
500{
501 ASSERT(data_csum != NULL);
502
503 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
504 SK_VERB_COPY));
505
506 uint16_t start = 0, csum = 0;
507 uint32_t len = 0;
508 uint32_t l4len;
509 /* soff is only used for packets */
510 uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len;
511 uint32_t data_partial = 0, partial = 0;
512 int32_t curr_oldlen;
513 uint32_t curr_trailing;
514 char *curr_ptr;
515 int32_t curr_len;
516 uint16_t data_off;
517 uint32_t tmplen;
518 boolean_t odd_start = FALSE;
519 bool verify_l4;
520
521 /* One of them must be != NULL, but they can't be both set */
522 VERIFY((currm != NULL || currp != NULL) &&
523 ((currm != NULL) != (currp != NULL)));
524
525 if (currm != NULL) {
526 curr_oldlen = currm->m_len;
527 curr_trailing = (uint32_t)M_TRAILINGSPACE(currm);
528 curr_ptr = mtod(currm, char *) + currm->m_len;
529 curr_len = currm->m_len;
530 } else {
531 curr_oldlen = currp->buf_dlen;
532 curr_trailing = currp->buf_dlim - currp->buf_doff -
533 currp->buf_dlen;
534 curr_ptr = (char *)(currp->buf_addr + currp->buf_doff +
535 currp->buf_dlen);
536 curr_len = currp->buf_dlen;
537 }
538
539 /* Verify checksum only for IPv4 */
540 len = spkt->pkt_flow_ip_hlen;
541 verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt));
542 if (verify_l3) {
543 if (PKT_IS_TRUNC_MBUF(spkt)) {
544 partial = os_cpu_in_cksum_mbuf(m: spkt->pkt_mbuf,
545 len, off: 0, initial_sum: 0);
546 } else {
547 partial = pkt_sum(SK_PKT2PH(spkt), soff, len);
548 }
549
550 csum = __packet_fold_sum(sum: partial);
551 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
552 len, csum);
553 spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
554 if ((csum ^ 0xffff) != 0) {
555 /* No need to copy & checkum TCP+payload */
556 return false;
557 } else {
558 spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
559 }
560 }
561
562 verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt);
563
564 /* Copy & verify TCP checksum */
565 start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen;
566 l4len = plen - spkt->pkt_flow_ip_hlen;
567 len = plen - start;
568 if (PKT_IS_TRUNC_MBUF(spkt)) {
569 tmplen = min(a: len, b: curr_trailing);
570 odd_start = FALSE;
571
572 /* First, simple checksum on the TCP header */
573 if (verify_l4) {
574 partial = os_cpu_in_cksum_mbuf(m: spkt->pkt_mbuf,
575 len: spkt->pkt_flow_tcp_hlen, off: spkt->pkt_flow_ip_hlen, initial_sum: 0);
576 }
577
578 /* Now, copy & sum the payload */
579 if (tmplen > 0) {
580 data_partial = m_copydata_sum(m: spkt->pkt_mbuf,
581 off: start, len: tmplen, vp: curr_ptr, initial_sum: 0, odd_start: &odd_start);
582 curr_len += tmplen;
583 }
584 data_off = start + tmplen;
585 } else {
586 tmplen = min(a: len, b: curr_trailing);
587 odd_start = FALSE;
588
589 /* First, simple checksum on the TCP header */
590 if (verify_l4) {
591 partial = pkt_sum(SK_PKT2PH(spkt), (soff +
592 spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen);
593 }
594
595 /* Now, copy & sum the payload */
596 if (tmplen > 0) {
597 data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
598 soff: (soff + start), dbaddr: (uint8_t *)curr_ptr, len: tmplen,
599 true, initial_sum: 0, odd_start: &odd_start);
600 curr_len += tmplen;
601 }
602 data_off = soff + start + tmplen;
603 }
604
605 /* copy & sum remaining payload in additional buffers */
606 if ((len - tmplen) > 0) {
607 ASSERT(dbuf != NULL);
608 _copy_data_sum_dbuf(spkt, soff: data_off, plen: (len - tmplen),
609 partial_sum: &data_partial, odd_start: &odd_start, dbuf, true);
610 *added = (len - tmplen);
611 }
612
613 /* Fold data checksum to 16 bit */
614 *data_csum = __packet_fold_sum(sum: data_partial);
615
616 if (currm != NULL) {
617 currm->m_len = curr_len;
618 } else {
619 currp->buf_dlen = curr_len;
620 }
621
622 if (verify_l4) {
623 /* Fold in the data checksum to TCP checksum */
624 partial += *data_csum;
625 partial += htons(l4len + IPPROTO_TCP);
626 if (spkt->pkt_flow_ip_ver == IPVERSION) {
627 csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr,
628 spkt->pkt_flow_ipv4_dst.s_addr, partial);
629 } else {
630 ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION);
631 csum = in6_pseudo(&spkt->pkt_flow_ipv6_src,
632 &spkt->pkt_flow_ipv6_dst, partial);
633 }
634 /* pkt metadata will be transfer to super packet */
635 __packet_set_inet_checksum(SK_PKT2PH(spkt),
636 PACKET_CSUM_RX_FULL_FLAGS, start: 0, stuff_val: csum, false);
637 } else {
638 /* grab csum value from offload */
639 csum = spkt->pkt_csum_rx_value;
640 }
641
642 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
643 start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum));
644
645 if ((csum ^ 0xffff) != 0) {
646 /*
647 * Revert whatever we did here!
648 * currm/currp should be restored to previous value.
649 * dbuf (for additional payload) should be restore to 0.
650 */
651 if (currm != NULL) {
652 currm->m_len = curr_oldlen;
653 } else {
654 currp->buf_dlen = curr_oldlen;
655 }
656 if (dbuf != NULL) {
657 for (int i = 0; i < dbuf->dba_num_dbufs; i++) {
658 if (dbuf->dba_is_buflet) {
659 struct __kern_buflet *b = dbuf->dba_buflet[i];
660 kern_buflet_set_data_length(b, 0);
661 kern_buflet_set_data_offset(b, 0);
662 } else {
663 struct mbuf *m = dbuf->dba_mbuf[i];
664 m->m_len = m->m_pkthdr.len = 0;
665 }
666 }
667 }
668
669 return false;
670 }
671
672 return true;
673}
674
675/*
676 * Copy and checksum for packet or packet with mbuf
677 * data_csum is only supported for bsd flows
678 */
679static bool
680copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf,
681 uint16_t *data_csum, bool verify_l3)
682{
683 /*
684 * To keep this routine simple and optimal, we are asserting on the
685 * assumption that the smallest flowswitch packet pool buffer should
686 * be large enough to hold the IP and TCP headers in the first buflet.
687 */
688 _CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY);
689
690 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX |
691 (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY)));
692
693 uint16_t start = 0, csum = 0;
694 uint32_t len = 0;
695 /* soff is only used for packets */
696 uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len;
697 uint32_t data_partial = 0, partial = 0;
698 boolean_t odd_start = false;
699 uint32_t data_len;
700 uint16_t dbuf_off;
701 uint16_t copied_len = 0;
702 bool l3_csum_ok;
703 uint8_t *daddr;
704
705 if (dbuf->dba_is_buflet) {
706 daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]);
707 daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]);
708 } else {
709 daddr = mtod(dbuf->dba_mbuf[0], uint8_t *);
710 daddr += dbuf->dba_mbuf[0]->m_len;
711 /*
712 * available space check for payload is done later
713 * in _copy_data_sum_dbuf
714 */
715 ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >=
716 pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen);
717 }
718
719 if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) {
720 /* copy only */
721 _copy_data_sum_dbuf(spkt: pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0: soff,
722 plen, partial_sum: &partial, odd_start: &odd_start, dbuf, false);
723 if (PKT_IS_MBUF(pkt)) {
724 csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val;
725 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
726 pkt->pkt_mbuf->m_pkthdr.csum_flags,
727 pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum);
728 } else {
729 csum = pkt->pkt_csum_rx_value;
730 SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x",
731 pkt->pkt_csum_flags,
732 pkt->pkt_csum_rx_start_off, csum);
733 }
734
735 /* pkt metadata will be transfer to super packet */
736 __packet_set_inet_checksum(SK_PKT2PH(pkt),
737 PACKET_CSUM_RX_FULL_FLAGS, start: 0, stuff_val: csum, false);
738 if ((csum ^ 0xffff) == 0) {
739 return true;
740 } else {
741 return false;
742 }
743 }
744
745 /* Copy l3 & verify checksum only for IPv4 */
746 start = 0;
747 len = pkt->pkt_flow_ip_hlen;
748 if (PKT_IS_TRUNC_MBUF(pkt)) {
749 partial = m_copydata_sum(m: pkt->pkt_mbuf, off: start, len,
750 vp: (daddr + start), initial_sum: 0, NULL);
751 } else {
752 partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
753 dbaddr: (daddr + start), len, true, initial_sum: 0, NULL);
754 }
755 verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt));
756 l3_csum_ok = !verify_l3;
757 if (verify_l3) {
758 csum = __packet_fold_sum(sum: partial);
759 SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
760 start, len, csum);
761 pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED;
762 if ((csum ^ 0xffff) != 0) {
763 /* proceed to copy the rest of packet */
764 } else {
765 pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID;
766 l3_csum_ok = true;
767 }
768 }
769 copied_len += pkt->pkt_flow_ip_hlen;
770
771 /* Copy & verify TCP checksum */
772 start = pkt->pkt_flow_ip_hlen;
773 len = plen - start;
774
775 if (PKT_IS_TRUNC_MBUF(pkt)) {
776 /* First, copy and sum TCP header */
777 partial = m_copydata_sum(m: pkt->pkt_mbuf, off: start,
778 len: pkt->pkt_flow_tcp_hlen, vp: (daddr + start), initial_sum: 0, NULL);
779
780 data_len = len - pkt->pkt_flow_tcp_hlen;
781 start += pkt->pkt_flow_tcp_hlen;
782 dbuf_off = start;
783 /* Next, copy and sum payload (if any) */
784 } else {
785 /* First, copy and sum TCP header */
786 partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff: (soff + start),
787 dbaddr: (daddr + start), len: pkt->pkt_flow_tcp_hlen, true, initial_sum: 0, NULL);
788
789 data_len = len - pkt->pkt_flow_tcp_hlen;
790 start += pkt->pkt_flow_tcp_hlen;
791 dbuf_off = start;
792 start += soff;
793 }
794 copied_len += pkt->pkt_flow_tcp_hlen;
795
796 if (dbuf->dba_is_buflet) {
797 VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0],
798 kern_buflet_get_data_length(dbuf->dba_buflet[0]) +
799 copied_len) == 0);
800 } else {
801 dbuf->dba_mbuf[0]->m_len += copied_len;
802 dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len;
803 }
804
805 /* copy and sum payload (if any) */
806 if (data_len > 0) {
807 odd_start = false;
808 _copy_data_sum_dbuf(spkt: pkt, soff: start, plen: data_len, partial_sum: &data_partial,
809 odd_start: &odd_start, dbuf, do_csum: l3_csum_ok);
810 }
811
812 if (__improbable(!l3_csum_ok)) {
813 return false;
814 }
815
816 /* Fold data sum to 16 bit and then into the partial */
817 *data_csum = __packet_fold_sum(sum: data_partial);
818
819 /* Fold in the data checksum to TCP checksum */
820 partial += *data_csum;
821
822 partial += htons(len + IPPROTO_TCP);
823 if (pkt->pkt_flow_ip_ver == IPVERSION) {
824 csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
825 pkt->pkt_flow_ipv4_dst.s_addr, partial);
826 } else {
827 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
828 csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
829 &pkt->pkt_flow_ipv6_dst, partial);
830 }
831
832 SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)",
833 pkt->pkt_flow_ip_hlen, len, csum);
834
835 /* pkt metadata will be transfer to super packet */
836 __packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
837 start: 0, stuff_val: csum, false);
838 if ((csum ^ 0xffff) != 0) {
839 return false;
840 }
841
842 return true;
843}
844
845SK_INLINE_ATTRIBUTE
846static void
847flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa,
848 struct __kern_packet *pkt)
849{
850 struct ifnet *ifp;
851
852 switch (pkt->pkt_flow_ip_ver) {
853 case IPVERSION:
854 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) {
855 return;
856 }
857 break;
858 case IPV6_VERSION:
859 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
860 return;
861 }
862 break;
863 default:
864 VERIFY(0);
865 /* NOTREACHED */
866 __builtin_unreachable();
867 }
868
869 fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen;
870 fa->fa_ulen = pkt->pkt_flow_ulen;
871 fa->fa_total = pkt->pkt_flow_ip_hlen +
872 pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen;
873
874 ifp = fsw->fsw_ifp;
875 ASSERT(ifp != NULL);
876 if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) {
877 /* in case hardware supports LRO, don't fix checksum in the header */
878 fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op;
879 } else {
880 fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum;
881 }
882}
883
884static void
885flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa,
886 struct mbuf *smbuf, struct __kern_packet *pkt)
887{
888 FLOW_AGG_CLEAR(fa);
889
890 ASSERT(smbuf != NULL);
891 fa->fa_smbuf = smbuf;
892
893 fa->fa_sptr = mtod(smbuf, uint8_t *);
894 ASSERT(fa->fa_sptr != NULL);
895
896 /*
897 * Note here we use 'pkt' instead of 'smbuf', since we rely on the
898 * contents of the flow structure which don't exist in 'smbuf'.
899 */
900 flow_agg_init_common(fsw, fa, pkt);
901}
902
903static void
904flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa,
905 struct __kern_packet *spkt, struct __kern_packet *pkt)
906{
907 FLOW_AGG_CLEAR(fa);
908
909 ASSERT(spkt != NULL);
910 fa->fa_spkt = spkt;
911 fa->fa_sobj_is_pkt = true;
912 VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0);
913
914 MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr);
915 ASSERT(fa->fa_sptr != NULL);
916
917 /*
918 * Note here we use 'pkt' instead of 'spkt', since we rely on the
919 * contents of the flow structure which don't exist in 'spkt'.
920 */
921 flow_agg_init_common(fsw, fa, pkt);
922}
923
924SK_INLINE_ATTRIBUTE
925static bool
926ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
927{
928 return sk_memcmp_mask_64B(src1: h1, src2: h2, byte_mask: (const uint8_t *)&ip_tcp_mask) == 0;
929}
930
931SK_INLINE_ATTRIBUTE
932static bool
933ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2)
934{
935 return sk_memcmp_mask_80B(src1: h1, src2: h2, byte_mask: (const uint8_t *)&ip6_tcp_mask) == 0;
936}
937
938SK_INLINE_ATTRIBUTE
939static bool
940can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt,
941 struct fsw_stats *fsws)
942{
943 bool match;
944
945 ASSERT(fa->fa_sptr != NULL);
946 _CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE);
947 _CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE);
948
949 if (__improbable(pkt->pkt_length < MASK_SIZE)) {
950 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP);
951 goto slow_path;
952 }
953
954 if (__improbable(fa->fa_sobj_is_short)) {
955 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF);
956 goto slow_path;
957 }
958
959 if (__improbable(pkt->pkt_flow_tcp_hlen !=
960 (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) {
961 goto slow_path;
962 }
963
964 switch (pkt->pkt_flow_ip_ver) {
965 case IPVERSION:
966 match = ipv4_tcp_memcmp(h1: fa->fa_sptr,
967 h2: (uint8_t *)pkt->pkt_flow_ip_hdr);
968 break;
969 case IPV6_VERSION:
970 match = ipv6_tcp_memcmp(h1: fa->fa_sptr,
971 h2: (uint8_t *)pkt->pkt_flow_ip_hdr);
972 break;
973 default:
974 VERIFY(0);
975 /* NOTREACHED */
976 __builtin_unreachable();
977 }
978
979 if (__improbable(!match)) {
980 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP);
981 goto slow_path;
982 }
983 if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) {
984 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP);
985 goto slow_path;
986 }
987
988 STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP);
989 fa->fa_tcp_seq += pkt->pkt_flow_ulen;
990 fa->fa_ulen = pkt->pkt_flow_ulen;
991 return true;
992
993slow_path:
994 return false;
995}
996
997SK_NO_INLINE_ATTRIBUTE
998static bool
999can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt,
1000 struct fsw_stats *fsws)
1001{
1002 uint8_t *sl3_hdr = fa->fa_sptr;
1003 uint32_t sl3tlen = 0;
1004 uint16_t sl3hlen = 0;
1005
1006 DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt,
1007 uint8_t *, sl3_hdr);
1008
1009 ASSERT(sl3_hdr != NULL);
1010
1011 /*
1012 * Compare IP header length, TOS, frag flags and IP options
1013 * For IPv4, the options should match exactly
1014 * For IPv6, if options are present, bail out
1015 */
1016 if (pkt->pkt_flow_ip_ver == IPVERSION) {
1017 struct ip *siph = (struct ip *)(void *)sl3_hdr;
1018 struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr;
1019
1020 ASSERT(siph->ip_v == IPVERSION);
1021 /* 16-bit alignment is sufficient (handles mbuf case) */
1022 ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
1023 ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t)));
1024
1025 sl3hlen = (siph->ip_hl << 2);
1026 if (sl3hlen != pkt->pkt_flow_ip_hlen) {
1027 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
1028 DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t,
1029 pkt->pkt_flow_ip_hlen);
1030 return false;
1031 }
1032
1033 if (siph->ip_ttl != iph->ip_ttl) {
1034 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
1035 DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl,
1036 uint8_t, iph->ip_ttl);
1037 return false;
1038 }
1039
1040 if (siph->ip_tos != iph->ip_tos) {
1041 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
1042 DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos,
1043 uint8_t, iph->ip_tos);
1044 return false;
1045 }
1046 /* For IPv4, DF bit should match */
1047 if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) !=
1048 (ntohs(iph->ip_off) & (IP_DF | IP_RF))) {
1049 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP);
1050 DTRACE_SKYWALK2(aggr__fail5, uint16_t,
1051 ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off));
1052 return false;
1053 }
1054
1055 uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen -
1056 sizeof(struct ip);
1057 if (ip_opts_len > 0 &&
1058 memcmp(s1: (uint8_t *)(siph + 1), s2: (uint8_t *)(iph + 1),
1059 n: ip_opts_len) != 0) {
1060 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP);
1061 DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len,
1062 uint8_t *, (uint8_t *)(siph + 1), uint8_t *,
1063 (uint8_t *)(iph + 1));
1064 return false;
1065 }
1066 sl3tlen = ntohs(siph->ip_len);
1067 } else {
1068 struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
1069 struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr;
1070
1071 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
1072 ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
1073 /* 16-bit alignment is sufficient (handles mbuf case) */
1074 ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
1075
1076 if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) {
1077 /*
1078 * Don't aggregate if extension header is present in
1079 * packet. N.B. currently flow switch only classifies
1080 * frag header
1081 */
1082 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP);
1083 DTRACE_SKYWALK1(aggr__fail7, uint8_t,
1084 pkt->pkt_flow_ip_hlen);
1085 return false;
1086 }
1087
1088 sl3hlen = sizeof(struct ip6_hdr);
1089 /* For IPv6, flow info mask covers TOS and flow label */
1090 if (memcmp(s1: &sip6->ip6_flow, s2: &ip6->ip6_flow,
1091 n: sizeof(sip6->ip6_flow)) != 0) {
1092 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP);
1093 DTRACE_SKYWALK2(aggr__fail8, uint32_t,
1094 ntohl(sip6->ip6_flow), uint32_t,
1095 ntohl(ip6->ip6_flow));
1096 return false;
1097 }
1098
1099 if (sip6->ip6_hlim != ip6->ip6_hlim) {
1100 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP);
1101 DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim,
1102 uint8_t, ip6->ip6_hlim);
1103 return false;
1104 }
1105
1106 sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen));
1107 }
1108
1109 /*
1110 * For TCP header, compare ACK number and window size
1111 * Compare TCP flags
1112 * Compare TCP header length and TCP options
1113 */
1114 struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen);
1115 struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
1116
1117 uint16_t sl4hlen = (stcp->th_off << 2);
1118 if (memcmp(s1: &stcp->th_ack, s2: &tcp->th_ack, n: sizeof(stcp->th_ack)) != 0 ||
1119 memcmp(s1: &stcp->th_win, s2: &tcp->th_win, n: sizeof(stcp->th_win)) != 0) {
1120 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP);
1121 DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack),
1122 uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win),
1123 uint16_t, ntohs(tcp->th_win));
1124 return false;
1125 }
1126
1127 if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) {
1128 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP);
1129 DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags,
1130 uint8_t, tcp->th_flags);
1131 return false;
1132 }
1133
1134 if (sl4hlen != pkt->pkt_flow_tcp_hlen) {
1135 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP);
1136 DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen,
1137 uint8_t, pkt->pkt_flow_tcp_hlen);
1138 return false;
1139 }
1140
1141 uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr);
1142 /*
1143 * We know that the TCP-option lengthes are the same thanks to the above
1144 * sl4hlen check
1145 */
1146 if (tcp_opts_len > 0 && memcmp(s1: (uint8_t *)(stcp + 1),
1147 s2: (uint8_t *)(tcp + 1), n: tcp_opts_len) != 0) {
1148 /*
1149 * Fast-path header prediction:
1150 *
1151 * TCP Timestamp option is usually put after two NOP-headers,
1152 * and thus total TCP-option length is 12. If that's the case,
1153 * we can aggregate as only the TCP time-stamp option differs.
1154 */
1155 if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) {
1156 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP);
1157 DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len);
1158 return false;
1159 } else {
1160 uint32_t sts_hdr, ts_hdr;
1161 if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) {
1162 sts_hdr = *((uint32_t *)(stcp + 1));
1163 } else {
1164 bcopy(src: stcp + 1, dst: &sts_hdr, n: sizeof(sts_hdr));
1165 }
1166 if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) {
1167 ts_hdr = *((uint32_t *)(tcp + 1));
1168 } else {
1169 bcopy(src: tcp + 1, dst: &ts_hdr, n: sizeof(ts_hdr));
1170 }
1171
1172 if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) ||
1173 ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) {
1174 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP);
1175 DTRACE_SKYWALK2(aggr__fail14, uint32_t,
1176 sts_hdr, uint32_t, ts_hdr);
1177 return false;
1178 }
1179 }
1180 }
1181 STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP);
1182 fa->fa_tcp_seq += pkt->pkt_flow_ulen;
1183 fa->fa_ulen = pkt->pkt_flow_ulen;
1184 return true;
1185}
1186
1187static bool
1188flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt,
1189 struct fsw_stats *fsws)
1190{
1191 /* Shouldn't exceed the ip_len beyond MIN(custom ip_len, 64K) */
1192 const uint32_t max_ip_len = MAX_AGG_IP_LEN();
1193 bool can_agg = false;
1194
1195 DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa,
1196 struct __kern_packet *, pkt);
1197
1198 ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1199 if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) {
1200 pkt->pkt_flow_tcp_agg_fast = 0;
1201 }
1202 /*
1203 * Don't aggregate if any of the following is true:
1204 * 1. TCP flag is other than TH_{ACK,PUSH}
1205 * 2. Payload length is 0 (pure ACK)
1206 * 3. This is the first packet
1207 * 4. TCP sequence number is not expected
1208 * 5. We would've exceeded the maximum aggregated size
1209 * 6. It's not the first packet and the wake flag is set
1210 */
1211 if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 ||
1212 pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) {
1213 DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt);
1214 goto done;
1215 }
1216 if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) {
1217 DTRACE_SKYWALK2(aggr__fail1b, uint32_t,
1218 ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq);
1219 STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP);
1220 goto done;
1221 }
1222 if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) {
1223 DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total,
1224 uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len);
1225 /* We've reached aggregation limit */
1226 STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT);
1227 goto done;
1228 }
1229 if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) {
1230 DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt);
1231 goto done;
1232 }
1233
1234 can_agg = can_agg_fastpath(fa, pkt, fsws);
1235 if (can_agg) {
1236 pkt->pkt_flow_tcp_agg_fast = 1;
1237 goto done;
1238 }
1239
1240 can_agg = can_agg_slowpath(fa, pkt, fsws);
1241 ASSERT(!pkt->pkt_flow_tcp_agg_fast);
1242
1243done:
1244 return can_agg;
1245}
1246
1247static uint16_t
1248flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new)
1249{
1250 return __packet_fix_sum(csum, old, new);
1251}
1252
1253static uint16_t
1254flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old,
1255 uint16_t __unused new)
1256{
1257 return 0;
1258}
1259
1260static inline void
1261flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa, uint8_t *field, uint16_t *csum,
1262 uint32_t new)
1263{
1264 uint32_t old;
1265 memcpy(dst: &old, src: field, n: sizeof(old));
1266 memcpy(dst: field, src: &new, n: sizeof(uint32_t));
1267 *csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum,
1268 (uint16_t)(old >> 16), (uint16_t)(new >> 16)),
1269 (uint16_t)(old & 0xffff),
1270 (uint16_t)(new & 0xffff));
1271}
1272
1273static void
1274flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt,
1275 __unused uint16_t data_csum, struct fsw_stats *fsws)
1276{
1277 struct tcphdr *stcp, *tcp;
1278 uint8_t *l3hdr, l3hlen;
1279 uint16_t old_l3len = 0;
1280 uint8_t result;
1281
1282 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1283
1284 /*
1285 * The packet being merged should always have full checksum flags
1286 * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed
1287 * and not enter this function.
1288 */
1289 ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt));
1290 ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0);
1291
1292 ASSERT(fa->fa_sobj != NULL);
1293 ASSERT(!fa->fa_sobj_is_pkt ||
1294 (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0));
1295 uint8_t *sl3_hdr = fa->fa_sptr;
1296 ASSERT(sl3_hdr != NULL);
1297 ASSERT(fa->fa_fix_pkt_sum != NULL);
1298
1299 fa->fa_total += pkt->pkt_flow_ulen;
1300
1301 /*
1302 * Update the IP header as:
1303 * 1. Set the IP ID (IPv4 only) to that of the new packet
1304 * 2. Set the ttl to the lowest of the two
1305 * 3. Increment the IP length by the payload length of new packet
1306 * 4. Leave the IP (IPv4 only) checksum as is
1307 * Update the resp. flow classification fields, if any
1308 * Nothing to update for TCP header for now
1309 */
1310 if (pkt->pkt_flow_ip_ver == IPVERSION) {
1311 struct ip *siph = (struct ip *)(void *)sl3_hdr;
1312
1313 /* 16-bit alignment is sufficient (handles mbuf case) */
1314 ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t)));
1315
1316 l3hdr = (uint8_t *)siph;
1317 l3hlen = siph->ip_hl << 2;
1318
1319 old_l3len = ntohs(siph->ip_len);
1320 uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen;
1321 siph->ip_len = htons(l3tlen);
1322 siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0,
1323 htons(pkt->pkt_flow_ulen));
1324
1325 SK_DF(logflags, "Agg IP len %u", ntohs(siph->ip_len));
1326 } else {
1327 struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr;
1328
1329 /* 16-bit alignment is sufficient (handles mbuf case) */
1330 ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t)));
1331 ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION);
1332 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
1333
1334 l3hdr = (uint8_t *)sip6;
1335 l3hlen = sizeof(struct ip6_hdr);
1336
1337 /* No extension headers should be present */
1338 ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr));
1339
1340 old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr);
1341 uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen;
1342 sip6->ip6_plen = htons(l3plen);
1343
1344 SK_DF(logflags, "Agg IP6 len %u", ntohs(sip6->ip6_plen));
1345 }
1346
1347 if (__probable(pkt->pkt_flow_tcp_agg_fast)) {
1348 STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP);
1349 } else {
1350 STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP);
1351 }
1352
1353 stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen);
1354 tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
1355 /* 16-bit alignment is sufficient (handles mbuf case) */
1356 ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t)));
1357 ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t)));
1358
1359 /*
1360 * If it is bigger, that means there are TCP-options that need to be
1361 * copied over.
1362 */
1363 if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) ||
1364 (stcp->th_flags & TH_PUSH) == 0) {
1365 VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen);
1366 if (__improbable(!pkt->pkt_flow_tcp_agg_fast &&
1367 memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen -
1368 sizeof(struct tcphdr))) != 0)) {
1369 uint8_t *sopt = (uint8_t *)(stcp + 1);
1370 uint8_t *opt = (uint8_t *)(tcp + 1);
1371
1372 uint32_t ntsval, ntsecr;
1373 bcopy(src: (void *)(opt + 4), dst: &ntsval, n: sizeof(ntsval));
1374 bcopy(src: (void *)(opt + 8), dst: &ntsecr, n: sizeof(ntsecr));
1375
1376 flow_agg_pkt_fix_hdr_sum(fa, field: sopt + 4, csum: &stcp->th_sum, new: ntsval);
1377 flow_agg_pkt_fix_hdr_sum(fa, field: sopt + 8, csum: &stcp->th_sum, new: ntsecr);
1378
1379 STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP);
1380 } else {
1381 STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP);
1382 }
1383
1384 if ((stcp->th_flags & TH_PUSH) == 0 &&
1385 (tcp->th_flags & TH_PUSH) != 0) {
1386 uint16_t old, new;
1387 old = *(uint16_t *)(void *)(&stcp->th_ack + 1);
1388 /* If the new segment has a PUSH-flag, append it! */
1389 stcp->th_flags |= tcp->th_flags & TH_PUSH;
1390 new = *(uint16_t *)(void *)(&stcp->th_ack + 1);
1391 stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new);
1392 }
1393 }
1394
1395 /* Update pseudo header checksum */
1396 stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
1397 htons(pkt->pkt_flow_ulen));
1398
1399 /* Update data checksum */
1400 if (__improbable(old_l3len & 0x1)) {
1401 /* swap the byte order, refer to rfc 1071 section 2 */
1402 stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0,
1403 ntohs(data_csum));
1404 } else {
1405 stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum);
1406 }
1407
1408 if (fa->fa_sobj_is_pkt) {
1409 struct __kern_packet *spkt = fa->fa_spkt;
1410 spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1411 spkt->pkt_flow_ulen += pkt->pkt_flow_ulen;
1412 /*
1413 * Super packet length includes L3 and L4
1414 * header length for first packet only.
1415 */
1416 spkt->pkt_length += pkt->pkt_flow_ulen;
1417 if (spkt->pkt_seg_cnt == 0) {
1418 /* First time we append packets, need to set it to 1 */
1419 spkt->pkt_seg_cnt = 1;
1420 }
1421 _CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt));
1422 if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) {
1423 spkt->pkt_seg_cnt = result;
1424 }
1425 SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x",
1426 spkt->pkt_length, ntohs(stcp->th_sum));
1427 } else {
1428 struct mbuf *smbuf = fa->fa_smbuf;
1429 smbuf->m_pkthdr.len += pkt->pkt_flow_ulen;
1430 if (smbuf->m_pkthdr.seg_cnt == 0) {
1431 /* First time we append packets, need to set it to 1 */
1432 smbuf->m_pkthdr.seg_cnt = 1;
1433 }
1434 _CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt));
1435 if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) {
1436 smbuf->m_pkthdr.seg_cnt = result;
1437 }
1438 SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x",
1439 smbuf->m_pkthdr.len, ntohs(stcp->th_sum));
1440 }
1441}
1442
1443/*
1444 * Copy metadata from source packet to destination packet
1445 */
1446static void
1447pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
1448{
1449 /* Copy packet metadata */
1450 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1451 _PKT_COPY(spkt, dpkt);
1452}
1453
1454static void
1455pkt_finalize(kern_packet_t ph)
1456{
1457 int err = __packet_finalize(ph);
1458 VERIFY(err == 0);
1459#if (DEVELOPMENT || DEBUG)
1460 struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph);
1461 uint8_t *buf;
1462 MD_BUFLET_ADDR_ABS(pkt, buf);
1463 buf += pkt->pkt_headroom + pkt->pkt_l2_len;
1464 DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt,
1465 uint8_t *, buf);
1466#endif
1467}
1468
1469static inline uint32_t
1470estimate_buf_cnt(struct flow_entry *fe, uint32_t min_bufsize,
1471 uint32_t agg_bufsize)
1472{
1473 uint32_t max_ip_len = MAX_AGG_IP_LEN();
1474 uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize);
1475 uint32_t hdr_overhead;
1476
1477 agg_size = MIN(agg_size, agg_bufsize);
1478
1479 hdr_overhead = (fe->fe_rx_pktq_bytes / max_ip_len) *
1480 (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) +
1481 sizeof(struct tcphdr));
1482
1483 return ((fe->fe_rx_pktq_bytes + hdr_overhead) / agg_size) + 1;
1484}
1485
1486SK_INLINE_ATTRIBUTE
1487static inline void
1488_append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf,
1489 _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf)
1490{
1491 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1492 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1493 VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0);
1494 pbuf = buf;
1495 dbuf_array->dba_buflet[i] = NULL;
1496 }
1497 ASSERT(pbuf != NULL);
1498 dbuf_array->dba_num_dbufs = 0;
1499 *lbuf = pbuf;
1500}
1501
1502SK_INLINE_ATTRIBUTE
1503static inline void
1504_free_dbuf_array(struct kern_pbufpool *pp,
1505 _dbuf_array_t *dbuf_array)
1506{
1507 for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) {
1508 kern_buflet_t buf = dbuf_array->dba_buflet[i];
1509 pp_free_buflet(pp, buf);
1510 dbuf_array->dba_buflet[i] = NULL;
1511 }
1512 dbuf_array->dba_num_dbufs = 0;
1513}
1514
1515static inline void
1516finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph,
1517 struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts,
1518 uint16_t bufcnt)
1519{
1520 (*spkts)++;
1521 if (bufcnt > 1) {
1522 (*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1523 }
1524 pkt_finalize(ph: *sph);
1525 if ((*spkt)->pkt_length > *largest_spkt) {
1526 *largest_spkt = (*spkt)->pkt_length;
1527 }
1528 pkt_agg_log(*spkt, kernproc, false);
1529 DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt);
1530 *sph = 0;
1531 *spkt = NULL;
1532 FLOW_AGG_CLEAR(fa);
1533}
1534
1535static inline void
1536converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size)
1537{
1538 if (fe->fe_rx_largest_size > largest_agg_size) {
1539 /*
1540 * Make it slowly move towards largest_agg_size if we
1541 * consistently get non-aggregatable size.
1542 *
1543 * If we start at 16K, this makes us go to 4K within 6 rounds
1544 * and down to 2K within 12 rounds.
1545 */
1546 fe->fe_rx_largest_size -=
1547 ((fe->fe_rx_largest_size - largest_agg_size) >> 2);
1548 } else {
1549 fe->fe_rx_largest_size +=
1550 ((largest_agg_size - fe->fe_rx_largest_size) >> 2);
1551 }
1552}
1553
1554SK_NO_INLINE_ATTRIBUTE
1555static void
1556flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe,
1557 struct pktq *dropped_pkts, bool is_mbuf)
1558{
1559#define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt) do { \
1560 KPKTQ_ENQUEUE(dropped_pkts, (_pkt)); \
1561 (_pkt) = NULL; \
1562 FLOW_AGG_CLEAR(&fa); \
1563 prev_csum_ok = false; \
1564} while (0)
1565 struct flow_agg fa; /* states */
1566 FLOW_AGG_CLEAR(&fa);
1567
1568 struct pktq pkts; /* dst super packets */
1569 struct pktq disposed_pkts; /* done src packets */
1570
1571 KPKTQ_INIT(&pkts);
1572 KPKTQ_INIT(&disposed_pkts);
1573
1574 struct __kern_channel_ring *ring;
1575 ring = fsw_flow_get_rx_ring(fsw, fe);
1576 if (__improbable(ring == NULL)) {
1577 SK_ERR("Rx ring is NULL");
1578 KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq);
1579 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID,
1580 KPKTQ_LEN(dropped_pkts));
1581 return;
1582 }
1583 struct kern_pbufpool *dpp = ring->ckr_pp;
1584 ASSERT(dpp->pp_max_frags > 1);
1585
1586 struct __kern_packet *pkt, *tpkt;
1587 /* state for super packet */
1588 struct __kern_packet *spkt = NULL;
1589 kern_packet_t sph = 0;
1590 kern_buflet_t sbuf = NULL;
1591 bool prev_csum_ok = false, csum_ok, agg_ok;
1592 uint16_t spkts = 0, bufcnt = 0;
1593 int err;
1594
1595 struct fsw_stats *fsws = &fsw->fsw_stats;
1596
1597 /* state for buflet batch alloc */
1598 uint32_t bh_cnt, bh_cnt_tmp;
1599 uint64_t buf_arr[MAX_BUFLET_COUNT];
1600 _dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0};
1601 uint32_t largest_spkt = 0; /* largest aggregated packet size */
1602 uint32_t agg_bufsize;
1603 uint8_t iter = 0;
1604 bool large_buffer = false;
1605
1606 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1607 SK_DF(logflags, "Rx input queue len %u", KPKTQ_LEN(&fe->fe_rx_pktq));
1608
1609 if (__probable(fe->fe_rx_largest_size != 0 &&
1610 NX_FSW_TCP_RX_AGG_ENABLED())) {
1611 if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) ||
1612 PP_BUF_SIZE_LARGE(dpp) == 0) {
1613 agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1614 } else {
1615 agg_bufsize = PP_BUF_SIZE_LARGE(dpp);
1616 large_buffer = true;
1617 }
1618 bh_cnt = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
1619 agg_bufsize);
1620 DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt);
1621 bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT);
1622 bh_cnt_tmp = bh_cnt;
1623 } else {
1624 /*
1625 * No payload, thus it's all small-sized ACKs/...
1626 * OR aggregation is disabled.
1627 */
1628 agg_bufsize = PP_BUF_SIZE_DEF(dpp);
1629 bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(&fe->fe_rx_pktq), MAX_BUFLET_COUNT);
1630 DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt);
1631 }
1632
1633 err = pp_alloc_buflet_batch(pp: dpp, array: buf_arr, size: &bh_cnt, SKMEM_NOSLEEP,
1634 large: large_buffer);
1635 if (__improbable(bh_cnt == 0)) {
1636 SK_ERR("failed to alloc %u buflets (err %d), use slow path",
1637 bh_cnt_tmp, err);
1638 }
1639 bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1640 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1641 if (tpkt != NULL) {
1642 void *baddr;
1643 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1644 SK_PREFETCH(baddr, 0);
1645 }
1646
1647 ASSERT(pkt->pkt_qum.qum_pp != dpp);
1648 ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
1649 ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
1650 ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
1651 ASSERT(!pkt->pkt_flow_ip_is_frag);
1652 ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
1653
1654 csum_ok = false;
1655 agg_ok = false;
1656 /* supports TCP only */
1657 uint32_t thlen = (pkt->pkt_flow_ip_hlen +
1658 pkt->pkt_flow_tcp_hlen);
1659 uint32_t plen = (thlen + pkt->pkt_flow_ulen);
1660 uint16_t data_csum = 0;
1661
1662 KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1663 fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1664 err = flow_pkt_track(fe, pkt, true);
1665 if (__improbable(err != 0)) {
1666 STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR);
1667 /* if need to trigger RST */
1668 if (err == ENETRESET) {
1669 flow_track_abort_tcp(fe, in_pkt: pkt, NULL);
1670 }
1671 SK_ERR("flow_pkt_track failed (err %d)", err);
1672 __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1673 continue;
1674 }
1675
1676 if (is_mbuf) { /* compat */
1677 m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len);
1678 pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf);
1679 if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
1680 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1681 }
1682 }
1683
1684 if (prev_csum_ok && sbuf) {
1685 ASSERT(fa.fa_spkt == spkt);
1686 ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1687 agg_ok = flow_agg_is_ok(fa: &fa, pkt, fsws);
1688 agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags);
1689
1690 if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff -
1691 sbuf->buf_dlen >= plen - thlen) {
1692 /*
1693 * No need for a new packet, just
1694 * append to curr_m.
1695 */
1696 csum_ok = copy_pkt_csum_packed(spkt: pkt, plen, NULL,
1697 verify_l3: is_ipv4, NULL, currp: sbuf, data_csum: &data_csum, NULL);
1698
1699 if (!csum_ok) {
1700 STATS_INC(fsws,
1701 FSW_STATS_RX_AGG_BAD_CSUM);
1702 SK_ERR("Checksum for aggregation "
1703 "is wrong");
1704 DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1);
1705 /*
1706 * Turns out, checksum is wrong!
1707 * Fallback to no-agg mode.
1708 */
1709 agg_ok = false;
1710 } else {
1711 flow_agg_merge_hdr(fa: &fa, pkt,
1712 data_csum, fsws);
1713 goto next;
1714 }
1715 }
1716 }
1717
1718 /* calculate number of buflets required */
1719 bh_cnt_tmp = howmany(plen, agg_bufsize);
1720 if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) {
1721 STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1722 SK_ERR("packet too big: bufcnt %d len %d", bh_cnt_tmp,
1723 plen);
1724 __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1725 continue;
1726 }
1727 if (bh_cnt < bh_cnt_tmp) {
1728 uint32_t tmp;
1729
1730 if (iter != 0) {
1731 /*
1732 * rearrange the array for additional
1733 * allocation
1734 */
1735 uint8_t i;
1736 for (i = 0; i < bh_cnt; i++, iter++) {
1737 buf_arr[i] = buf_arr[iter];
1738 buf_arr[iter] = 0;
1739 }
1740 iter = 0;
1741 }
1742 tmp = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp),
1743 agg_bufsize);
1744 tmp = MIN(tmp, MAX_BUFLET_COUNT);
1745 tmp = MAX(tmp, bh_cnt_tmp);
1746 tmp -= bh_cnt;
1747 ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt));
1748 DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp);
1749 err = pp_alloc_buflet_batch(pp: dpp, array: &buf_arr[bh_cnt],
1750 size: &tmp, SKMEM_NOSLEEP, large: large_buffer);
1751 bh_cnt += tmp;
1752 if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) {
1753 STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1754 SK_ERR("buflet alloc failed (err %d)", err);
1755 __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1756 continue;
1757 }
1758 }
1759 /* Use pre-allocated buflets */
1760 ASSERT(bh_cnt >= bh_cnt_tmp);
1761 dbuf_array.dba_num_dbufs = bh_cnt_tmp;
1762 while (bh_cnt_tmp-- > 0) {
1763 dbuf_array.dba_buflet[bh_cnt_tmp] =
1764 (kern_buflet_t)(buf_arr[iter]);
1765 buf_arr[iter] = 0;
1766 bh_cnt--;
1767 iter++;
1768 }
1769 /* copy and checksum TCP data */
1770 if (agg_ok) {
1771 int added = 0;
1772 ASSERT(dbuf_array.dba_num_dbufs != 0);
1773 csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
1774 is_ipv4, NULL, sbuf, &data_csum, &added);
1775
1776 if (__improbable(!csum_ok)) {
1777 STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1778 SK_ERR("Checksum for aggregation on new "
1779 "packet is wrong");
1780 DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2);
1781 agg_ok = false;
1782 /* reset the used buflets */
1783 uint8_t j;
1784 for (j = 0; j < dbuf_array.dba_num_dbufs; j++) {
1785 VERIFY(kern_buflet_set_data_length(
1786 dbuf_array.dba_buflet[j], 0) == 0);
1787 }
1788 goto non_agg;
1789 }
1790
1791 /*
1792 * There was not enough space in the current buflet,
1793 * thus we must have added to the new buflets.
1794 */
1795 VERIFY(added > 0);
1796 } else {
1797non_agg:
1798 ASSERT(dbuf_array.dba_num_dbufs != 0);
1799 csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
1800 &data_csum, is_ipv4);
1801 if (__improbable(!csum_ok)) {
1802 STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
1803 SK_ERR("%d incorrect csum", __LINE__);
1804 DTRACE_SKYWALK(aggr__chan_tcp_csum_fail);
1805 }
1806 }
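/*
 * The payload now sits in dbuf_array. Either graft it onto the
 * current super packet (agg_ok), or finalize that super packet
 * and start a new one seeded from this packet's metadata.
 */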
1807 if (agg_ok) {
1808 ASSERT(fa.fa_spkt == spkt);
1809 ASSERT(spkt == NULL || fa.fa_sobj_is_pkt);
1810 /* update current packet header */
1811 flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
1812 ASSERT(dbuf_array.dba_num_dbufs > 0);
1813 bufcnt += dbuf_array.dba_num_dbufs;
1814 _append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1815 &sbuf);
1816 } else {
1817 /* Finalize the current super packet */
1818 if (sph != 0) {
1819 finalize_super_packet(&spkt, &sph, &fa,
1820 &largest_spkt, &spkts, bufcnt);
1821 }
1822
1823 /* New super packet */
1824 err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
1825 if (__improbable(err != 0)) {
1826 STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
1827 SK_ERR("packet alloc failed (err %d)", err);
1828 _free_dbuf_array(dpp, &dbuf_array);
1829 __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt);
1830 continue;
1831 }
1832 spkt = SK_PTR_ADDR_KPKT(sph);
1833 pkt_copy_metadata(pkt, spkt);
1834 /* Packet length for super packet starts from L3 */
1835 spkt->pkt_length = plen;
1836 spkt->pkt_flow_ulen = pkt->pkt_flow_ulen;
1837 spkt->pkt_headroom = 0;
1838 spkt->pkt_l2_len = 0;
1839 spkt->pkt_seg_cnt = 1;
1840
1841 ASSERT(dbuf_array.dba_num_dbufs > 0);
1842 bufcnt = dbuf_array.dba_num_dbufs;
1843 sbuf = kern_packet_get_next_buflet(sph, NULL);
1844 _append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
1845 &sbuf);
1846
1847 KPKTQ_ENQUEUE(&pkts, spkt);
1848 _UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid);
1849 _UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1850 spkt->pkt_policy_id = fe->fe_policy_id;
1851 spkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1852 spkt->pkt_transport_protocol =
1853 fe->fe_transport_protocol;
1854 flow_agg_init_spkt(fsw, &fa, spkt, pkt);
1855 }
1856next:
1857 pkt_agg_log(pkt, kernproc, true);
1858 prev_csum_ok = csum_ok;
1859 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1860 }
1861
1862 /* Free unused buflets */
1863 STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt);
1864 while (bh_cnt > 0) {
1865 pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter]));
1866 buf_arr[iter] = 0;
1867 bh_cnt--;
1868 iter++;
1869 }
1870 /* Finalize the last super packet */
1871 if (sph != 0) {
1872 finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
1873 &spkts, bufcnt);
1874 }
1875 converge_aggregation_size(fe, largest_spkt);
1876 DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts);
1877 if (__improbable(is_mbuf)) {
1878 STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts);
1879 } else {
1880 STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts);
1881 }
1882 FLOW_STATS_IN_ADD(fe, spackets, spkts);
1883
1884 KPKTQ_FINI(&fe->fe_rx_pktq);
1885 KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts);
1886 KPKTQ_FINI(&pkts);
1887
1888 fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);
1889
1890 pp_free_pktq(&disposed_pkts);
1891}
1892
1893 /* trim zero-length mbufs from a super mbuf chain */
1894static bool
1895_finalize_smbuf(struct mbuf *smbuf)
1896{
1897 /* the 1st mbuf always contains something, so start with the 2nd one */
1898 struct mbuf *m_chained = smbuf->m_next;
1899 struct mbuf *prev_m = smbuf;
1900 bool freed = false;
1901
1902 while (m_chained != NULL) {
1903 if (m_chained->m_len != 0) {
1904 prev_m = m_chained;
1905 m_chained = m_chained->m_next;
1906 continue;
1907 }
1908 prev_m->m_next = m_chained->m_next;
1909 m_free(m_chained);
1910 m_chained = prev_m->m_next;
1911 freed = true;
1912 }
1913 return freed;
1914}
1915
1916SK_NO_INLINE_ATTRIBUTE
1917static void
1918flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe,
1919 struct pktq *dropped_pkts, bool is_mbuf)
1920{
1921#define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt) do { \
1922 drop_packets++; \
1923 drop_bytes += (_pkt)->pkt_length; \
1924 KPKTQ_ENQUEUE(dropped_pkts, (_pkt)); \
1925 (_pkt) = NULL; \
1926 FLOW_AGG_CLEAR(&fa); \
1927 prev_csum_ok = false; \
1928} while (0)
1929 struct flow_agg fa; /* aggregation state */
1930 FLOW_AGG_CLEAR(&fa);
1931
1932 struct pktq disposed_pkts; /* done src packets */
1933 KPKTQ_INIT(&disposed_pkts);
1934
1935 struct __kern_packet *pkt, *tpkt;
1936 /* points to the first mbuf of the chain */
1937 struct mbuf *m_chain = NULL;
1938 /* super mbuf; ends up pointing at the last mbuf packet of the chain */
1939 struct mbuf *smbuf = NULL, *curr_m = NULL;
1940 bool prev_csum_ok = false, csum_ok, agg_ok;
1941 uint16_t smbufs = 0, smbuf_finalized = 0;
1942 uint32_t bytes = 0, rcvd_ulen = 0;
1943 uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */
1944 uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */
1945 uint32_t largest_smbuf = 0;
1946 int err = 0;
1947
1948 struct fsw_stats *fsws = &fsw->fsw_stats;
1949 bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION);
1950
1951 SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX));
1952
1953 /* state for mbuf batch alloc */
1954 uint32_t mhead_cnt = 0;
1955 uint32_t mhead_bufsize = 0;
1956 struct mbuf * mhead = NULL;
1957
1958 uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len;
1959
1960 SK_DF(logflags, "Rx input queue bytes %u", fe->fe_rx_pktq_bytes);
1961
1962 if (__probable(!is_mbuf)) {
1963 /*
1964 * Batch mbuf alloc is based on
1965 * convert_native_pkt_to_mbuf_chain
1966 */
1967 if (__probable(fe->fe_rx_largest_size != 0 &&
1968 NX_FSW_TCP_RX_AGG_ENABLED())) {
1969 unsigned int num_segs = 1;
1970 int pktq_len = KPKTQ_LEN(&fe->fe_rx_pktq);
1971
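/*
 * Pick the smallest cluster size class (2K/4K/16K) that fits both
 * the flow's recently observed largest aggregate and the mean
 * payload of this batch; anything larger gets two 16K segments
 * per mbuf.
 */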
1972 if (fe->fe_rx_largest_size <= MCLBYTES &&
1973 fe->fe_rx_pktq_bytes / pktq_len <= MCLBYTES) {
1974 mhead_bufsize = MCLBYTES;
1975 } else if (fe->fe_rx_largest_size <= MBIGCLBYTES &&
1976 fe->fe_rx_pktq_bytes / pktq_len <= MBIGCLBYTES) {
1977 mhead_bufsize = MBIGCLBYTES;
1978 } else if (fe->fe_rx_largest_size <= M16KCLBYTES &&
1979 fe->fe_rx_pktq_bytes / pktq_len <= M16KCLBYTES) {
1980 mhead_bufsize = M16KCLBYTES;
1981 } else {
1982 mhead_bufsize = M16KCLBYTES * 2;
1983 num_segs = 2;
1984 }
1985
1986try_again:
1987 if (fe->fe_rx_pktq_bytes != 0) {
1988 mhead_cnt = estimate_buf_cnt(fe, MCLBYTES,
1989 mhead_bufsize);
1990 } else {
1991 /* No payload, thus it's all small-sized ACKs/... */
1992 mhead_bufsize = MHLEN;
1993 mhead_cnt = pktq_len;
1994 }
1995
1996 mhead = m_allocpacket_internal(&mhead_cnt,
1997 mhead_bufsize, &num_segs, M_NOWAIT, 1, 0);
1998
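/*
 * On allocation failure, step down the size ladder
 * (2x16K -> 16K -> 4K -> 2K) and retry.
 */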
1999 if (mhead == NULL) {
2000 if (mhead_bufsize > M16KCLBYTES) {
2001 mhead_bufsize = M16KCLBYTES;
2002 num_segs = 1;
2003 goto try_again;
2004 }
2005
2006 if (mhead_bufsize == M16KCLBYTES) {
2007 mhead_bufsize = MBIGCLBYTES;
2008 goto try_again;
2009 }
2010
2011 if (mhead_bufsize == MBIGCLBYTES) {
2012 mhead_bufsize = MCLBYTES;
2013 goto try_again;
2014 }
2015 }
2016 } else {
2017 mhead = NULL;
2018 mhead_bufsize = mhead_cnt = 0;
2019 }
2020 SK_DF(logflags, "batch alloc'ed %u mbufs of size %u", mhead_cnt,
2021 mhead_bufsize);
2022 }
2023
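/*
 * Main loop: convert each classified TCP packet into an mbuf and
 * either merge its payload into the current super mbuf or start
 * a new one, prefetching the next packet's buffer as we go.
 */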
2024 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
2025 if (tpkt != NULL) {
2026 void *baddr;
2027 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
2028 SK_PREFETCH(baddr, 0);
2029 }
2030
2031 /* Validate l2 len, ip vers, is_mbuf */
2032 ASSERT(pkt->pkt_l2_len == l2len);
2033 ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt)));
2034 ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver);
2035 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
2036 ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0);
2037 ASSERT(!pkt->pkt_flow_ip_is_frag);
2038 ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP);
2039
2040 csum_ok = false;
2041 agg_ok = false;
2042 /*
2043 * Since we only aggregate packets with the same
2044 * header length, leverage the pkt metadata.
2045 */
2046 uint32_t thlen = (pkt->pkt_flow_ip_hlen +
2047 pkt->pkt_flow_tcp_hlen);
2048 uint32_t plen = (thlen + pkt->pkt_flow_ulen);
2049
2050 /*
2051 * Rather than calling flow_pkt_track() for each
2052 * packet here, we accumulate received packet stats
2053 * for the call to flow_track_stats() below. This
2054 * is because flow tracking is a no-op for traffic
2055 * that belongs to the host stack.
2056 */
2057 rcvd_ulen += pkt->pkt_flow_ulen;
2058 rcvd_bytes += pkt->pkt_length;
2059 rcvd_packets++;
2060
2061 KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
2062 fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
2063
2064 /* packet is for BSD flow, create an mbuf chain */
2065 uint32_t len = (l2len + plen);
2066 uint16_t data_csum = 0;
2067 struct mbuf *m;
2068 bool is_wake_pkt = false;
2069 if (__improbable(is_mbuf)) {
2070 m = pkt->pkt_mbuf;
2071
2072 if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) {
2073 is_wake_pkt = true;
2074 }
2075
2076 /* Detach mbuf from source pkt */
2077 KPKT_CLEAR_MBUF_DATA(pkt);
2078
2079 uint32_t trailer = (m_pktlen(m) - len);
2080 ASSERT((uint32_t)m_pktlen(m) >= plen);
2081 /* Remove the trailer */
2082 if (trailer > 0) {
2083 m_adj(m, -trailer);
2084 }
2085 /* attached mbuf is already allocated */
2086 csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
2087 } else { /* native */
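/*
 * Round the L2 header length up to a 4-byte boundary so that
 * the IP header starts aligned within the mbuf (e.g. a 14-byte
 * ethernet header gets 2 bytes of leading pad).
 */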
2088 uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) -
2089 l2len;
2090 uint32_t tot_len = (len + pad);
2091 /* remember largest aggregated packet size */
2092 if (smbuf) {
2093 /* include the L2 alignment padding in the size */
2094 if (largest_smbuf <
2095 (uint32_t)m_pktlen(smbuf) + pad) {
2096 largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad;
2097 }
2098 }
2099
2100 if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
2101 is_wake_pkt = true;
2102 }
2103
2104 if (prev_csum_ok && curr_m) {
2105 ASSERT(fa.fa_smbuf == smbuf);
2106 ASSERT(!fa.fa_sobj_is_pkt);
2107 agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2108
2109 if (agg_ok &&
2110 M_TRAILINGSPACE(curr_m) >= plen - thlen) {
2111 /*
2112 * No need for a new mbuf,
2113 * just append to curr_m.
2114 */
2115 csum_ok = copy_pkt_csum_packed(pkt,
2116 plen, NULL, is_ipv4, curr_m, NULL,
2117 &data_csum, NULL);
2118
2119 if (!csum_ok) {
2120 STATS_INC(fsws,
2121 FSW_STATS_RX_AGG_BAD_CSUM);
2122 SK_ERR("Checksum for "
2123 "aggregation is wrong");
2124 DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1);
2125 /*
2126 * Turns out, the checksum is wrong!
2127 * Fall back to no-agg mode.
2128 */
2129 agg_ok = false;
2130 } else {
2131 /*
2132 * We only added payload,
2133 * thus -thlen.
2134 */
2135 bytes += (plen - thlen);
2136 flow_agg_merge_hdr(&fa, pkt,
2137 data_csum, fsws);
2138 goto next;
2139 }
2140 }
2141 }
2142
2143 /*
2144 * If the batch allocation returned partial success or this
2145 * packet is too big for a batched mbuf, allocate one here.
2146 */
2147 m = mhead;
2148 if (__improbable(m == NULL ||
2149 tot_len > mhead_bufsize)) {
2150 unsigned int num_segs = 1;
2151 if (tot_len > M16KCLBYTES) {
2152 num_segs = 0;
2153 }
2154
2155 ASSERT(mhead_cnt == 0 || mhead != NULL);
2156 err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
2157 &num_segs, &m);
2158 if (err != 0) {
2159 STATS_INC(fsws,
2160 FSW_STATS_RX_DROP_NOMEM_BUF);
2161 SK_ERR("mbuf alloc failed (err %d), "
2162 "maxchunks %d, len %d", err, num_segs,
2163 tot_len);
2164 __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
2165 continue;
2166 }
2167 } else {
2168 ASSERT(mhead_cnt > 0);
2169 mhead = m->m_nextpkt;
2170 m->m_nextpkt = NULL;
2171 mhead_cnt--;
2172 }
2173 m->m_data += pad;
2174 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2175
2176 /*
2177 * Copy and checksum L3, L4 and payload. As an
2178 * optimization, the L2 header is copied later,
2179 * and only if we can't aggregate.
2180 */
2181 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
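/*
 * Wrap the destination mbuf chain in a _dbuf_array_t so the
 * shared copy-and-checksum routines can target either buflets
 * (channel path) or mbufs (host path).
 */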
2182 _dbuf_array_t dbuf_array = {.dba_is_buflet = false};
2183 if (agg_ok) {
2184 int added = 0, dbuf_idx = 0;
2185 struct mbuf *m_tmp = m;
2186 dbuf_array.dba_num_dbufs = 0;
2187 uint32_t m_chain_max_len = 0;
2188 while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
2189 dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
2190 dbuf_array.dba_num_dbufs += 1;
2191 m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
2192 m_tmp = m_tmp->m_next;
2193 dbuf_idx++;
2194 }
2195 ASSERT(m_tmp == NULL);
2196
2197 csum_ok = copy_pkt_csum_packed(pkt, plen,
2198 &dbuf_array, is_ipv4, curr_m, NULL,
2199 &data_csum, &added);
2200
2201 if (!csum_ok) {
2202 STATS_INC(fsws,
2203 FSW_STATS_RX_AGG_BAD_CSUM);
2204 SK_ERR("Checksum for aggregation "
2205 "on new mbuf is wrong");
2206 DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2);
2207 agg_ok = false;
2208 goto non_agg;
2209 }
2210
2211 /*
2212 * There was not enough space in curr_m,
2213 * thus we must have added to m->m_data.
2214 */
2215 VERIFY(added > 0);
2216 VERIFY(m->m_len <= m->m_pkthdr.len &&
2217 (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
2218
2219 /*
2220 * We account for whatever we added
2221 * to m later on, thus - added.
2222 */
2223 bytes += plen - thlen - added;
2224 } else {
2225non_agg:
2226 dbuf_array.dba_num_dbufs = 0;
2227 uint32_t m_chain_max_len = 0;
2228 struct mbuf *m_tmp = m;
2229 int dbuf_idx = 0;
2230 while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) {
2231 dbuf_array.dba_mbuf[dbuf_idx] = m_tmp;
2232 dbuf_array.dba_num_dbufs += 1;
2233 m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp);
2234 m_tmp = m_tmp->m_next;
2235 dbuf_idx++;
2236 }
2237 ASSERT(m_tmp == NULL);
2238
2239 m->m_len += l2len;
2240 m->m_pkthdr.len += l2len;
2241 csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
2242 &data_csum, is_ipv4);
2243 if (__improbable(!csum_ok)) {
2244 STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM);
2245 SK_ERR("%d incorrect csum", __LINE__);
2246 DTRACE_SKYWALK(aggr__host_tcp_csum_fail);
2247 }
2248 VERIFY(m->m_len <= m->m_pkthdr.len &&
2249 (uint32_t)m->m_pkthdr.len <= m_chain_max_len);
2250 }
2251
2252 STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF);
2253 STATS_INC(fsws, FSW_STATS_RX_COPY_SUM);
2254
2255 m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off;
2256 m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value;
2257 /*
2258 * Note that these flags have the same value,
2259 * except PACKET_CSUM_PARTIAL.
2260 */
2261 m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags &
2262 PACKET_CSUM_RX_FLAGS);
2263
2264 /* Set the rcvif */
2265 m->m_pkthdr.rcvif = fsw->fsw_ifp;
2266
2267 /* Make sure to propagate the wake pkt flag */
2268 if (is_wake_pkt) {
2269 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
2270 }
2271 }
2272 ASSERT(m != NULL);
2273 ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL);
2274 ASSERT((m->m_flags & M_HASFCS) == 0);
2275 ASSERT(m->m_nextpkt == NULL);
2276
2277 if (__improbable(is_mbuf)) {
2278 if ((uint32_t) m->m_len < (l2len + thlen)) {
2279 m = m_pullup(m, (l2len + thlen));
2280 if (m == NULL) {
2281 STATS_INC(fsws,
2282 FSW_STATS_RX_DROP_NOMEM_BUF);
2283 SK_ERR("mbuf pullup failed (err %d)",
2284 err);
2285 __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
2286 continue;
2287 }
2288 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2289 }
2290 if (prev_csum_ok && csum_ok) {
2291 ASSERT(fa.fa_smbuf == smbuf);
2292 agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2293 }
2294 }
2295
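/*
 * Either merge this mbuf's payload into the super mbuf (headers
 * stripped, mbuf chained via m_next), or link it in as the next
 * packet of the chain handed up to the host stack.
 */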
2296 if (agg_ok) {
2297 ASSERT(is_wake_pkt == false);
2298 ASSERT(fa.fa_smbuf == smbuf);
2299 ASSERT(!fa.fa_sobj_is_pkt);
2300 if (__improbable(is_mbuf)) {
2301 bytes += (m_pktlen(m) - l2len);
2302 /* adjust mbuf by l2, l3 and l4 hdr */
2303 m_adj(m, l2len + thlen);
2304 } else {
2305 bytes += m_pktlen(m);
2306 }
2307
2308 m->m_flags &= ~M_PKTHDR;
2309 flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
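/* append m's payload at the tail of the super mbuf's buffer chain */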
2310 while (curr_m->m_next != NULL) {
2311 curr_m = curr_m->m_next;
2312 }
2313 curr_m->m_next = m;
2314 curr_m = m;
2315 m = NULL;
2316 } else {
2317 if ((uint32_t) m->m_len < l2len) {
2318 m = m_pullup(m, l2len);
2319 if (m == NULL) {
2320 STATS_INC(fsws,
2321 FSW_STATS_RX_DROP_NOMEM_BUF);
2322 SK_ERR("mbuf pullup failed (err %d)",
2323 err);
2324 __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt);
2325 continue;
2326 }
2327 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
2328 }
2329
2330 /* copy l2 header for native */
2331 if (__probable(!is_mbuf)) {
2332 uint16_t llhoff = pkt->pkt_headroom;
2333 uint8_t *baddr;
2334 MD_BUFLET_ADDR_ABS(pkt, baddr);
2335 ASSERT(baddr != NULL);
2336 baddr += llhoff;
2337 pkt_copy(baddr, m_mtod_current(m), l2len);
2338 }
2339 /* adjust mbuf by l2 hdr */
2340 m_adj(m, l2len);
2341 bytes += m_pktlen(m);
2342
2343 /*
2344 * aggregated packets can be skipped by pktap because
2345 * the original pre-aggregated chain already passed through
2346 * pktap (see fsw_snoop()) before entering this function.
2347 */
2348 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
2349
2350 if (m_chain == NULL) {
2351 /* this is the start of the chain */
2352 m_chain = m;
2353 smbuf = m;
2354 curr_m = m;
2355 } else if (smbuf != NULL) {
2356 /*
2357 * set m to be next packet
2358 */
2359 mbuf_agg_log(smbuf, kernproc, is_mbuf);
2360 smbuf->m_nextpkt = m;
2361 /*
2362 * Clean up (finalize) an smbuf only if it was pre-allocated with
2363 * >1 segments, which only happens when mhead_bufsize > M16KCLBYTES.
2364 */
2365 if (_finalize_smbuf(smbuf)) {
2366 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
2367 }
2368 smbuf_finalized++;
2369 smbuf = m;
2370 curr_m = m;
2371 } else {
2372 VERIFY(0);
2373 }
2374
2375 smbufs++;
2376 m = NULL;
2377
2378 flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
2379 /*
2380 * If the super packet is an mbuf which can't accommodate
2381 * sizeof(struct ip6_tcp_mask) in a single buffer, then
2382 * do the aggregation check in the slow path.
2383 * Note that an mbuf without a cluster has only 80 bytes
2384 * available for data, and sizeof(struct ip6_tcp_mask) is
2385 * also 80 bytes, so if the packet contains an
2386 * ethernet header, this mbuf won't be able to fully
2387 * contain "struct ip6_tcp_mask" data in a single
2388 * buffer.
2389 */
2390 if (pkt->pkt_flow_ip_ver == IPV6_VERSION) {
2391 if (__improbable(smbuf->m_len <
2392 ((m_mtod_current(smbuf) -
2393 (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) +
2394 MASK_SIZE))) {
2395 fa.fa_sobj_is_short = true;
2396 }
2397 }
2398 }
2399next:
2400 pkt_agg_log(pkt, kernproc, true);
2401 prev_csum_ok = csum_ok;
2402 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
2403 }
2404
2405 KPKTQ_FINI(&fe->fe_rx_pktq);
2406
2407 /* Free any leftover mbufs, true only for native */
2408 if (__improbable(mhead != NULL)) {
2409 ASSERT(mhead_cnt != 0);
2410 STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt);
2411 (void) m_freem_list(mhead);
2412 mhead = NULL;
2413 mhead_cnt = 0;
2414 }
2415
2416 converge_aggregation_size(fe, largest_smbuf);
2417
2418 if (smbufs > 0) {
2419 /* Last smbuf */
2420 mbuf_agg_log(smbuf, kernproc, is_mbuf);
2421 SK_DF(logflags, "smbuf count %u", smbufs);
2422
2423 ASSERT(m_chain != NULL);
2424 ASSERT(smbuf != NULL);
2425
2426 /*
2427 * If the last mbuf needs to be finalized (mhead_bufsize > M16KCLBYTES)
2428 * but has not been (smbuf_finalized < smbufs), do it now.
2429 */
2430 if (smbuf_finalized < smbufs &&
2431 _finalize_smbuf(smbuf)) {
2432 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
2433 }
2434
2435 /*
2436 * Call fsw_host_sendup() with mbuf chain
2437 * directly.
2438 */
2439 mchain_agg_log(m_chain, kernproc, is_mbuf);
2440 fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes);
2441
2442 if (__improbable(is_mbuf)) {
2443 STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs);
2444 } else {
2445 STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs);
2446 }
2447 FLOW_STATS_IN_ADD(fe, spackets, smbufs);
2448
2449 ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0);
2450 }
2451
2452 /* record (raw) number of packets and bytes */
2453 ASSERT((int)(rcvd_bytes - drop_bytes) >= 0);
2454 ASSERT((int)(rcvd_packets - drop_packets) >= 0);
2455 flow_track_stats(fe, (rcvd_bytes - drop_bytes),
2456 (rcvd_packets - drop_packets), (rcvd_ulen != 0), true);
2457
2458 pp_free_pktq(&disposed_pkts);
2459}
2460
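/*
 * Entry point for TCP Rx aggregation. Flows with outstanding
 * fragments take the regular Rx path. Otherwise packets are
 * aggregated into super mbufs for the host stack (subject to the
 * rx_agg_tcp_host mode and the absence of IP/interface filters),
 * or into super packets for a channel.
 */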
2461void
2462flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe,
2463 uint32_t flags)
2464{
2465#pragma unused(flags)
2466 struct pktq dropped_pkts;
2467 bool is_mbuf;
2468
2469 if (__improbable(fe->fe_rx_frag_count > 0)) {
2470 dp_flow_rx_process(fsw, fe, 0);
2471 return;
2472 }
2473
2474 KPKTQ_INIT(&dropped_pkts);
2475
2476 if (!dp_flow_rx_route_process(fsw, fe)) {
2477 SK_ERR("Rx route bad");
2478 fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
2479 STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE,
2480 KPKTQ_LEN(&dropped_pkts));
2481 goto done;
2482 }
2483
2484 is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq)));
2485
2486 if (fe->fe_nx_port == FSW_VP_HOST) {
2487 boolean_t do_rx_agg;
2488
2489 /* BSD flow */
2490 if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) {
2491 do_rx_agg = (sk_fsw_rx_agg_tcp_host ==
2492 SK_FSW_RX_AGG_TCP_HOST_ON);
2493 } else {
2494 do_rx_agg = !dlil_has_ip_filter() &&
2495 !dlil_has_if_filter(fsw->fsw_ifp);
2496 }
2497 if (__improbable(!do_rx_agg)) {
2498 fsw_host_rx(fsw, &fe->fe_rx_pktq);
2499 return;
2500 }
2501 if (__improbable(pktap_total_tap_count != 0)) {
2502 fsw_snoop(fsw, fe, true);
2503 }
2504 flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2505 } else {
2506 /* channel flow */
2507 if (__improbable(pktap_total_tap_count != 0)) {
2508 fsw_snoop(fsw, fe, true);
2509 }
2510 flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2511 }
2512
2513done:
2514 pp_free_pktq(&dropped_pkts);
2515}
2516