1 | /* |
2 | * Copyright (c) 2019-2023 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <skywalk/os_skywalk_private.h> |
30 | #include <skywalk/nexus/flowswitch/nx_flowswitch.h> |
31 | #include <skywalk/nexus/flowswitch/fsw_var.h> |
32 | #include <skywalk/nexus/flowswitch/flow/flow_var.h> |
33 | #include <skywalk/nexus/netif/nx_netif.h> |
34 | #include <skywalk/nexus/netif/nx_netif_compat.h> |
35 | #include <netinet/tcp.h> |
36 | #include <netinet/ip.h> |
37 | #include <netinet/ip6.h> |
38 | #include <net/pktap.h> |
39 | #include <sys/sdt.h> |
40 | |
41 | #define MAX_AGG_IP_LEN() MIN(sk_fsw_rx_agg_tcp, IP_MAXPACKET) |
42 | #define MAX_BUFLET_COUNT (32) |
43 | #define TCP_FLAGS_IGNORE (TH_FIN|TH_SYN|TH_RST|TH_URG) |
44 | #define PKT_IS_MBUF(_pkt) (_pkt->pkt_pflags & PKT_F_MBUF_DATA) |
45 | #define PKT_IS_TRUNC_MBUF(_pkt) (PKT_IS_MBUF(_pkt) && \ |
46 | (_pkt->pkt_pflags & PKT_F_TRUNCATED)) |
47 | #define PKT_IS_WAKE_PKT(_pkt) ((PKT_IS_MBUF(_pkt) && \ |
	            (_pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) || \
49 | (!PKT_IS_MBUF(_pkt) && \ |
50 | (_pkt->pkt_pflags & PKT_F_WAKE_PKT))) |
51 | |
52 | |
53 | typedef uint16_t (* flow_agg_fix_pkt_sum_func)(uint16_t, uint16_t, uint16_t); |
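
/*
 * Incremental checksum fixup in the style of RFC 1624: given the current
 * 16-bit ones'-complement checksum and a 16-bit quantity that changes from
 * 'old' to 'new', return the updated checksum.  The no-op variant is used
 * when the checksum in the aggregated header need not be maintained (see
 * the IFNET_LRO case in flow_agg_init_common()).
 */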
54 | |
55 | static uint16_t |
56 | flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new); |
57 | |
58 | static uint16_t |
59 | flow_agg_pkt_fix_sum_no_op(uint16_t csum, uint16_t old, uint16_t new); |
60 | |
61 | /* |
62 | * This structure holds per-super object (mbuf/packet) flow aggregation. |
63 | */ |
64 | struct flow_agg { |
65 | union { |
66 | struct { |
67 | union { |
68 | void * _fa_sobj; |
69 | struct mbuf * _fa_smbuf; /* super mbuf */ |
70 | struct __kern_packet *_fa_spkt; /* super pkt */ |
71 | }; |
72 | uint8_t *_fa_sptr; /* ptr to super IP header */ |
73 | bool _fa_sobj_is_pkt; /* super obj is pkt or mbuf */ |
74 | /* |
75 | * super obj is not large enough to hold the IP & TCP |
76 | * header in a contiguous buffer. |
77 | */ |
78 | bool _fa_sobj_is_short; |
79 | uint32_t _fa_tcp_seq; /* expected next sequence # */ |
80 | uint32_t _fa_ulen; /* expected next ulen */ |
81 | uint32_t _fa_total; /* total aggregated bytes */ |
			/* function that fixes the packet checksum */
83 | flow_agg_fix_pkt_sum_func _fa_fix_pkt_sum; |
84 | } __flow_agg; |
85 | uint64_t __flow_agg_data[5]; |
86 | }; |
87 | #define fa_sobj __flow_agg._fa_sobj |
88 | #define fa_smbuf __flow_agg._fa_smbuf |
89 | #define fa_spkt __flow_agg._fa_spkt |
90 | #define fa_sptr __flow_agg._fa_sptr |
91 | #define fa_sobj_is_pkt __flow_agg._fa_sobj_is_pkt |
92 | #define fa_sobj_is_short __flow_agg._fa_sobj_is_short |
93 | #define fa_tcp_seq __flow_agg._fa_tcp_seq |
94 | #define fa_ulen __flow_agg._fa_ulen |
95 | #define fa_total __flow_agg._fa_total |
96 | #define fa_fix_pkt_sum __flow_agg._fa_fix_pkt_sum |
97 | }; |
98 | |
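/*
 * Clearing the state: sk_zero_32() wipes the first 32 bytes, and the
 * function pointer at offset 32 is reset separately, which is why the
 * _CASSERTs below pin the structure layout.
 */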
99 | #define FLOW_AGG_CLEAR(_fa) do { \ |
100 | _CASSERT(sizeof(struct flow_agg) == 40); \ |
101 | _CASSERT(offsetof(struct flow_agg, fa_fix_pkt_sum) == 32); \ |
102 | sk_zero_32(_fa); \ |
	(_fa)->fa_fix_pkt_sum = NULL;					\
104 | } while (0) |
105 | |
106 | #define MASK_SIZE 80 /* size of struct {ip,ip6}_tcp_mask */ |
107 | |
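/*
 * The masks below drive the aggregation fast path: a candidate header is
 * compared byte-for-byte against the super packet's header under the mask,
 * so only fields with non-zero mask bytes must match exactly.  Fields that
 * legitimately differ between consecutive segments (ip_len/ip6_plen, ip_id,
 * checksums, th_seq and TH_PUSH) are masked out; everything else, including
 * the first 12 bytes of TCP options, must be identical.
 */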
108 | struct ip_tcp_mask { |
109 | struct ip ip_m; |
110 | struct tcphdr tcp_m; |
111 | uint32_t tcp_option_m[MAX_TCPOPTLEN / sizeof(uint32_t)]; |
112 | }; |
113 | |
114 | static const struct ip_tcp_mask ip_tcp_mask |
115 | __sk_aligned(16) = |
116 | { |
117 | .ip_m = { |
118 | .ip_hl = 0xf, |
119 | .ip_v = 0xf, |
120 | .ip_tos = 0xff, |
121 | /* Not checked; aggregated packet's ip_len is increasing */ |
122 | .ip_len = 0, |
123 | .ip_id = 0, |
124 | .ip_off = 0xffff, |
125 | .ip_ttl = 0xff, |
126 | .ip_p = 0xff, |
127 | .ip_sum = 0, |
128 | .ip_src.s_addr = 0xffffffff, |
129 | .ip_dst.s_addr = 0xffffffff, |
130 | }, |
131 | .tcp_m = { |
132 | .th_sport = 0xffff, |
133 | .th_dport = 0xffff, |
134 | .th_seq = 0, |
135 | .th_ack = 0xffffffff, |
136 | .th_x2 = 0xf, |
137 | .th_off = 0xf, |
138 | .th_flags = ~TH_PUSH, |
139 | .th_win = 0xffff, |
140 | .th_sum = 0, |
141 | .th_urp = 0xffff, |
142 | }, |
143 | .tcp_option_m = { |
144 | /* Max 40 bytes of TCP options */ |
145 | 0xffffffff, |
146 | 0xffffffff, |
147 | 0xffffffff, |
148 | 0, /* Filling up to MASK_SIZE */ |
149 | 0, /* Filling up to MASK_SIZE */ |
150 | 0, /* Filling up to MASK_SIZE */ |
151 | 0, /* Filling up to MASK_SIZE */ |
152 | 0, /* Filling up to MASK_SIZE */ |
153 | 0, /* Filling up to MASK_SIZE */ |
154 | 0, /* Filling up to MASK_SIZE */ |
155 | }, |
156 | }; |
157 | |
158 | struct ip6_tcp_mask { |
159 | struct ip6_hdr ip6_m; |
160 | struct tcphdr tcp_m; |
	uint32_t tcp_option_m[5]; /* 5 words (20 bytes) to fill up to MASK_SIZE */
162 | }; |
163 | |
164 | static const struct ip6_tcp_mask ip6_tcp_mask |
165 | __sk_aligned(16) = |
166 | { |
167 | .ip6_m = { |
168 | .ip6_ctlun.ip6_un1.ip6_un1_flow = 0xffffffff, |
169 | /* Not checked; aggregated packet's ip_len is increasing */ |
170 | .ip6_ctlun.ip6_un1.ip6_un1_plen = 0, |
171 | .ip6_ctlun.ip6_un1.ip6_un1_nxt = 0xff, |
172 | .ip6_ctlun.ip6_un1.ip6_un1_hlim = 0xff, |
	.ip6_src.__u6_addr.__u6_addr32[0] = 0xffffffff,
	.ip6_src.__u6_addr.__u6_addr32[1] = 0xffffffff,
	.ip6_src.__u6_addr.__u6_addr32[2] = 0xffffffff,
	.ip6_src.__u6_addr.__u6_addr32[3] = 0xffffffff,
	.ip6_dst.__u6_addr.__u6_addr32[0] = 0xffffffff,
	.ip6_dst.__u6_addr.__u6_addr32[1] = 0xffffffff,
	.ip6_dst.__u6_addr.__u6_addr32[2] = 0xffffffff,
	.ip6_dst.__u6_addr.__u6_addr32[3] = 0xffffffff,
181 | }, |
182 | .tcp_m = { |
183 | .th_sport = 0xffff, |
184 | .th_dport = 0xffff, |
185 | .th_seq = 0, |
186 | .th_ack = 0xffffffff, |
187 | .th_x2 = 0xf, |
188 | .th_off = 0xf, |
189 | .th_flags = ~TH_PUSH, |
190 | .th_win = 0xffff, |
191 | .th_sum = 0, |
192 | .th_urp = 0xffff, |
193 | }, |
194 | .tcp_option_m = { |
195 | /* Max 40 bytes of TCP options */ |
196 | 0xffffffff, |
197 | 0xffffffff, |
198 | 0xffffffff, |
199 | 0, /* Filling up to MASK_SIZE */ |
200 | 0, /* Filling up to MASK_SIZE */ |
201 | }, |
202 | }; |
203 | |
204 | #if SK_LOG |
205 | SK_LOG_ATTRIBUTE |
206 | static void |
207 | _pkt_agg_log(struct __kern_packet *pkt, struct proc *p, bool is_input) |
208 | { |
209 | SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | |
210 | (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY))); |
211 | |
212 | kern_packet_t ph = SK_PKT2PH(pkt); |
213 | uint64_t bufcnt = 1; |
214 | if (!is_input) { |
215 | bufcnt = kern_packet_get_buflet_count(ph); |
216 | } |
217 | |
218 | SK_DF(logflags, "%s(%d) %spkt 0x%llx plen %u" , |
219 | sk_proc_name_address(p), sk_proc_pid(p), is_input ? "s" :"d" , |
220 | SK_KVA(pkt), pkt->pkt_length); |
221 | |
222 | SK_DF(logflags, "%spkt csumf/rxstart/rxval 0x%x/%u/0x%04x" , |
223 | is_input ? "s" :"d" , pkt->pkt_csum_flags, |
224 | (uint32_t)pkt->pkt_csum_rx_start_off, |
225 | (uint32_t)pkt->pkt_csum_rx_value); |
226 | |
227 | if (!is_input) { |
228 | kern_buflet_t buf = kern_packet_get_next_buflet(ph, NULL); |
229 | |
230 | /* Individual buflets */ |
231 | for (uint64_t i = 0; i < bufcnt && buf != NULL; i++) { |
			SK_DF(logflags | SK_VERB_DUMP, "%s",
			    sk_dump("buf", kern_buflet_get_data_address(buf),
234 | pkt->pkt_length, 128, NULL, 0)); |
235 | buf = kern_packet_get_next_buflet(ph, buf); |
236 | } |
237 | } |
238 | } |
239 | |
240 | #define pkt_agg_log(_pkt, _p, _is_input) do { \ |
241 | if (__improbable(sk_verbose != 0)) { \ |
242 | _pkt_agg_log(_pkt, _p, _is_input); \ |
243 | } \ |
244 | } while (0) |
245 | |
246 | SK_LOG_ATTRIBUTE |
247 | static void |
248 | _mbuf_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf) |
249 | { |
250 | SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | |
251 | (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY))); |
252 | |
253 | SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u" , |
254 | sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m), |
255 | m->m_pkthdr.len); |
256 | |
257 | SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x" , |
258 | m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start, |
259 | (uint32_t)m->m_pkthdr.csum_rx_val); |
260 | |
261 | /* Dump the first mbuf */ |
262 | ASSERT(m_mtod_current(m) != NULL); |
263 | SK_DF(logflags | SK_VERB_DUMP, "%s" , sk_dump("buf" , |
264 | (uint8_t *)m_mtod_current(m), m->m_len, 128, NULL, 0)); |
265 | } |
266 | |
267 | #define mbuf_agg_log(_m, _p, _is_mbuf) do { \ |
268 | if (__improbable(sk_verbose != 0)) { \ |
269 | _mbuf_agg_log(_m, _p, _is_mbuf); \ |
270 | } \ |
271 | } while (0) |
272 | |
273 | SK_LOG_ATTRIBUTE |
274 | static void |
275 | _mchain_agg_log(struct mbuf *m, struct proc *p, bool is_mbuf) |
276 | { |
277 | SK_LOG_VAR(uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | |
278 | (is_mbuf ? SK_VERB_COPY_MBUF : SK_VERB_COPY))); |
279 | |
280 | while (m != NULL) { |
281 | SK_DF(logflags, "%s(%d) dest mbuf 0x%llx pktlen %u" , |
282 | sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(m), |
283 | m->m_pkthdr.len); |
284 | |
285 | SK_DF(logflags, "dest mbuf csumf/rxstart/rxval 0x%x/%u/0x%04x" , |
286 | m->m_pkthdr.csum_flags, (uint32_t)m->m_pkthdr.csum_rx_start, |
287 | (uint32_t)m->m_pkthdr.csum_rx_val); |
288 | |
289 | m = m->m_nextpkt; |
290 | } |
291 | } |
292 | |
293 | #define mchain_agg_log(_m, _p, _is_mbuf) do { \ |
294 | if (__improbable(sk_verbose != 0)) { \ |
295 | _mchain_agg_log(_m, _p, _is_mbuf); \ |
296 | } \ |
297 | } while (0) |
298 | #else |
299 | #define pkt_agg_log(...) |
300 | #define mbuf_agg_log(...) |
301 | #define mchain_agg_log(...) |
302 | #endif /* SK_LOG */ |
303 | |
304 | /* |
305 | * Checksum only for packet with mbuf. |
306 | */ |
307 | static bool |
308 | mbuf_csum(struct __kern_packet *pkt, struct mbuf *m, bool verify_l3, |
309 | uint16_t *data_csum) |
310 | { |
311 | ASSERT(data_csum != NULL); |
312 | |
313 | SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX)); |
314 | uint32_t plen = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen + |
315 | pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen; |
316 | uint16_t l4len = plen - pkt->pkt_l2_len - pkt->pkt_flow_ip_hlen; |
317 | uint16_t start = pkt->pkt_l2_len; |
318 | uint32_t partial = 0; |
319 | uint16_t csum = 0; |
320 | |
321 | ASSERT(plen == m_pktlen(m)); |
322 | |
323 | /* Some compat drivers compute full checksum */ |
324 | if ((m->m_pkthdr.csum_flags & CSUM_RX_FULL_FLAGS) == |
325 | CSUM_RX_FULL_FLAGS) { |
326 | SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x" , |
327 | m->m_pkthdr.csum_flags, m->m_pkthdr.csum_rx_start, |
328 | m->m_pkthdr.csum_rx_val); |
329 | |
330 | /* Compute the data_csum */ |
331 | struct tcphdr *tcp = |
332 | (struct tcphdr *)(void *)(mtod(m, uint8_t *) + |
333 | pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen); |
334 | /* 16-bit alignment is sufficient */ |
335 | ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t))); |
336 | |
337 | uint16_t th_sum = tcp->th_sum; |
338 | tcp->th_sum = 0; |
339 | |
340 | partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen, |
341 | pkt->pkt_flow_tcp_hlen); |
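		/*
		 * Fold in the TCP pseudo-header: the protocol and L4 length
		 * are added here, and in_pseudo()/in6_pseudo() below add the
		 * source and destination addresses.
		 */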
342 | partial += htons(l4len + IPPROTO_TCP); |
343 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
344 | csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr, |
345 | pkt->pkt_flow_ipv4_dst.s_addr, partial); |
346 | } else { |
347 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
348 | csum = in6_pseudo(&pkt->pkt_flow_ipv6_src, |
349 | &pkt->pkt_flow_ipv6_dst, partial); |
350 | } |
351 | /* Restore the original checksum */ |
352 | tcp->th_sum = th_sum; |
		th_sum = __packet_fix_sum(th_sum, csum, 0);
354 | *data_csum = ~th_sum & 0xffff; |
355 | |
		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, m->m_pkthdr.csum_rx_val,
		    false);
359 | |
360 | if ((m->m_pkthdr.csum_rx_val ^ 0xffff) == 0) { |
361 | return true; |
362 | } else { |
363 | return false; |
364 | } |
365 | } |
366 | /* Reset the csum RX flags */ |
367 | m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS; |
368 | if (verify_l3) { |
369 | csum = m_sum16(m, start, pkt->pkt_flow_ip_hlen); |
370 | SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)" , |
371 | start, pkt->pkt_flow_ip_hlen, csum); |
372 | m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; |
373 | if ((csum ^ 0xffff) != 0) { |
374 | return false; |
375 | } else { |
376 | m->m_pkthdr.csum_flags |= CSUM_IP_VALID; |
377 | } |
378 | } |
379 | /* Compute L4 header checksum */ |
380 | partial = m_sum16(m, start + pkt->pkt_flow_ip_hlen, |
381 | pkt->pkt_flow_tcp_hlen); |
382 | /* Compute payload checksum */ |
383 | start += (pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen); |
384 | *data_csum = m_sum16(m, start, (plen - start)); |
385 | |
386 | /* Fold in the data checksum to TCP checksum */ |
387 | partial += *data_csum; |
388 | partial += htons(l4len + IPPROTO_TCP); |
389 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
390 | csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr, |
391 | pkt->pkt_flow_ipv4_dst.s_addr, partial); |
392 | } else { |
393 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
394 | csum = in6_pseudo(&pkt->pkt_flow_ipv6_src, |
395 | &pkt->pkt_flow_ipv6_dst, partial); |
396 | } |
397 | SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)" , |
398 | start - pkt->pkt_flow_tcp_hlen, l4len, csum); |
	/* Set start to 0 for full checksum */
400 | m->m_pkthdr.csum_rx_start = 0; |
401 | m->m_pkthdr.csum_rx_val = csum; |
402 | m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); |
403 | |
	/* pkt metadata will be transferred to the super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
407 | |
408 | if ((csum ^ 0xffff) != 0) { |
409 | return false; |
410 | } |
411 | |
412 | return true; |
413 | } |
414 | |
415 | /* structure to pass an array of data buffers */ |
416 | typedef struct _dbuf_array { |
417 | union { |
418 | struct __kern_buflet *dba_buflet[MAX_BUFLET_COUNT]; |
419 | struct mbuf *dba_mbuf[MAX_BUFLET_COUNT]; |
420 | }; |
421 | uint8_t dba_num_dbufs; |
422 | bool dba_is_buflet; |
423 | } _dbuf_array_t; |
424 | |
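/*
 * Copy (and optionally checksum) 'plen' bytes of the source packet starting
 * at 'soff' into the destination buffers, in order, appending at each
 * buffer's current data length.  'odd_start' carries the byte parity across
 * buffers so the ones'-complement sum stays correct.
 */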
425 | static inline void |
426 | _copy_data_sum_dbuf(struct __kern_packet *spkt, uint16_t soff, uint16_t plen, |
427 | uint32_t *partial_sum, boolean_t *odd_start, _dbuf_array_t *dbuf, |
428 | boolean_t do_csum) |
429 | { |
430 | uint8_t i = 0; |
431 | uint32_t buflet_dlim, buflet_dlen, buf_off = 0; |
432 | |
433 | ASSERT(plen > 0); |
434 | while (plen > 0) { |
435 | ASSERT(i < dbuf->dba_num_dbufs); |
436 | uint32_t dbuf_lim, tmplen; |
437 | uint8_t *dbuf_addr; |
438 | |
439 | if (dbuf->dba_is_buflet) { |
440 | ASSERT(kern_buflet_get_data_offset(dbuf->dba_buflet[i]) == 0); |
441 | dbuf_addr = kern_buflet_get_data_address(dbuf->dba_buflet[i]); |
442 | |
443 | buflet_dlim = kern_buflet_get_data_limit(dbuf->dba_buflet[i]); |
444 | buflet_dlen = kern_buflet_get_data_length(dbuf->dba_buflet[i]); |
445 | buf_off = buflet_dlen; |
446 | dbuf_lim = buflet_dlim - buf_off; |
447 | dbuf_addr += buf_off; |
448 | } else { |
449 | dbuf_lim = (uint32_t) M_TRAILINGSPACE(dbuf->dba_mbuf[i]); |
450 | dbuf_addr = mtod(dbuf->dba_mbuf[i], uint8_t *); |
451 | buf_off = dbuf->dba_mbuf[i]->m_len; |
452 | dbuf_addr += buf_off; |
453 | } |
		tmplen = min(plen, dbuf_lim);
455 | if (PKT_IS_TRUNC_MBUF(spkt)) { |
456 | if (do_csum) { |
				*partial_sum = m_copydata_sum(spkt->pkt_mbuf,
				    soff, tmplen, dbuf_addr, *partial_sum,
				    odd_start);
460 | } else { |
461 | m_copydata(spkt->pkt_mbuf, soff, tmplen, |
462 | dbuf_addr); |
463 | } |
464 | } else { |
			*partial_sum = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    soff, dbuf_addr, tmplen, do_csum, *partial_sum,
			    odd_start);
468 | } |
469 | if (dbuf->dba_is_buflet) { |
470 | VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[i], |
471 | tmplen + buf_off) == 0); |
472 | } else { |
473 | dbuf->dba_mbuf[i]->m_len += tmplen; |
474 | dbuf->dba_mbuf[0]->m_pkthdr.len += tmplen; |
475 | } |
476 | soff += tmplen; |
477 | plen -= tmplen; |
478 | buf_off = 0; |
479 | i++; |
480 | } |
481 | ASSERT(plen == 0); |
482 | } |
483 | |
484 | /* |
485 | * Copy (fill) and checksum for packet. |
486 | * spkt: source IP packet. |
487 | * plen: length of data in spkt (IP hdr + TCP hdr + TCP payload). |
488 | * verify_l3: verify IPv4 header checksum. |
489 | * currm: destination mbuf. |
490 | * currp: destination skywalk packet. |
491 | * dbuf: additional destination data buffer(s), used when current destination |
492 | * packet is out of space. |
493 | * added: amount of data copied from spkt to the additional buffer. |
494 | * data_sum: 16-bit folded partial checksum of the copied TCP payload. |
495 | */ |
496 | static bool |
497 | copy_pkt_csum_packed(struct __kern_packet *spkt, uint32_t plen, |
498 | _dbuf_array_t *dbuf, bool verify_l3, struct mbuf *currm, |
499 | struct __kern_buflet *currp, uint16_t *data_csum, int *added) |
500 | { |
501 | ASSERT(data_csum != NULL); |
502 | |
503 | SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX | |
504 | SK_VERB_COPY)); |
505 | |
506 | uint16_t start = 0, csum = 0; |
507 | uint32_t len = 0; |
508 | uint32_t l4len; |
509 | /* soff is only used for packets */ |
510 | uint16_t soff = spkt->pkt_headroom + spkt->pkt_l2_len; |
511 | uint32_t data_partial = 0, partial = 0; |
512 | int32_t curr_oldlen; |
513 | uint32_t curr_trailing; |
514 | char *curr_ptr; |
515 | int32_t curr_len; |
516 | uint16_t data_off; |
517 | uint32_t tmplen; |
518 | boolean_t odd_start = FALSE; |
519 | bool verify_l4; |
520 | |
	/* Exactly one of currm and currp must be set, never both */
522 | VERIFY((currm != NULL || currp != NULL) && |
523 | ((currm != NULL) != (currp != NULL))); |
524 | |
525 | if (currm != NULL) { |
526 | curr_oldlen = currm->m_len; |
527 | curr_trailing = (uint32_t)M_TRAILINGSPACE(currm); |
528 | curr_ptr = mtod(currm, char *) + currm->m_len; |
529 | curr_len = currm->m_len; |
530 | } else { |
531 | curr_oldlen = currp->buf_dlen; |
532 | curr_trailing = currp->buf_dlim - currp->buf_doff - |
533 | currp->buf_dlen; |
534 | curr_ptr = (char *)(currp->buf_addr + currp->buf_doff + |
535 | currp->buf_dlen); |
536 | curr_len = currp->buf_dlen; |
537 | } |
538 | |
539 | /* Verify checksum only for IPv4 */ |
540 | len = spkt->pkt_flow_ip_hlen; |
541 | verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(spkt)); |
542 | if (verify_l3) { |
543 | if (PKT_IS_TRUNC_MBUF(spkt)) { |
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    len, 0, 0);
546 | } else { |
547 | partial = pkt_sum(SK_PKT2PH(spkt), soff, len); |
548 | } |
549 | |
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)", 0,
552 | len, csum); |
553 | spkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED; |
554 | if ((csum ^ 0xffff) != 0) { |
			/* No need to copy & checksum TCP+payload */
556 | return false; |
557 | } else { |
558 | spkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID; |
559 | } |
560 | } |
561 | |
562 | verify_l4 = !PACKET_HAS_FULL_CHECKSUM_FLAGS(spkt); |
563 | |
564 | /* Copy & verify TCP checksum */ |
565 | start = spkt->pkt_flow_ip_hlen + spkt->pkt_flow_tcp_hlen; |
566 | l4len = plen - spkt->pkt_flow_ip_hlen; |
567 | len = plen - start; |
568 | if (PKT_IS_TRUNC_MBUF(spkt)) { |
		tmplen = min(len, curr_trailing);
570 | odd_start = FALSE; |
571 | |
572 | /* First, simple checksum on the TCP header */ |
573 | if (verify_l4) { |
			partial = os_cpu_in_cksum_mbuf(spkt->pkt_mbuf,
			    spkt->pkt_flow_tcp_hlen, spkt->pkt_flow_ip_hlen,
			    0);
576 | } |
577 | |
578 | /* Now, copy & sum the payload */ |
579 | if (tmplen > 0) { |
			data_partial = m_copydata_sum(spkt->pkt_mbuf,
			    start, tmplen, curr_ptr, 0, &odd_start);
582 | curr_len += tmplen; |
583 | } |
584 | data_off = start + tmplen; |
585 | } else { |
		tmplen = min(len, curr_trailing);
587 | odd_start = FALSE; |
588 | |
589 | /* First, simple checksum on the TCP header */ |
590 | if (verify_l4) { |
591 | partial = pkt_sum(SK_PKT2PH(spkt), (soff + |
592 | spkt->pkt_flow_ip_hlen), spkt->pkt_flow_tcp_hlen); |
593 | } |
594 | |
595 | /* Now, copy & sum the payload */ |
596 | if (tmplen > 0) { |
			data_partial = pkt_copyaddr_sum(SK_PKT2PH(spkt),
			    (soff + start), (uint8_t *)curr_ptr, tmplen,
			    true, 0, &odd_start);
600 | curr_len += tmplen; |
601 | } |
602 | data_off = soff + start + tmplen; |
603 | } |
604 | |
605 | /* copy & sum remaining payload in additional buffers */ |
606 | if ((len - tmplen) > 0) { |
607 | ASSERT(dbuf != NULL); |
		_copy_data_sum_dbuf(spkt, data_off, (len - tmplen),
		    &data_partial, &odd_start, dbuf, true);
610 | *added = (len - tmplen); |
611 | } |
612 | |
613 | /* Fold data checksum to 16 bit */ |
	*data_csum = __packet_fold_sum(data_partial);
615 | |
616 | if (currm != NULL) { |
617 | currm->m_len = curr_len; |
618 | } else { |
619 | currp->buf_dlen = curr_len; |
620 | } |
621 | |
622 | if (verify_l4) { |
623 | /* Fold in the data checksum to TCP checksum */ |
624 | partial += *data_csum; |
625 | partial += htons(l4len + IPPROTO_TCP); |
626 | if (spkt->pkt_flow_ip_ver == IPVERSION) { |
627 | csum = in_pseudo(spkt->pkt_flow_ipv4_src.s_addr, |
628 | spkt->pkt_flow_ipv4_dst.s_addr, partial); |
629 | } else { |
630 | ASSERT(spkt->pkt_flow_ip_ver == IPV6_VERSION); |
631 | csum = in6_pseudo(&spkt->pkt_flow_ipv6_src, |
632 | &spkt->pkt_flow_ipv6_dst, partial); |
633 | } |
		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(spkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
637 | } else { |
638 | /* grab csum value from offload */ |
639 | csum = spkt->pkt_csum_rx_value; |
640 | } |
641 | |
642 | SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)" , |
643 | start - spkt->pkt_flow_tcp_hlen, l4len, ntohs(csum)); |
644 | |
645 | if ((csum ^ 0xffff) != 0) { |
646 | /* |
647 | * Revert whatever we did here! |
648 | * currm/currp should be restored to previous value. |
649 | * dbuf (for additional payload) should be restore to 0. |
650 | */ |
651 | if (currm != NULL) { |
652 | currm->m_len = curr_oldlen; |
653 | } else { |
654 | currp->buf_dlen = curr_oldlen; |
655 | } |
656 | if (dbuf != NULL) { |
657 | for (int i = 0; i < dbuf->dba_num_dbufs; i++) { |
658 | if (dbuf->dba_is_buflet) { |
659 | struct __kern_buflet *b = dbuf->dba_buflet[i]; |
660 | kern_buflet_set_data_length(b, 0); |
661 | kern_buflet_set_data_offset(b, 0); |
662 | } else { |
663 | struct mbuf *m = dbuf->dba_mbuf[i]; |
664 | m->m_len = m->m_pkthdr.len = 0; |
665 | } |
666 | } |
667 | } |
668 | |
669 | return false; |
670 | } |
671 | |
672 | return true; |
673 | } |
674 | |
675 | /* |
676 | * Copy and checksum for packet or packet with mbuf |
677 | * data_csum is only supported for bsd flows |
678 | */ |
679 | static bool |
680 | copy_pkt_csum(struct __kern_packet *pkt, uint32_t plen, _dbuf_array_t *dbuf, |
681 | uint16_t *data_csum, bool verify_l3) |
682 | { |
683 | /* |
684 | * To keep this routine simple and optimal, we are asserting on the |
685 | * assumption that the smallest flowswitch packet pool buffer should |
686 | * be large enough to hold the IP and TCP headers in the first buflet. |
687 | */ |
688 | _CASSERT(NX_FSW_MINBUFSIZE >= NETIF_COMPAT_MAX_MBUF_DATA_COPY); |
689 | |
690 | SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX | |
691 | (PKT_IS_MBUF(pkt) ? SK_VERB_COPY_MBUF : SK_VERB_COPY))); |
692 | |
693 | uint16_t start = 0, csum = 0; |
694 | uint32_t len = 0; |
695 | /* soff is only used for packets */ |
696 | uint16_t soff = pkt->pkt_headroom + pkt->pkt_l2_len; |
697 | uint32_t data_partial = 0, partial = 0; |
698 | boolean_t odd_start = false; |
699 | uint32_t data_len; |
700 | uint16_t dbuf_off; |
701 | uint16_t copied_len = 0; |
702 | bool l3_csum_ok; |
703 | uint8_t *daddr; |
704 | |
705 | if (dbuf->dba_is_buflet) { |
706 | daddr = kern_buflet_get_data_address(dbuf->dba_buflet[0]); |
707 | daddr += kern_buflet_get_data_length(dbuf->dba_buflet[0]); |
708 | } else { |
709 | daddr = mtod(dbuf->dba_mbuf[0], uint8_t *); |
710 | daddr += dbuf->dba_mbuf[0]->m_len; |
711 | /* |
712 | * available space check for payload is done later |
713 | * in _copy_data_sum_dbuf |
714 | */ |
715 | ASSERT(M_TRAILINGSPACE(dbuf->dba_mbuf[0]) >= |
716 | pkt->pkt_flow_ip_hlen + pkt->pkt_flow_tcp_hlen); |
717 | } |
718 | |
719 | if (PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)) { |
720 | /* copy only */ |
		_copy_data_sum_dbuf(pkt, PKT_IS_TRUNC_MBUF(pkt) ? 0 : soff,
		    plen, &partial, &odd_start, dbuf, false);
723 | if (PKT_IS_MBUF(pkt)) { |
724 | csum = pkt->pkt_mbuf->m_pkthdr.csum_rx_val; |
725 | SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x" , |
726 | pkt->pkt_mbuf->m_pkthdr.csum_flags, |
727 | pkt->pkt_mbuf->m_pkthdr.csum_rx_start, csum); |
728 | } else { |
729 | csum = pkt->pkt_csum_rx_value; |
730 | SK_DF(logflags, "HW csumf/rxstart/rxval 0x%x/%u/0x%04x" , |
731 | pkt->pkt_csum_flags, |
732 | pkt->pkt_csum_rx_start_off, csum); |
733 | } |
734 | |
		/* pkt metadata will be transferred to the super packet */
		__packet_set_inet_checksum(SK_PKT2PH(pkt),
		    PACKET_CSUM_RX_FULL_FLAGS, 0, csum, false);
738 | if ((csum ^ 0xffff) == 0) { |
739 | return true; |
740 | } else { |
741 | return false; |
742 | } |
743 | } |
744 | |
745 | /* Copy l3 & verify checksum only for IPv4 */ |
746 | start = 0; |
747 | len = pkt->pkt_flow_ip_hlen; |
748 | if (PKT_IS_TRUNC_MBUF(pkt)) { |
		partial = m_copydata_sum(pkt->pkt_mbuf, start, len,
		    (daddr + start), 0, NULL);
751 | } else { |
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), soff,
		    (daddr + start), len, true, 0, NULL);
754 | } |
755 | verify_l3 = (verify_l3 && !PACKET_HAS_VALID_IP_CSUM(pkt)); |
756 | l3_csum_ok = !verify_l3; |
757 | if (verify_l3) { |
		csum = __packet_fold_sum(partial);
		SK_DF(logflags, "IP copy+sum %u(%u) (csum 0x%04x)",
760 | start, len, csum); |
761 | pkt->pkt_csum_flags |= PACKET_CSUM_IP_CHECKED; |
762 | if ((csum ^ 0xffff) != 0) { |
			/* proceed to copy the rest of the packet */
764 | } else { |
765 | pkt->pkt_csum_flags |= PACKET_CSUM_IP_VALID; |
766 | l3_csum_ok = true; |
767 | } |
768 | } |
769 | copied_len += pkt->pkt_flow_ip_hlen; |
770 | |
771 | /* Copy & verify TCP checksum */ |
772 | start = pkt->pkt_flow_ip_hlen; |
773 | len = plen - start; |
774 | |
775 | if (PKT_IS_TRUNC_MBUF(pkt)) { |
776 | /* First, copy and sum TCP header */ |
		partial = m_copydata_sum(pkt->pkt_mbuf, start,
		    pkt->pkt_flow_tcp_hlen, (daddr + start), 0, NULL);
779 | |
780 | data_len = len - pkt->pkt_flow_tcp_hlen; |
781 | start += pkt->pkt_flow_tcp_hlen; |
782 | dbuf_off = start; |
783 | /* Next, copy and sum payload (if any) */ |
784 | } else { |
785 | /* First, copy and sum TCP header */ |
		partial = pkt_copyaddr_sum(SK_PKT2PH(pkt), (soff + start),
		    (daddr + start), pkt->pkt_flow_tcp_hlen, true, 0, NULL);
788 | |
789 | data_len = len - pkt->pkt_flow_tcp_hlen; |
790 | start += pkt->pkt_flow_tcp_hlen; |
791 | dbuf_off = start; |
792 | start += soff; |
793 | } |
794 | copied_len += pkt->pkt_flow_tcp_hlen; |
795 | |
796 | if (dbuf->dba_is_buflet) { |
797 | VERIFY(kern_buflet_set_data_length(dbuf->dba_buflet[0], |
798 | kern_buflet_get_data_length(dbuf->dba_buflet[0]) + |
799 | copied_len) == 0); |
800 | } else { |
801 | dbuf->dba_mbuf[0]->m_len += copied_len; |
802 | dbuf->dba_mbuf[0]->m_pkthdr.len += copied_len; |
803 | } |
804 | |
805 | /* copy and sum payload (if any) */ |
806 | if (data_len > 0) { |
807 | odd_start = false; |
		_copy_data_sum_dbuf(pkt, start, data_len, &data_partial,
		    &odd_start, dbuf, l3_csum_ok);
810 | } |
811 | |
812 | if (__improbable(!l3_csum_ok)) { |
813 | return false; |
814 | } |
815 | |
816 | /* Fold data sum to 16 bit and then into the partial */ |
	*data_csum = __packet_fold_sum(data_partial);
818 | |
819 | /* Fold in the data checksum to TCP checksum */ |
820 | partial += *data_csum; |
821 | |
822 | partial += htons(len + IPPROTO_TCP); |
823 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
824 | csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr, |
825 | pkt->pkt_flow_ipv4_dst.s_addr, partial); |
826 | } else { |
827 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
828 | csum = in6_pseudo(&pkt->pkt_flow_ipv6_src, |
829 | &pkt->pkt_flow_ipv6_dst, partial); |
830 | } |
831 | |
832 | SK_DF(logflags, "TCP copy+sum %u(%u) (csum 0x%04x)" , |
833 | pkt->pkt_flow_ip_hlen, len, csum); |
834 | |
	/* pkt metadata will be transferred to the super packet */
	__packet_set_inet_checksum(SK_PKT2PH(pkt), PACKET_CSUM_RX_FULL_FLAGS,
	    0, csum, false);
838 | if ((csum ^ 0xffff) != 0) { |
839 | return false; |
840 | } |
841 | |
842 | return true; |
843 | } |
844 | |
845 | SK_INLINE_ATTRIBUTE |
846 | static void |
847 | flow_agg_init_common(struct nx_flowswitch *fsw, struct flow_agg *fa, |
848 | struct __kern_packet *pkt) |
849 | { |
850 | struct ifnet *ifp; |
851 | |
852 | switch (pkt->pkt_flow_ip_ver) { |
853 | case IPVERSION: |
854 | if (pkt->pkt_flow_ip_hlen != sizeof(struct ip)) { |
855 | return; |
856 | } |
857 | break; |
858 | case IPV6_VERSION: |
859 | if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) { |
860 | return; |
861 | } |
862 | break; |
863 | default: |
864 | VERIFY(0); |
865 | /* NOTREACHED */ |
866 | __builtin_unreachable(); |
867 | } |
868 | |
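	/*
	 * Record what the next in-order segment must look like: the expected
	 * TCP sequence number, this packet's payload length, and the running
	 * total that is later checked against MAX_AGG_IP_LEN().
	 */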
869 | fa->fa_tcp_seq = ntohl(pkt->pkt_flow_tcp_seq) + pkt->pkt_flow_ulen; |
870 | fa->fa_ulen = pkt->pkt_flow_ulen; |
871 | fa->fa_total = pkt->pkt_flow_ip_hlen + |
872 | pkt->pkt_flow_tcp_hlen + pkt->pkt_flow_ulen; |
873 | |
874 | ifp = fsw->fsw_ifp; |
875 | ASSERT(ifp != NULL); |
876 | if (__improbable((ifp->if_hwassist & IFNET_LRO) != 0)) { |
		/* if the hardware supports LRO, don't fix up checksums in the header */
878 | fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum_no_op; |
879 | } else { |
880 | fa->fa_fix_pkt_sum = flow_agg_pkt_fix_sum; |
881 | } |
882 | } |
883 | |
884 | static void |
885 | flow_agg_init_smbuf(struct nx_flowswitch *fsw, struct flow_agg *fa, |
886 | struct mbuf *smbuf, struct __kern_packet *pkt) |
887 | { |
888 | FLOW_AGG_CLEAR(fa); |
889 | |
890 | ASSERT(smbuf != NULL); |
891 | fa->fa_smbuf = smbuf; |
892 | |
893 | fa->fa_sptr = mtod(smbuf, uint8_t *); |
894 | ASSERT(fa->fa_sptr != NULL); |
895 | |
896 | /* |
897 | * Note here we use 'pkt' instead of 'smbuf', since we rely on the |
898 | * contents of the flow structure which don't exist in 'smbuf'. |
899 | */ |
900 | flow_agg_init_common(fsw, fa, pkt); |
901 | } |
902 | |
903 | static void |
904 | flow_agg_init_spkt(struct nx_flowswitch *fsw, struct flow_agg *fa, |
905 | struct __kern_packet *spkt, struct __kern_packet *pkt) |
906 | { |
907 | FLOW_AGG_CLEAR(fa); |
908 | |
909 | ASSERT(spkt != NULL); |
910 | fa->fa_spkt = spkt; |
911 | fa->fa_sobj_is_pkt = true; |
912 | VERIFY(spkt->pkt_headroom == 0 && spkt->pkt_l2_len == 0); |
913 | |
914 | MD_BUFLET_ADDR_ABS(spkt, fa->fa_sptr); |
915 | ASSERT(fa->fa_sptr != NULL); |
916 | |
917 | /* |
918 | * Note here we use 'pkt' instead of 'spkt', since we rely on the |
919 | * contents of the flow structure which don't exist in 'spkt'. |
920 | */ |
921 | flow_agg_init_common(fsw, fa, pkt); |
922 | } |
923 | |
924 | SK_INLINE_ATTRIBUTE |
925 | static bool |
926 | ipv4_tcp_memcmp(const uint8_t *h1, const uint8_t *h2) |
927 | { |
	return sk_memcmp_mask_64B(h1, h2, (const uint8_t *)&ip_tcp_mask) == 0;
929 | } |
930 | |
931 | SK_INLINE_ATTRIBUTE |
932 | static bool |
933 | ipv6_tcp_memcmp(const uint8_t *h1, const uint8_t *h2) |
934 | { |
	return sk_memcmp_mask_80B(h1, h2, (const uint8_t *)&ip6_tcp_mask) == 0;
936 | } |
937 | |
938 | SK_INLINE_ATTRIBUTE |
939 | static bool |
940 | can_agg_fastpath(struct flow_agg *fa, struct __kern_packet *pkt, |
941 | struct fsw_stats *fsws) |
942 | { |
943 | bool match; |
944 | |
945 | ASSERT(fa->fa_sptr != NULL); |
946 | _CASSERT(sizeof(struct ip6_tcp_mask) == MASK_SIZE); |
947 | _CASSERT(sizeof(struct ip_tcp_mask) == MASK_SIZE); |
948 | |
949 | if (__improbable(pkt->pkt_length < MASK_SIZE)) { |
950 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_TCP); |
951 | goto slow_path; |
952 | } |
953 | |
954 | if (__improbable(fa->fa_sobj_is_short)) { |
955 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SHORT_MBUF); |
956 | goto slow_path; |
957 | } |
958 | |
959 | if (__improbable(pkt->pkt_flow_tcp_hlen != |
960 | (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA))) { |
961 | goto slow_path; |
962 | } |
963 | |
964 | switch (pkt->pkt_flow_ip_ver) { |
965 | case IPVERSION: |
		match = ipv4_tcp_memcmp(fa->fa_sptr,
		    (uint8_t *)pkt->pkt_flow_ip_hdr);
968 | break; |
969 | case IPV6_VERSION: |
		match = ipv6_tcp_memcmp(fa->fa_sptr,
		    (uint8_t *)pkt->pkt_flow_ip_hdr);
972 | break; |
973 | default: |
974 | VERIFY(0); |
975 | /* NOTREACHED */ |
976 | __builtin_unreachable(); |
977 | } |
978 | |
979 | if (__improbable(!match)) { |
980 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_MASK_TCP); |
981 | goto slow_path; |
982 | } |
983 | if (__improbable(pkt->pkt_flow_ulen != fa->fa_ulen)) { |
984 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ULEN_TCP); |
985 | goto slow_path; |
986 | } |
987 | |
988 | STATS_INC(fsws, FSW_STATS_RX_AGG_OK_FASTPATH_TCP); |
989 | fa->fa_tcp_seq += pkt->pkt_flow_ulen; |
990 | fa->fa_ulen = pkt->pkt_flow_ulen; |
991 | return true; |
992 | |
993 | slow_path: |
994 | return false; |
995 | } |
996 | |
997 | SK_NO_INLINE_ATTRIBUTE |
998 | static bool |
999 | can_agg_slowpath(struct flow_agg *fa, struct __kern_packet *pkt, |
1000 | struct fsw_stats *fsws) |
1001 | { |
1002 | uint8_t *sl3_hdr = fa->fa_sptr; |
1003 | uint32_t sl3tlen = 0; |
1004 | uint16_t sl3hlen = 0; |
1005 | |
1006 | DTRACE_SKYWALK2(aggr__slow, struct __kern_packet *, pkt, |
1007 | uint8_t *, sl3_hdr); |
1008 | |
1009 | ASSERT(sl3_hdr != NULL); |
1010 | |
1011 | /* |
1012 | * Compare IP header length, TOS, frag flags and IP options |
1013 | * For IPv4, the options should match exactly |
1014 | * For IPv6, if options are present, bail out |
1015 | */ |
1016 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
1017 | struct ip *siph = (struct ip *)(void *)sl3_hdr; |
1018 | struct ip *iph = (struct ip *)pkt->pkt_flow_ip_hdr; |
1019 | |
1020 | ASSERT(siph->ip_v == IPVERSION); |
1021 | /* 16-bit alignment is sufficient (handles mbuf case) */ |
1022 | ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t))); |
1023 | ASSERT(IS_P2ALIGNED(iph, sizeof(uint16_t))); |
1024 | |
1025 | sl3hlen = (siph->ip_hl << 2); |
1026 | if (sl3hlen != pkt->pkt_flow_ip_hlen) { |
1027 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP); |
1028 | DTRACE_SKYWALK2(aggr__fail2, uint16_t, sl3hlen, uint8_t, |
1029 | pkt->pkt_flow_ip_hlen); |
1030 | return false; |
1031 | } |
1032 | |
1033 | if (siph->ip_ttl != iph->ip_ttl) { |
1034 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP); |
1035 | DTRACE_SKYWALK2(aggr__fail3, uint8_t, siph->ip_ttl, |
1036 | uint8_t, iph->ip_ttl); |
1037 | return false; |
1038 | } |
1039 | |
1040 | if (siph->ip_tos != iph->ip_tos) { |
1041 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP); |
1042 | DTRACE_SKYWALK2(aggr__fail4, uint8_t, siph->ip_tos, |
1043 | uint8_t, iph->ip_tos); |
1044 | return false; |
1045 | } |
1046 | /* For IPv4, DF bit should match */ |
1047 | if ((ntohs(siph->ip_off) & (IP_DF | IP_RF)) != |
1048 | (ntohs(iph->ip_off) & (IP_DF | IP_RF))) { |
1049 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OFF_IP); |
1050 | DTRACE_SKYWALK2(aggr__fail5, uint16_t, |
1051 | ntohs(siph->ip_off), uint16_t, ntohs(iph->ip_off)); |
1052 | return false; |
1053 | } |
1054 | |
1055 | uint8_t ip_opts_len = pkt->pkt_flow_ip_hlen - |
1056 | sizeof(struct ip); |
1057 | if (ip_opts_len > 0 && |
		    memcmp((uint8_t *)(siph + 1), (uint8_t *)(iph + 1),
		    ip_opts_len) != 0) {
1060 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPT_IP); |
1061 | DTRACE_SKYWALK3(aggr__fail6, uint8_t, ip_opts_len, |
1062 | uint8_t *, (uint8_t *)(siph + 1), uint8_t *, |
1063 | (uint8_t *)(iph + 1)); |
1064 | return false; |
1065 | } |
1066 | sl3tlen = ntohs(siph->ip_len); |
1067 | } else { |
1068 | struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr; |
1069 | struct ip6_hdr *ip6 = (struct ip6_hdr *)pkt->pkt_flow_ip_hdr; |
1070 | |
1071 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
1072 | ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION); |
1073 | /* 16-bit alignment is sufficient (handles mbuf case) */ |
1074 | ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t))); |
1075 | |
1076 | if (pkt->pkt_flow_ip_hlen != sizeof(struct ip6_hdr)) { |
1077 | /* |
1078 | * Don't aggregate if extension header is present in |
1079 | * packet. N.B. currently flow switch only classifies |
1080 | * frag header |
1081 | */ |
1082 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_IP); |
1083 | DTRACE_SKYWALK1(aggr__fail7, uint8_t, |
1084 | pkt->pkt_flow_ip_hlen); |
1085 | return false; |
1086 | } |
1087 | |
1088 | sl3hlen = sizeof(struct ip6_hdr); |
1089 | /* For IPv6, flow info mask covers TOS and flow label */ |
		if (memcmp(&sip6->ip6_flow, &ip6->ip6_flow,
		    sizeof(sip6->ip6_flow)) != 0) {
1092 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TOS_IP); |
1093 | DTRACE_SKYWALK2(aggr__fail8, uint32_t, |
1094 | ntohl(sip6->ip6_flow), uint32_t, |
1095 | ntohl(ip6->ip6_flow)); |
1096 | return false; |
1097 | } |
1098 | |
1099 | if (sip6->ip6_hlim != ip6->ip6_hlim) { |
1100 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_TTL_IP); |
1101 | DTRACE_SKYWALK2(aggr__fail9, uint8_t, sip6->ip6_hlim, |
1102 | uint8_t, ip6->ip6_hlim); |
1103 | return false; |
1104 | } |
1105 | |
1106 | sl3tlen = (sizeof(struct ip6_hdr) + ntohs(sip6->ip6_plen)); |
1107 | } |
1108 | |
1109 | /* |
1110 | * For TCP header, compare ACK number and window size |
1111 | * Compare TCP flags |
1112 | * Compare TCP header length and TCP options |
1113 | */ |
1114 | struct tcphdr *stcp = (struct tcphdr *)(void *)(sl3_hdr + sl3hlen); |
1115 | struct tcphdr *tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
1116 | |
1117 | uint16_t sl4hlen = (stcp->th_off << 2); |
	if (memcmp(&stcp->th_ack, &tcp->th_ack, sizeof(stcp->th_ack)) != 0 ||
	    memcmp(&stcp->th_win, &tcp->th_win, sizeof(stcp->th_win)) != 0) {
1120 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_ACKWIN_TCP); |
1121 | DTRACE_SKYWALK4(aggr__fail9, uint32_t, ntohl(stcp->th_ack), |
1122 | uint32_t, ntohl(tcp->th_ack), uint16_t, ntohs(stcp->th_win), |
1123 | uint16_t, ntohs(tcp->th_win)); |
1124 | return false; |
1125 | } |
1126 | |
1127 | if ((stcp->th_flags & ~(TH_PUSH)) != (tcp->th_flags & ~(TH_PUSH))) { |
1128 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_FLAGS_TCP); |
1129 | DTRACE_SKYWALK2(aggr__fail10, uint8_t, stcp->th_flags, |
1130 | uint8_t, tcp->th_flags); |
1131 | return false; |
1132 | } |
1133 | |
1134 | if (sl4hlen != pkt->pkt_flow_tcp_hlen) { |
1135 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_HLEN_TCP); |
1136 | DTRACE_SKYWALK2(aggr__fail11, uint8_t, sl4hlen, |
1137 | uint8_t, pkt->pkt_flow_tcp_hlen); |
1138 | return false; |
1139 | } |
1140 | |
1141 | uint8_t tcp_opts_len = pkt->pkt_flow_tcp_hlen - sizeof(struct tcphdr); |
1142 | /* |
1143 | * We know that the TCP-option lengthes are the same thanks to the above |
1144 | * sl4hlen check |
1145 | */ |
	if (tcp_opts_len > 0 && memcmp((uint8_t *)(stcp + 1),
	    (uint8_t *)(tcp + 1), tcp_opts_len) != 0) {
1148 | /* |
1149 | * Fast-path header prediction: |
1150 | * |
1151 | * TCP Timestamp option is usually put after two NOP-headers, |
1152 | * and thus total TCP-option length is 12. If that's the case, |
1153 | * we can aggregate as only the TCP time-stamp option differs. |
1154 | */ |
1155 | if (tcp_opts_len != TCPOLEN_TSTAMP_APPA) { |
1156 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_EXOPT_TCP); |
1157 | DTRACE_SKYWALK1(aggr__fail13, uint8_t, tcp_opts_len); |
1158 | return false; |
1159 | } else { |
1160 | uint32_t sts_hdr, ts_hdr; |
1161 | if (IS_P2ALIGNED(stcp + 1, sizeof(uint32_t))) { |
1162 | sts_hdr = *((uint32_t *)(stcp + 1)); |
1163 | } else { |
				bcopy(stcp + 1, &sts_hdr, sizeof(sts_hdr));
1165 | } |
1166 | if (IS_P2ALIGNED(tcp + 1, sizeof(uint32_t))) { |
1167 | ts_hdr = *((uint32_t *)(tcp + 1)); |
1168 | } else { |
				bcopy(tcp + 1, &ts_hdr, sizeof(ts_hdr));
1170 | } |
1171 | |
1172 | if (sts_hdr != htonl(TCPOPT_TSTAMP_HDR) || |
1173 | ts_hdr != htonl(TCPOPT_TSTAMP_HDR)) { |
1174 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_OPTTS_TCP); |
1175 | DTRACE_SKYWALK2(aggr__fail14, uint32_t, |
1176 | sts_hdr, uint32_t, ts_hdr); |
1177 | return false; |
1178 | } |
1179 | } |
1180 | } |
1181 | STATS_INC(fsws, FSW_STATS_RX_AGG_OK_SLOWPATH_TCP); |
1182 | fa->fa_tcp_seq += pkt->pkt_flow_ulen; |
1183 | fa->fa_ulen = pkt->pkt_flow_ulen; |
1184 | return true; |
1185 | } |
1186 | |
1187 | static bool |
1188 | flow_agg_is_ok(struct flow_agg *fa, struct __kern_packet *pkt, |
1189 | struct fsw_stats *fsws) |
1190 | { |
	/* The aggregated ip_len must not exceed MIN(configured limit, 64K) */
1192 | const uint32_t max_ip_len = MAX_AGG_IP_LEN(); |
1193 | bool can_agg = false; |
1194 | |
1195 | DTRACE_SKYWALK2(aggr__check, struct flow_agg *, fa, |
1196 | struct __kern_packet *, pkt); |
1197 | |
1198 | ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP); |
1199 | if (__improbable(pkt->pkt_flow_tcp_agg_fast != 0)) { |
1200 | pkt->pkt_flow_tcp_agg_fast = 0; |
1201 | } |
1202 | /* |
1203 | * Don't aggregate if any of the following is true: |
1204 | * 1. TCP flag is other than TH_{ACK,PUSH} |
1205 | * 2. Payload length is 0 (pure ACK) |
1206 | * 3. This is the first packet |
1207 | * 4. TCP sequence number is not expected |
1208 | * 5. We would've exceeded the maximum aggregated size |
1209 | * 6. It's not the first packet and the wake flag is set |
1210 | */ |
1211 | if (__improbable((pkt->pkt_flow_tcp_flags & TCP_FLAGS_IGNORE) != 0 || |
1212 | pkt->pkt_flow_ulen == 0 || fa->fa_sobj == NULL)) { |
1213 | DTRACE_SKYWALK1(aggr__fail1a, struct __kern_packet *, pkt); |
1214 | goto done; |
1215 | } |
1216 | if (__improbable(ntohl(pkt->pkt_flow_tcp_seq) != fa->fa_tcp_seq)) { |
1217 | DTRACE_SKYWALK2(aggr__fail1b, uint32_t, |
1218 | ntohl(pkt->pkt_flow_tcp_seq), uint32_t, fa->fa_tcp_seq); |
1219 | STATS_INC(fsws, FSW_STATS_RX_AGG_NO_SEQN_TCP); |
1220 | goto done; |
1221 | } |
1222 | if (__improbable((fa->fa_total + pkt->pkt_flow_ulen) > max_ip_len)) { |
1223 | DTRACE_SKYWALK3(aggr__fail1c, uint32_t, fa->fa_total, |
1224 | uint32_t, pkt->pkt_flow_ulen, uint32_t, max_ip_len); |
1225 | /* We've reached aggregation limit */ |
1226 | STATS_INC(fsws, FSW_STATS_RX_AGG_LIMIT); |
1227 | goto done; |
1228 | } |
1229 | if (__improbable(PKT_IS_WAKE_PKT(pkt) && fa->fa_total > 0)) { |
1230 | DTRACE_SKYWALK1(aggr__fail1d, struct __kern_packet *, pkt); |
1231 | goto done; |
1232 | } |
1233 | |
1234 | can_agg = can_agg_fastpath(fa, pkt, fsws); |
1235 | if (can_agg) { |
1236 | pkt->pkt_flow_tcp_agg_fast = 1; |
1237 | goto done; |
1238 | } |
1239 | |
1240 | can_agg = can_agg_slowpath(fa, pkt, fsws); |
1241 | ASSERT(!pkt->pkt_flow_tcp_agg_fast); |
1242 | |
1243 | done: |
1244 | return can_agg; |
1245 | } |
1246 | |
1247 | static uint16_t |
1248 | flow_agg_pkt_fix_sum(uint16_t csum, uint16_t old, uint16_t new) |
1249 | { |
1250 | return __packet_fix_sum(csum, old, new); |
1251 | } |
1252 | |
1253 | static uint16_t |
1254 | flow_agg_pkt_fix_sum_no_op(uint16_t __unused csum, uint16_t __unused old, |
1255 | uint16_t __unused new) |
1256 | { |
1257 | return 0; |
1258 | } |
1259 | |
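/*
 * Replace a 32-bit header field (e.g. a TCP timestamp value) and fold the
 * change into the 16-bit checksum as two independent 16-bit fixups, one for
 * each half of the old and new values.
 */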
1260 | static inline void |
1261 | flow_agg_pkt_fix_hdr_sum(struct flow_agg *fa, uint8_t *field, uint16_t *csum, |
1262 | uint32_t new) |
1263 | { |
1264 | uint32_t old; |
	memcpy(&old, field, sizeof(old));
	memcpy(field, &new, sizeof(uint32_t));
1267 | *csum = fa->fa_fix_pkt_sum(fa->fa_fix_pkt_sum(*csum, |
1268 | (uint16_t)(old >> 16), (uint16_t)(new >> 16)), |
1269 | (uint16_t)(old & 0xffff), |
1270 | (uint16_t)(new & 0xffff)); |
1271 | } |
1272 | |
1273 | static void |
1274 | flow_agg_merge_hdr(struct flow_agg *fa, struct __kern_packet *pkt, |
1275 | __unused uint16_t data_csum, struct fsw_stats *fsws) |
1276 | { |
1277 | struct tcphdr *stcp, *tcp; |
1278 | uint8_t *l3hdr, l3hlen; |
1279 | uint16_t old_l3len = 0; |
1280 | uint8_t result; |
1281 | |
1282 | SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX)); |
1283 | |
1284 | /* |
1285 | * The packet being merged should always have full checksum flags |
1286 | * and a valid checksum. Otherwise, it would fail copy_pkt_csum_packed |
1287 | * and not enter this function. |
1288 | */ |
1289 | ASSERT(PACKET_HAS_FULL_CHECKSUM_FLAGS(pkt)); |
1290 | ASSERT((pkt->pkt_csum_rx_value ^ 0xffff) == 0); |
1291 | |
1292 | ASSERT(fa->fa_sobj != NULL); |
1293 | ASSERT(!fa->fa_sobj_is_pkt || |
1294 | (fa->fa_spkt->pkt_headroom == 0 && fa->fa_spkt->pkt_l2_len == 0)); |
1295 | uint8_t *sl3_hdr = fa->fa_sptr; |
1296 | ASSERT(sl3_hdr != NULL); |
1297 | ASSERT(fa->fa_fix_pkt_sum != NULL); |
1298 | |
1299 | fa->fa_total += pkt->pkt_flow_ulen; |
1300 | |
1301 | /* |
1302 | * Update the IP header as: |
1303 | * 1. Set the IP ID (IPv4 only) to that of the new packet |
1304 | * 2. Set the ttl to the lowest of the two |
1305 | * 3. Increment the IP length by the payload length of new packet |
1306 | * 4. Leave the IP (IPv4 only) checksum as is |
	 * Update the respective flow classification fields, if any
1308 | * Nothing to update for TCP header for now |
1309 | */ |
1310 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
1311 | struct ip *siph = (struct ip *)(void *)sl3_hdr; |
1312 | |
1313 | /* 16-bit alignment is sufficient (handles mbuf case) */ |
1314 | ASSERT(IS_P2ALIGNED(siph, sizeof(uint16_t))); |
1315 | |
1316 | l3hdr = (uint8_t *)siph; |
1317 | l3hlen = siph->ip_hl << 2; |
1318 | |
1319 | old_l3len = ntohs(siph->ip_len); |
1320 | uint16_t l3tlen = ntohs(siph->ip_len) + pkt->pkt_flow_ulen; |
1321 | siph->ip_len = htons(l3tlen); |
1322 | siph->ip_sum = fa->fa_fix_pkt_sum(siph->ip_sum, 0, |
1323 | htons(pkt->pkt_flow_ulen)); |
1324 | |
1325 | SK_DF(logflags, "Agg IP len %u" , ntohs(siph->ip_len)); |
1326 | } else { |
1327 | struct ip6_hdr *sip6 = (struct ip6_hdr *)(void *)sl3_hdr; |
1328 | |
1329 | /* 16-bit alignment is sufficient (handles mbuf case) */ |
1330 | ASSERT(IS_P2ALIGNED(sip6, sizeof(uint16_t))); |
1331 | ASSERT((sip6->ip6_vfc & IPV6_VERSION_MASK) == IPV6_VERSION); |
1332 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
1333 | |
1334 | l3hdr = (uint8_t *)sip6; |
1335 | l3hlen = sizeof(struct ip6_hdr); |
1336 | |
1337 | /* No extension headers should be present */ |
1338 | ASSERT(pkt->pkt_flow_ip_hlen == sizeof(struct ip6_hdr)); |
1339 | |
1340 | old_l3len = ntohs(sip6->ip6_plen) + sizeof(struct ip6_hdr); |
1341 | uint16_t l3plen = ntohs(sip6->ip6_plen) + pkt->pkt_flow_ulen; |
1342 | sip6->ip6_plen = htons(l3plen); |
1343 | |
1344 | SK_DF(logflags, "Agg IP6 len %u" , ntohs(sip6->ip6_plen)); |
1345 | } |
1346 | |
1347 | if (__probable(pkt->pkt_flow_tcp_agg_fast)) { |
1348 | STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_IP); |
1349 | } else { |
1350 | STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_IP); |
1351 | } |
1352 | |
1353 | stcp = (struct tcphdr *)(void *)(l3hdr + l3hlen); |
1354 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
1355 | /* 16-bit alignment is sufficient (handles mbuf case) */ |
1356 | ASSERT(IS_P2ALIGNED(stcp, sizeof(uint16_t))); |
1357 | ASSERT(IS_P2ALIGNED(tcp, sizeof(uint16_t))); |
1358 | |
1359 | /* |
1360 | * If it is bigger, that means there are TCP-options that need to be |
1361 | * copied over. |
1362 | */ |
1363 | if (pkt->pkt_flow_tcp_hlen > sizeof(struct tcphdr) || |
1364 | (stcp->th_flags & TH_PUSH) == 0) { |
1365 | VERIFY(stcp->th_off << 2 == pkt->pkt_flow_tcp_hlen); |
1366 | if (__improbable(!pkt->pkt_flow_tcp_agg_fast && |
1367 | memcmp(stcp + 1, tcp + 1, (pkt->pkt_flow_tcp_hlen - |
1368 | sizeof(struct tcphdr))) != 0)) { |
1369 | uint8_t *sopt = (uint8_t *)(stcp + 1); |
1370 | uint8_t *opt = (uint8_t *)(tcp + 1); |
1371 | |
1372 | uint32_t ntsval, ntsecr; |
			bcopy((void *)(opt + 4), &ntsval, sizeof(ntsval));
			bcopy((void *)(opt + 8), &ntsecr, sizeof(ntsecr));

			flow_agg_pkt_fix_hdr_sum(fa, sopt + 4,
			    &stcp->th_sum, ntsval);
			flow_agg_pkt_fix_hdr_sum(fa, sopt + 8,
			    &stcp->th_sum, ntsecr);
1378 | |
1379 | STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_SLOWPATH_TCP); |
1380 | } else { |
1381 | STATS_INC(fsws, FSW_STATS_RX_AGG_MERGE_FASTPATH_TCP); |
1382 | } |
1383 | |
1384 | if ((stcp->th_flags & TH_PUSH) == 0 && |
1385 | (tcp->th_flags & TH_PUSH) != 0) { |
1386 | uint16_t old, new; |
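			/*
			 * The checksum fixup operates on 16-bit words, so
			 * snapshot the word containing th_flags (th_off/
			 * th_x2 plus th_flags, right after th_ack) before
			 * and after setting TH_PUSH.
			 */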
1387 | old = *(uint16_t *)(void *)(&stcp->th_ack + 1); |
1388 | /* If the new segment has a PUSH-flag, append it! */ |
1389 | stcp->th_flags |= tcp->th_flags & TH_PUSH; |
1390 | new = *(uint16_t *)(void *)(&stcp->th_ack + 1); |
1391 | stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, old, new); |
1392 | } |
1393 | } |
1394 | |
1395 | /* Update pseudo header checksum */ |
1396 | stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, |
1397 | htons(pkt->pkt_flow_ulen)); |
1398 | |
1399 | /* Update data checksum */ |
1400 | if (__improbable(old_l3len & 0x1)) { |
		/*
		 * The merged payload starts at an odd byte offset, so swap
		 * the bytes of the folded sum before adding it; see RFC 1071
		 * section 2 on byte-order independence.
		 */
1402 | stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, |
1403 | ntohs(data_csum)); |
1404 | } else { |
1405 | stcp->th_sum = fa->fa_fix_pkt_sum(stcp->th_sum, 0, data_csum); |
1406 | } |
1407 | |
1408 | if (fa->fa_sobj_is_pkt) { |
1409 | struct __kern_packet *spkt = fa->fa_spkt; |
1410 | spkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP; |
1411 | spkt->pkt_flow_ulen += pkt->pkt_flow_ulen; |
1412 | /* |
1413 | * Super packet length includes L3 and L4 |
1414 | * header length for first packet only. |
1415 | */ |
1416 | spkt->pkt_length += pkt->pkt_flow_ulen; |
1417 | if (spkt->pkt_seg_cnt == 0) { |
1418 | /* First time we append packets, need to set it to 1 */ |
1419 | spkt->pkt_seg_cnt = 1; |
1420 | } |
1421 | _CASSERT(sizeof(result) == sizeof(spkt->pkt_seg_cnt)); |
1422 | if (!os_add_overflow(1, spkt->pkt_seg_cnt, &result)) { |
1423 | spkt->pkt_seg_cnt = result; |
1424 | } |
1425 | SK_DF(logflags, "Agg pkt len %u TCP csum 0x%04x" , |
1426 | spkt->pkt_length, ntohs(stcp->th_sum)); |
1427 | } else { |
1428 | struct mbuf *smbuf = fa->fa_smbuf; |
1429 | smbuf->m_pkthdr.len += pkt->pkt_flow_ulen; |
1430 | if (smbuf->m_pkthdr.seg_cnt == 0) { |
1431 | /* First time we append packets, need to set it to 1 */ |
1432 | smbuf->m_pkthdr.seg_cnt = 1; |
1433 | } |
1434 | _CASSERT(sizeof(result) == sizeof(smbuf->m_pkthdr.seg_cnt)); |
1435 | if (!os_add_overflow(1, smbuf->m_pkthdr.seg_cnt, &result)) { |
1436 | smbuf->m_pkthdr.seg_cnt = result; |
1437 | } |
1438 | SK_DF(logflags, "Agg mbuf len %u TCP csum 0x%04x" , |
1439 | smbuf->m_pkthdr.len, ntohs(stcp->th_sum)); |
1440 | } |
1441 | } |
1442 | |
1443 | /* |
1444 | * Copy metadata from source packet to destination packet |
1445 | */ |
1446 | static void |
1447 | pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt) |
1448 | { |
1449 | /* Copy packet metadata */ |
1450 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
1451 | _PKT_COPY(spkt, dpkt); |
1452 | } |
1453 | |
1454 | static void |
1455 | pkt_finalize(kern_packet_t ph) |
1456 | { |
1457 | int err = __packet_finalize(ph); |
1458 | VERIFY(err == 0); |
1459 | #if (DEVELOPMENT || DEBUG) |
1460 | struct __kern_packet *pkt = SK_PTR_ADDR_KPKT(ph); |
1461 | uint8_t *buf; |
1462 | MD_BUFLET_ADDR_ABS(pkt, buf); |
1463 | buf += pkt->pkt_headroom + pkt->pkt_l2_len; |
1464 | DTRACE_SKYWALK2(aggr__finalize, struct __kern_packet *, pkt, |
1465 | uint8_t *, buf); |
1466 | #endif |
1467 | } |
1468 | |
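/*
 * Estimate the buflet count needed to drain this flow's Rx queue: the queued
 * bytes coalesce into roughly (fe_rx_pktq_bytes / max_ip_len) super packets,
 * each re-adding one IP + TCP header of overhead, and the total is divided
 * by the aggregation buffer size actually in use.
 */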
1469 | static inline uint32_t |
1470 | estimate_buf_cnt(struct flow_entry *fe, uint32_t min_bufsize, |
1471 | uint32_t agg_bufsize) |
1472 | { |
1473 | uint32_t max_ip_len = MAX_AGG_IP_LEN(); |
1474 | uint32_t agg_size = MAX(fe->fe_rx_largest_size, min_bufsize); |
1475 | uint32_t hdr_overhead; |
1476 | |
1477 | agg_size = MIN(agg_size, agg_bufsize); |
1478 | |
1479 | hdr_overhead = (fe->fe_rx_pktq_bytes / max_ip_len) * |
1480 | (MAX(sizeof(struct ip), sizeof(struct ip6_hdr)) + |
1481 | sizeof(struct tcphdr)); |
1482 | |
1483 | return ((fe->fe_rx_pktq_bytes + hdr_overhead) / agg_size) + 1; |
1484 | } |
1485 | |
1486 | SK_INLINE_ATTRIBUTE |
1487 | static inline void |
1488 | _append_dbuf_array_to_kpkt(kern_packet_t ph, kern_buflet_t pbuf, |
1489 | _dbuf_array_t *dbuf_array, kern_buflet_t *lbuf) |
1490 | { |
1491 | for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) { |
1492 | kern_buflet_t buf = dbuf_array->dba_buflet[i]; |
1493 | VERIFY(kern_packet_add_buflet(ph, pbuf, buf) == 0); |
1494 | pbuf = buf; |
1495 | dbuf_array->dba_buflet[i] = NULL; |
1496 | } |
1497 | ASSERT(pbuf != NULL); |
1498 | dbuf_array->dba_num_dbufs = 0; |
1499 | *lbuf = pbuf; |
1500 | } |
1501 | |
1502 | SK_INLINE_ATTRIBUTE |
1503 | static inline void |
1504 | _free_dbuf_array(struct kern_pbufpool *pp, |
1505 | _dbuf_array_t *dbuf_array) |
1506 | { |
1507 | for (uint8_t i = 0; i < dbuf_array->dba_num_dbufs; i++) { |
1508 | kern_buflet_t buf = dbuf_array->dba_buflet[i]; |
1509 | pp_free_buflet(pp, buf); |
1510 | dbuf_array->dba_buflet[i] = NULL; |
1511 | } |
1512 | dbuf_array->dba_num_dbufs = 0; |
1513 | } |
1514 | |
1515 | static inline void |
1516 | finalize_super_packet(struct __kern_packet **spkt, kern_packet_t *sph, |
1517 | struct flow_agg *fa, uint32_t *largest_spkt, uint16_t *spkts, |
1518 | uint16_t bufcnt) |
1519 | { |
1520 | (*spkts)++; |
1521 | if (bufcnt > 1) { |
1522 | (*spkt)->pkt_aggr_type = PKT_AGGR_SINGLE_IP; |
1523 | } |
	pkt_finalize(*sph);
1525 | if ((*spkt)->pkt_length > *largest_spkt) { |
1526 | *largest_spkt = (*spkt)->pkt_length; |
1527 | } |
1528 | pkt_agg_log(*spkt, kernproc, false); |
1529 | DTRACE_SKYWALK1(aggr__buflet__count, uint16_t, bufcnt); |
1530 | *sph = 0; |
1531 | *spkt = NULL; |
1532 | FLOW_AGG_CLEAR(fa); |
1533 | } |
1534 | |
1535 | static inline void |
1536 | converge_aggregation_size(struct flow_entry *fe, uint32_t largest_agg_size) |
1537 | { |
1538 | if (fe->fe_rx_largest_size > largest_agg_size) { |
1539 | /* |
1540 | * Make it slowly move towards largest_agg_size if we |
1541 | * consistently get non-aggregatable size. |
1542 | * |
1543 | * If we start at 16K, this makes us go to 4K within 6 rounds |
1544 | * and down to 2K within 12 rounds. |
1545 | */ |
1546 | fe->fe_rx_largest_size -= |
1547 | ((fe->fe_rx_largest_size - largest_agg_size) >> 2); |
1548 | } else { |
1549 | fe->fe_rx_largest_size += |
1550 | ((largest_agg_size - fe->fe_rx_largest_size) >> 2); |
1551 | } |
1552 | } |
1553 | |
1554 | SK_NO_INLINE_ATTRIBUTE |
1555 | static void |
1556 | flow_rx_agg_channel(struct nx_flowswitch *fsw, struct flow_entry *fe, |
1557 | struct pktq *dropped_pkts, bool is_mbuf) |
1558 | { |
1559 | #define __RX_AGG_CHAN_DROP_SOURCE_PACKET(_pkt) do { \ |
1560 | KPKTQ_ENQUEUE(dropped_pkts, (_pkt)); \ |
1561 | (_pkt) = NULL; \ |
1562 | FLOW_AGG_CLEAR(&fa); \ |
1563 | prev_csum_ok = false; \ |
1564 | } while (0) |
	struct flow_agg fa; /* aggregation state */
1566 | FLOW_AGG_CLEAR(&fa); |
1567 | |
1568 | struct pktq pkts; /* dst super packets */ |
1569 | struct pktq disposed_pkts; /* done src packets */ |
1570 | |
1571 | KPKTQ_INIT(&pkts); |
1572 | KPKTQ_INIT(&disposed_pkts); |
1573 | |
1574 | struct __kern_channel_ring *ring; |
1575 | ring = fsw_flow_get_rx_ring(fsw, fe); |
1576 | if (__improbable(ring == NULL)) { |
1577 | SK_ERR("Rx ring is NULL" ); |
1578 | KPKTQ_CONCAT(dropped_pkts, &fe->fe_rx_pktq); |
1579 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DST_NXPORT_INVALID, |
1580 | KPKTQ_LEN(dropped_pkts)); |
1581 | return; |
1582 | } |
1583 | struct kern_pbufpool *dpp = ring->ckr_pp; |
1584 | ASSERT(dpp->pp_max_frags > 1); |
1585 | |
1586 | struct __kern_packet *pkt, *tpkt; |
1587 | /* state for super packet */ |
1588 | struct __kern_packet *spkt = NULL; |
1589 | kern_packet_t sph = 0; |
1590 | kern_buflet_t sbuf = NULL; |
1591 | bool prev_csum_ok = false, csum_ok, agg_ok; |
1592 | uint16_t spkts = 0, bufcnt = 0; |
1593 | int err; |
1594 | |
1595 | struct fsw_stats *fsws = &fsw->fsw_stats; |
1596 | |
1597 | /* state for buflet batch alloc */ |
1598 | uint32_t bh_cnt, bh_cnt_tmp; |
1599 | uint64_t buf_arr[MAX_BUFLET_COUNT]; |
1600 | _dbuf_array_t dbuf_array = {.dba_is_buflet = true, .dba_num_dbufs = 0}; |
1601 | uint32_t largest_spkt = 0; /* largest aggregated packet size */ |
1602 | uint32_t agg_bufsize; |
1603 | uint8_t iter = 0; |
1604 | bool large_buffer = false; |
1605 | |
1606 | SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX)); |
1607 | SK_DF(logflags, "Rx input queue len %u" , KPKTQ_LEN(&fe->fe_rx_pktq)); |
1608 | |
1609 | if (__probable(fe->fe_rx_largest_size != 0 && |
1610 | NX_FSW_TCP_RX_AGG_ENABLED())) { |
1611 | if (fe->fe_rx_largest_size <= PP_BUF_SIZE_DEF(dpp) || |
1612 | PP_BUF_SIZE_LARGE(dpp) == 0) { |
1613 | agg_bufsize = PP_BUF_SIZE_DEF(dpp); |
1614 | } else { |
1615 | agg_bufsize = PP_BUF_SIZE_LARGE(dpp); |
1616 | large_buffer = true; |
1617 | } |
1618 | bh_cnt = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp), |
1619 | agg_bufsize); |
1620 | DTRACE_SKYWALK1(needed_blt_cnt_agg, uint32_t, bh_cnt); |
1621 | bh_cnt = MIN(bh_cnt, MAX_BUFLET_COUNT); |
1622 | bh_cnt_tmp = bh_cnt; |
1623 | } else { |
1624 | /* |
1625 | * No payload, thus it's all small-sized ACKs/... |
1626 | * OR aggregation is disabled. |
1627 | */ |
1628 | agg_bufsize = PP_BUF_SIZE_DEF(dpp); |
1629 | bh_cnt_tmp = bh_cnt = MIN(KPKTQ_LEN(&fe->fe_rx_pktq), MAX_BUFLET_COUNT); |
1630 | DTRACE_SKYWALK1(needed_blt_cnt_no_agg, uint32_t, bh_cnt); |
1631 | } |
1632 | |
	err = pp_alloc_buflet_batch(dpp, buf_arr, &bh_cnt, SKMEM_NOSLEEP,
	    large_buffer);
1635 | if (__improbable(bh_cnt == 0)) { |
1636 | SK_ERR("failed to alloc %u buflets (err %d), use slow path" , |
1637 | bh_cnt_tmp, err); |
1638 | } |
1639 | bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION); |
1640 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) { |
1641 | if (tpkt != NULL) { |
1642 | void *baddr; |
1643 | MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr); |
1644 | SK_PREFETCH(baddr, 0); |
1645 | } |
1646 | |
1647 | ASSERT(pkt->pkt_qum.qum_pp != dpp); |
1648 | ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt))); |
1649 | ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver); |
1650 | ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0); |
1651 | ASSERT(!pkt->pkt_flow_ip_is_frag); |
1652 | ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP); |
1653 | |
1654 | csum_ok = false; |
1655 | agg_ok = false; |
1656 | /* supports TCP only */ |
1657 | uint32_t thlen = (pkt->pkt_flow_ip_hlen + |
1658 | pkt->pkt_flow_tcp_hlen); |
1659 | uint32_t plen = (thlen + pkt->pkt_flow_ulen); |
1660 | uint16_t data_csum = 0; |
1661 | |
1662 | KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt); |
1663 | fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen; |
1664 | err = flow_pkt_track(fe, pkt, true); |
1665 | if (__improbable(err != 0)) { |
1666 | STATS_INC(fsws, FSW_STATS_RX_FLOW_TRACK_ERR); |
1667 | /* if need to trigger RST */ |
1668 | if (err == ENETRESET) { |
				flow_track_abort_tcp(fe, pkt, NULL);
			}
			SK_ERR("flow_pkt_track failed (err %d)", err);
1672 | __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt); |
1673 | continue; |
1674 | } |
1675 | |
1676 | if (is_mbuf) { /* compat */ |
1677 | m_adj(pkt->pkt_mbuf, pkt->pkt_l2_len); |
1678 | pkt->pkt_svc_class = m_get_service_class(pkt->pkt_mbuf); |
1679 | if (pkt->pkt_mbuf->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) { |
1680 | pkt->pkt_pflags |= PKT_F_WAKE_PKT; |
1681 | } |
1682 | } |
1683 | |
1684 | if (prev_csum_ok && sbuf) { |
1685 | ASSERT(fa.fa_spkt == spkt); |
1686 | ASSERT(spkt == NULL || fa.fa_sobj_is_pkt); |
			agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
1688 | agg_ok = (agg_ok && bufcnt < dpp->pp_max_frags); |
1689 | |
1690 | if (agg_ok && sbuf->buf_dlim - sbuf->buf_doff - |
1691 | sbuf->buf_dlen >= plen - thlen) { |
1692 | /* |
1693 | * No need for a new packet, just |
1694 | * append to curr_m. |
1695 | */ |
1696 | csum_ok = copy_pkt_csum_packed(spkt: pkt, plen, NULL, |
1697 | verify_l3: is_ipv4, NULL, currp: sbuf, data_csum: &data_csum, NULL); |
1698 | |
1699 | if (!csum_ok) { |
1700 | STATS_INC(fsws, |
1701 | FSW_STATS_RX_AGG_BAD_CSUM); |
1702 | SK_ERR("Checksum for aggregation " |
1703 | "is wrong" ); |
1704 | DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail1); |
1705 | /* |
1706 | * Turns out, checksum is wrong! |
1707 | * Fallback to no-agg mode. |
1708 | */ |
1709 | agg_ok = false; |
1710 | } else { |
					flow_agg_merge_hdr(&fa, pkt,
					    data_csum, fsws);
1713 | goto next; |
1714 | } |
1715 | } |
1716 | } |
1717 | |
1718 | /* calculate number of buflets required */ |
1719 | bh_cnt_tmp = howmany(plen, agg_bufsize); |
1720 | if (__improbable(bh_cnt_tmp > MAX_BUFLET_COUNT)) { |
1721 | STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT); |
1722 | SK_ERR("packet too big: bufcnt %d len %d" , bh_cnt_tmp, |
1723 | plen); |
1724 | __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt); |
1725 | continue; |
1726 | } |
1727 | if (bh_cnt < bh_cnt_tmp) { |
1728 | uint32_t tmp; |
1729 | |
1730 | if (iter != 0) { |
1731 | /* |
1732 | * rearrange the array for additional |
1733 | * allocation |
1734 | */ |
1735 | uint8_t i; |
1736 | for (i = 0; i < bh_cnt; i++, iter++) { |
1737 | buf_arr[i] = buf_arr[iter]; |
1738 | buf_arr[iter] = 0; |
1739 | } |
1740 | iter = 0; |
1741 | } |
1742 | tmp = estimate_buf_cnt(fe, PP_BUF_SIZE_DEF(dpp), |
1743 | agg_bufsize); |
1744 | tmp = MIN(tmp, MAX_BUFLET_COUNT); |
1745 | tmp = MAX(tmp, bh_cnt_tmp); |
1746 | tmp -= bh_cnt; |
1747 | ASSERT(tmp <= (MAX_BUFLET_COUNT - bh_cnt)); |
1748 | DTRACE_SKYWALK1(refilled_blt_cnt, uint32_t, tmp); |
			err = pp_alloc_buflet_batch(dpp, &buf_arr[bh_cnt],
			    &tmp, SKMEM_NOSLEEP, large_buffer);
1751 | bh_cnt += tmp; |
1752 | if (__improbable((tmp == 0) || (bh_cnt < bh_cnt_tmp))) { |
1753 | STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT); |
1754 | SK_ERR("buflet alloc failed (err %d)" , err); |
1755 | __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt); |
1756 | continue; |
1757 | } |
1758 | } |
1759 | /* Use pre-allocated buflets */ |
1760 | ASSERT(bh_cnt >= bh_cnt_tmp); |
1761 | dbuf_array.dba_num_dbufs = bh_cnt_tmp; |
1762 | while (bh_cnt_tmp-- > 0) { |
1763 | dbuf_array.dba_buflet[bh_cnt_tmp] = |
1764 | (kern_buflet_t)(buf_arr[iter]); |
1765 | buf_arr[iter] = 0; |
1766 | bh_cnt--; |
1767 | iter++; |
1768 | } |
1769 | /* copy and checksum TCP data */ |
1770 | if (agg_ok) { |
1771 | int added = 0; |
1772 | ASSERT(dbuf_array.dba_num_dbufs != 0); |
			csum_ok = copy_pkt_csum_packed(pkt, plen, &dbuf_array,
			    is_ipv4, NULL, sbuf, &data_csum, &added);
1775 | |
1776 | if (__improbable(!csum_ok)) { |
1777 | STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM); |
1778 | SK_ERR("Checksum for aggregation on new " |
1779 | "mbuf is wrong" ); |
1780 | DTRACE_SKYWALK(aggr__chan_packed_tcp_csum_fail2); |
1781 | agg_ok = false; |
1782 | /* reset the used buflets */ |
1783 | uint8_t j; |
1784 | for (j = 0; j < dbuf_array.dba_num_dbufs; j++) { |
1785 | VERIFY(kern_buflet_set_data_length( |
1786 | dbuf_array.dba_buflet[j], 0) == 0); |
1787 | } |
1788 | goto non_agg; |
1789 | } |
1790 | |
1791 | /* |
1792 | * There was not enough space in curr_m, thus we must |
1793 | * have added to m->m_data. |
1794 | */ |
1795 | VERIFY(added > 0); |
1796 | } else { |
1797 | non_agg: |
1798 | ASSERT(dbuf_array.dba_num_dbufs != 0); |
		csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
		    &data_csum, is_ipv4);
1801 | if (__improbable(!csum_ok)) { |
1802 | STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM); |
1803 | SK_ERR("%d incorrect csum" , __LINE__); |
1804 | DTRACE_SKYWALK(aggr__chan_tcp_csum_fail); |
1805 | } |
1806 | } |
1807 | if (agg_ok) { |
1808 | ASSERT(fa.fa_spkt == spkt); |
1809 | ASSERT(spkt == NULL || fa.fa_sobj_is_pkt); |
1810 | /* update current packet header */ |
		flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
		ASSERT(dbuf_array.dba_num_dbufs > 0);
		bufcnt += dbuf_array.dba_num_dbufs;
		_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
		    &sbuf);
1816 | } else { |
1817 | /* Finalize the current super packet */ |
1818 | if (sph != 0) { |
				finalize_super_packet(&spkt, &sph, &fa,
				    &largest_spkt, &spkts, bufcnt);
1821 | } |
1822 | |
1823 | /* New super packet */ |
			err = kern_pbufpool_alloc_nosleep(dpp, 0, &sph);
			if (__improbable(err != 0)) {
				STATS_INC(fsws, FSW_STATS_DROP_NOMEM_PKT);
				SK_ERR("packet alloc failed (err %d)", err);
				_free_dbuf_array(dpp, &dbuf_array);
1829 | __RX_AGG_CHAN_DROP_SOURCE_PACKET(pkt); |
1830 | continue; |
1831 | } |
1832 | spkt = SK_PTR_ADDR_KPKT(sph); |
			pkt_copy_metadata(pkt, spkt);
1834 | /* Packet length for super packet starts from L3 */ |
1835 | spkt->pkt_length = plen; |
1836 | spkt->pkt_flow_ulen = pkt->pkt_flow_ulen; |
1837 | spkt->pkt_headroom = 0; |
1838 | spkt->pkt_l2_len = 0; |
1839 | spkt->pkt_seg_cnt = 1; |
1840 | |
1841 | ASSERT(dbuf_array.dba_num_dbufs > 0); |
1842 | bufcnt = dbuf_array.dba_num_dbufs; |
1843 | sbuf = kern_packet_get_next_buflet(sph, NULL); |
			_append_dbuf_array_to_kpkt(sph, sbuf, &dbuf_array,
			    &sbuf);
1846 | |
1847 | KPKTQ_ENQUEUE(&pkts, spkt); |
1848 | _UUID_COPY(spkt->pkt_flow_id, fe->fe_uuid); |
1849 | _UUID_COPY(spkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
1850 | spkt->pkt_policy_id = fe->fe_policy_id; |
1851 | spkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
1852 | spkt->pkt_transport_protocol = |
1853 | fe->fe_transport_protocol; |
			flow_agg_init_spkt(fsw, &fa, spkt, pkt);
1855 | } |
1856 | next: |
1857 | pkt_agg_log(pkt, kernproc, true); |
1858 | prev_csum_ok = csum_ok; |
1859 | KPKTQ_ENQUEUE(&disposed_pkts, pkt); |
1860 | } |
1861 | |
1862 | /* Free unused buflets */ |
1863 | STATS_ADD(fsws, FSW_STATS_RX_WASTED_BFLT, bh_cnt); |
1864 | while (bh_cnt > 0) { |
1865 | pp_free_buflet(dpp, (kern_buflet_t)(buf_arr[iter])); |
1866 | buf_arr[iter] = 0; |
1867 | bh_cnt--; |
1868 | iter++; |
1869 | } |
1870 | /* Finalize the last super packet */ |
1871 | if (sph != 0) { |
		finalize_super_packet(&spkt, &sph, &fa, &largest_spkt,
		    &spkts, bufcnt);
	}
	converge_aggregation_size(fe, largest_spkt);
1876 | DTRACE_SKYWALK1(aggr__spkt__count, uint16_t, spkts); |
1877 | if (__improbable(is_mbuf)) { |
1878 | STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2PKT, spkts); |
1879 | } else { |
1880 | STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2PKT, spkts); |
1881 | } |
1882 | FLOW_STATS_IN_ADD(fe, spackets, spkts); |
1883 | |
1884 | KPKTQ_FINI(&fe->fe_rx_pktq); |
1885 | KPKTQ_CONCAT(&fe->fe_rx_pktq, &pkts); |
1886 | KPKTQ_FINI(&pkts); |
1887 | |
	fsw_ring_enqueue_tail_drop(fsw, ring, &fe->fe_rx_pktq);
1889 | |
1890 | pp_free_pktq(&disposed_pkts); |
1891 | } |
1892 | |
/*
 * Compact an smbuf: free any zero-length mbufs left in the chain by the
 * batch allocation; returns true if at least one mbuf was freed.
 */
1894 | static bool |
1895 | _finalize_smbuf(struct mbuf *smbuf) |
1896 | { |
1897 | /* the 1st mbuf always contains something, so start with the 2nd one */ |
1898 | struct mbuf *m_chained = smbuf->m_next; |
1899 | struct mbuf *prev_m = smbuf; |
1900 | bool freed = false; |
1901 | |
1902 | while (m_chained != NULL) { |
1903 | if (m_chained->m_len != 0) { |
1904 | prev_m = m_chained; |
1905 | m_chained = m_chained->m_next; |
1906 | continue; |
1907 | } |
1908 | prev_m->m_next = m_chained->m_next; |
1909 | m_free(m_chained); |
1910 | m_chained = prev_m->m_next; |
1911 | freed = true; |
1912 | } |
1913 | return freed; |
1914 | } |
1915 | |
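/*
 * Rx TCP aggregation towards the host (BSD) stack. Source packets are
 * copied/aggregated into a chain of super mbufs (linked via m_nextpkt)
 * that is handed to fsw_host_sendup() at the end; compat input reuses
 * the attached mbufs instead of copying the payload.
 */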
1916 | SK_NO_INLINE_ATTRIBUTE |
1917 | static void |
1918 | flow_rx_agg_host(struct nx_flowswitch *fsw, struct flow_entry *fe, |
1919 | struct pktq *dropped_pkts, bool is_mbuf) |
1920 | { |
1921 | #define __RX_AGG_HOST_DROP_SOURCE_PACKET(_pkt) do { \ |
1922 | drop_packets++; \ |
1923 | drop_bytes += (_pkt)->pkt_length; \ |
1924 | KPKTQ_ENQUEUE(dropped_pkts, (_pkt)); \ |
1925 | (_pkt) = NULL; \ |
1926 | FLOW_AGG_CLEAR(&fa); \ |
1927 | prev_csum_ok = false; \ |
1928 | } while (0) |
	struct flow_agg fa; /* aggregation state */
1930 | FLOW_AGG_CLEAR(&fa); |
1931 | |
1932 | struct pktq disposed_pkts; /* done src packets */ |
1933 | KPKTQ_INIT(&disposed_pkts); |
1934 | |
1935 | struct __kern_packet *pkt, *tpkt; |
1936 | /* points to the first mbuf of chain */ |
1937 | struct mbuf *m_chain = NULL; |
	/* super mbuf; at loop end it points to the last packet in the chain */
1939 | struct mbuf *smbuf = NULL, *curr_m = NULL; |
1940 | bool prev_csum_ok = false, csum_ok, agg_ok; |
1941 | uint16_t smbufs = 0, smbuf_finalized = 0; |
1942 | uint32_t bytes = 0, rcvd_ulen = 0; |
1943 | uint32_t rcvd_packets = 0, rcvd_bytes = 0; /* raw packets & bytes */ |
1944 | uint32_t drop_packets = 0, drop_bytes = 0; /* dropped packets & bytes */ |
1945 | uint32_t largest_smbuf = 0; |
1946 | int err = 0; |
1947 | |
1948 | struct fsw_stats *fsws = &fsw->fsw_stats; |
1949 | bool is_ipv4 = (fe->fe_key.fk_ipver == IPVERSION); |
1950 | |
1951 | SK_LOG_VAR(uint64_t logflags = (SK_VERB_FSW | SK_VERB_RX)); |
1952 | |
1953 | /* state for mbuf batch alloc */ |
1954 | uint32_t mhead_cnt = 0; |
1955 | uint32_t mhead_bufsize = 0; |
	struct mbuf *mhead = NULL;
1957 | |
1958 | uint16_t l2len = KPKTQ_FIRST(&fe->fe_rx_pktq)->pkt_l2_len; |
1959 | |
1960 | SK_DF(logflags, "Rx input queue bytes %u" , fe->fe_rx_pktq_bytes); |
1961 | |
1962 | if (__probable(!is_mbuf)) { |
1963 | /* |
1964 | * Batch mbuf alloc is based on |
1965 | * convert_native_pkt_to_mbuf_chain |
1966 | */ |
1967 | if (__probable(fe->fe_rx_largest_size != 0 && |
1968 | NX_FSW_TCP_RX_AGG_ENABLED())) { |
1969 | unsigned int num_segs = 1; |
1970 | int pktq_len = KPKTQ_LEN(&fe->fe_rx_pktq); |
1971 | |
1972 | if (fe->fe_rx_largest_size <= MCLBYTES && |
1973 | fe->fe_rx_pktq_bytes / pktq_len <= MCLBYTES) { |
1974 | mhead_bufsize = MCLBYTES; |
1975 | } else if (fe->fe_rx_largest_size <= MBIGCLBYTES && |
1976 | fe->fe_rx_pktq_bytes / pktq_len <= MBIGCLBYTES) { |
1977 | mhead_bufsize = MBIGCLBYTES; |
1978 | } else if (fe->fe_rx_largest_size <= M16KCLBYTES && |
1979 | fe->fe_rx_pktq_bytes / pktq_len <= M16KCLBYTES) { |
1980 | mhead_bufsize = M16KCLBYTES; |
1981 | } else { |
1982 | mhead_bufsize = M16KCLBYTES * 2; |
1983 | num_segs = 2; |
1984 | } |
1985 | |
1986 | try_again: |
1987 | if (fe->fe_rx_pktq_bytes != 0) { |
				mhead_cnt = estimate_buf_cnt(fe, MCLBYTES,
				    mhead_bufsize);
1990 | } else { |
1991 | /* No payload, thus it's all small-sized ACKs/... */ |
1992 | mhead_bufsize = MHLEN; |
1993 | mhead_cnt = pktq_len; |
1994 | } |
1995 | |
1996 | mhead = m_allocpacket_internal(&mhead_cnt, |
1997 | mhead_bufsize, &num_segs, M_NOWAIT, 1, 0); |
1998 | |
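			/*
			 * On failure, step down to the next smaller cluster
			 * size and retry the batch allocation:
			 * 2 * 16K -> 16K -> MBIGCLBYTES -> MCLBYTES.
			 */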
1999 | if (mhead == NULL) { |
2000 | if (mhead_bufsize > M16KCLBYTES) { |
2001 | mhead_bufsize = M16KCLBYTES; |
2002 | num_segs = 1; |
2003 | goto try_again; |
2004 | } |
2005 | |
2006 | if (mhead_bufsize == M16KCLBYTES) { |
2007 | mhead_bufsize = MBIGCLBYTES; |
2008 | goto try_again; |
2009 | } |
2010 | |
2011 | if (mhead_bufsize == MBIGCLBYTES) { |
2012 | mhead_bufsize = MCLBYTES; |
2013 | goto try_again; |
2014 | } |
2015 | } |
2016 | } else { |
2017 | mhead = NULL; |
2018 | mhead_bufsize = mhead_cnt = 0; |
2019 | } |
2020 | SK_DF(logflags, "batch alloc'ed %u mbufs of size %u" , mhead_cnt, |
2021 | mhead_bufsize); |
2022 | } |
2023 | |
2024 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) { |
2025 | if (tpkt != NULL) { |
2026 | void *baddr; |
2027 | MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr); |
2028 | SK_PREFETCH(baddr, 0); |
2029 | } |
2030 | |
2031 | /* Validate l2 len, ip vers, is_mbuf */ |
2032 | ASSERT(pkt->pkt_l2_len == l2len); |
2033 | ASSERT(is_mbuf == !!(PKT_IS_MBUF(pkt))); |
2034 | ASSERT(fe->fe_key.fk_ipver == pkt->pkt_flow_ip_ver); |
2035 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
2036 | ASSERT((pkt->pkt_link_flags & PKT_LINKF_ETHFCS) == 0); |
2037 | ASSERT(!pkt->pkt_flow_ip_is_frag); |
2038 | ASSERT(pkt->pkt_flow_ip_proto == IPPROTO_TCP); |
2039 | |
2040 | csum_ok = false; |
2041 | agg_ok = false; |
2042 | /* |
2043 | * As we only agg packets with same hdr length, |
2044 | * leverage the pkt metadata |
2045 | */ |
2046 | uint32_t thlen = (pkt->pkt_flow_ip_hlen + |
2047 | pkt->pkt_flow_tcp_hlen); |
2048 | uint32_t plen = (thlen + pkt->pkt_flow_ulen); |
2049 | |
2050 | /* |
2051 | * Rather than calling flow_pkt_track() for each |
2052 | * packet here, we accumulate received packet stats |
2053 | * for the call to flow_track_stats() below. This |
2054 | * is because flow tracking is a no-op for traffic |
2055 | * that belongs to the host stack. |
2056 | */ |
2057 | rcvd_ulen += pkt->pkt_flow_ulen; |
2058 | rcvd_bytes += pkt->pkt_length; |
2059 | rcvd_packets++; |
2060 | |
2061 | KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt); |
2062 | fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen; |
2063 | |
2064 | /* packet is for BSD flow, create a mbuf chain */ |
2065 | uint32_t len = (l2len + plen); |
2066 | uint16_t data_csum = 0; |
2067 | struct mbuf *m; |
2068 | bool is_wake_pkt = false; |
2069 | if (__improbable(is_mbuf)) { |
2070 | m = pkt->pkt_mbuf; |
2071 | |
2072 | if (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT) { |
2073 | is_wake_pkt = true; |
2074 | } |
2075 | |
2076 | /* Detach mbuf from source pkt */ |
2077 | KPKT_CLEAR_MBUF_DATA(pkt); |
2078 | |
			uint32_t trailer = (m_pktlen(m) - len);
			ASSERT((uint32_t)m_pktlen(m) >= len);
2081 | /* Remove the trailer */ |
2082 | if (trailer > 0) { |
2083 | m_adj(m, -trailer); |
2084 | } |
2085 | /* attached mbuf is already allocated */ |
			csum_ok = mbuf_csum(pkt, m, is_ipv4, &data_csum);
2087 | } else { /* native */ |
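			/*
			 * Align the L3 header to a 4-byte boundary; e.g.
			 * with a (hypothetical) 14-byte Ethernet header,
			 * pad == 2.
			 */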
2088 | uint16_t pad = P2ROUNDUP(l2len, sizeof(uint32_t)) - |
2089 | l2len; |
2090 | uint32_t tot_len = (len + pad); |
2091 | /* remember largest aggregated packet size */ |
2092 | if (smbuf) { |
				/* add pad to account for L3 alignment padding */
2094 | if (largest_smbuf < |
2095 | (uint32_t)m_pktlen(smbuf) + pad) { |
2096 | largest_smbuf = (uint32_t)m_pktlen(smbuf) + pad; |
2097 | } |
2098 | } |
2099 | |
2100 | if ((pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
2101 | is_wake_pkt = true; |
2102 | } |
2103 | |
2104 | if (prev_csum_ok && curr_m) { |
2105 | ASSERT(fa.fa_smbuf == smbuf); |
2106 | ASSERT(!fa.fa_sobj_is_pkt); |
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2108 | |
2109 | if (agg_ok && |
2110 | M_TRAILINGSPACE(curr_m) >= plen - thlen) { |
2111 | /* |
2112 | * No need for a new mbuf, |
2113 | * just append to curr_m. |
2114 | */ |
					csum_ok = copy_pkt_csum_packed(pkt,
					    plen, NULL, is_ipv4, curr_m, NULL,
					    &data_csum, NULL);
2118 | |
2119 | if (!csum_ok) { |
2120 | STATS_INC(fsws, |
2121 | FSW_STATS_RX_AGG_BAD_CSUM); |
2122 | SK_ERR("Checksum for " |
2123 | "aggregation is wrong" ); |
2124 | DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail1); |
2125 | /* |
2126 | * Turns out, checksum is wrong! |
2127 | * Fallback to no-agg mode. |
2128 | */ |
						agg_ok = false;
2130 | } else { |
2131 | /* |
2132 | * We only added payload, |
2133 | * thus -thlen. |
2134 | */ |
2135 | bytes += (plen - thlen); |
						flow_agg_merge_hdr(&fa, pkt,
						    data_csum, fsws);
2138 | goto next; |
2139 | } |
2140 | } |
2141 | } |
2142 | |
2143 | /* |
2144 | * If the batch allocation returned partial success, |
2145 | * we try blocking allocation here again |
2146 | */ |
2147 | m = mhead; |
2148 | if (__improbable(m == NULL || |
2149 | tot_len > mhead_bufsize)) { |
2150 | unsigned int num_segs = 1; |
2151 | if (tot_len > M16KCLBYTES) { |
2152 | num_segs = 0; |
2153 | } |
2154 | |
2155 | ASSERT(mhead_cnt == 0 || mhead != NULL); |
				err = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
				    &num_segs, &m);
2158 | if (err != 0) { |
2159 | STATS_INC(fsws, |
2160 | FSW_STATS_RX_DROP_NOMEM_BUF); |
2161 | SK_ERR("mbuf alloc failed (err %d), " |
2162 | "maxchunks %d, len %d" , err, num_segs, |
2163 | tot_len); |
2164 | __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt); |
2165 | continue; |
2166 | } |
2167 | } else { |
2168 | ASSERT(mhead_cnt > 0); |
2169 | mhead = m->m_nextpkt; |
2170 | m->m_nextpkt = NULL; |
2171 | mhead_cnt--; |
2172 | } |
2173 | m->m_data += pad; |
2174 | m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *); |
2175 | |
2176 | /* |
2177 | * copy and checksum l3, l4 and payload |
2178 | * l2 header is copied later only if we |
2179 | * can't agg as an optimization |
2180 | */ |
2181 | m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS; |
2182 | _dbuf_array_t dbuf_array = {.dba_is_buflet = false}; |
2183 | if (agg_ok) { |
2184 | int added = 0, dbuf_idx = 0; |
2185 | struct mbuf *m_tmp = m; |
2186 | dbuf_array.dba_num_dbufs = 0; |
2187 | uint32_t m_chain_max_len = 0; |
2188 | while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) { |
2189 | dbuf_array.dba_mbuf[dbuf_idx] = m_tmp; |
2190 | dbuf_array.dba_num_dbufs += 1; |
2191 | m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp); |
2192 | m_tmp = m_tmp->m_next; |
2193 | dbuf_idx++; |
2194 | } |
2195 | ASSERT(m_tmp == NULL); |
2196 | |
				csum_ok = copy_pkt_csum_packed(pkt, plen,
				    &dbuf_array, is_ipv4, curr_m, NULL,
				    &data_csum, &added);
2200 | |
2201 | if (!csum_ok) { |
2202 | STATS_INC(fsws, |
2203 | FSW_STATS_RX_AGG_BAD_CSUM); |
2204 | SK_ERR("Checksum for aggregation " |
2205 | "on new mbuf is wrong" ); |
2206 | DTRACE_SKYWALK(aggr__host_packed_tcp_csum_fail2); |
2207 | agg_ok = false; |
2208 | goto non_agg; |
2209 | } |
2210 | |
2211 | /* |
2212 | * There was not enough space in curr_m, |
2213 | * thus we must have added to m->m_data. |
2214 | */ |
2215 | VERIFY(added > 0); |
2216 | VERIFY(m->m_len <= m->m_pkthdr.len && |
2217 | (uint32_t)m->m_pkthdr.len <= m_chain_max_len); |
2218 | |
2219 | /* |
2220 | * We account for whatever we added |
2221 | * to m later on, thus - added. |
2222 | */ |
2223 | bytes += plen - thlen - added; |
2224 | } else { |
2225 | non_agg: |
2226 | dbuf_array.dba_num_dbufs = 0; |
2227 | uint32_t m_chain_max_len = 0; |
2228 | struct mbuf *m_tmp = m; |
2229 | int dbuf_idx = 0; |
2230 | while (m_tmp != NULL && dbuf_idx < MAX_BUFLET_COUNT) { |
2231 | dbuf_array.dba_mbuf[dbuf_idx] = m_tmp; |
2232 | dbuf_array.dba_num_dbufs += 1; |
2233 | m_chain_max_len += (uint32_t)M_TRAILINGSPACE(m_tmp); |
2234 | m_tmp = m_tmp->m_next; |
2235 | dbuf_idx++; |
2236 | } |
2237 | ASSERT(m_tmp == NULL); |
2238 | |
2239 | m->m_len += l2len; |
2240 | m->m_pkthdr.len += l2len; |
				csum_ok = copy_pkt_csum(pkt, plen, &dbuf_array,
				    &data_csum, is_ipv4);
2243 | if (__improbable(!csum_ok)) { |
2244 | STATS_INC(fsws, FSW_STATS_RX_AGG_BAD_CSUM); |
2245 | SK_ERR("%d incorrect csum" , __LINE__); |
2246 | DTRACE_SKYWALK(aggr__host_tcp_csum_fail); |
2247 | } |
2248 | VERIFY(m->m_len <= m->m_pkthdr.len && |
2249 | (uint32_t)m->m_pkthdr.len <= m_chain_max_len); |
2250 | } |
2251 | |
2252 | STATS_INC(fsws, FSW_STATS_RX_COPY_PKT2MBUF); |
2253 | STATS_INC(fsws, FSW_STATS_RX_COPY_SUM); |
2254 | |
2255 | m->m_pkthdr.csum_rx_start = pkt->pkt_csum_rx_start_off; |
2256 | m->m_pkthdr.csum_rx_val = pkt->pkt_csum_rx_value; |
2257 | /* |
2258 | * Note that these flags have same value, |
2259 | * except PACKET_CSUM_PARTIAL |
2260 | */ |
2261 | m->m_pkthdr.csum_flags |= (pkt->pkt_csum_flags & |
2262 | PACKET_CSUM_RX_FLAGS); |
2263 | |
2264 | /* Set the rcvif */ |
2265 | m->m_pkthdr.rcvif = fsw->fsw_ifp; |
2266 | |
2267 | /* Make sure to propagate the wake pkt flag */ |
2268 | if (is_wake_pkt) { |
2269 | m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT; |
2270 | } |
2271 | } |
2272 | ASSERT(m != NULL); |
2273 | ASSERT((m->m_flags & M_PKTHDR) && m->m_pkthdr.pkt_hdr != NULL); |
2274 | ASSERT((m->m_flags & M_HASFCS) == 0); |
2275 | ASSERT(m->m_nextpkt == NULL); |
2276 | |
2277 | if (__improbable(is_mbuf)) { |
2278 | if ((uint32_t) m->m_len < (l2len + thlen)) { |
2279 | m = m_pullup(m, (l2len + thlen)); |
2280 | if (m == NULL) { |
2281 | STATS_INC(fsws, |
2282 | FSW_STATS_RX_DROP_NOMEM_BUF); |
2283 | SK_ERR("mbuf pullup failed (err %d)" , |
2284 | err); |
2285 | __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt); |
2286 | continue; |
2287 | } |
2288 | m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *); |
2289 | } |
2290 | if (prev_csum_ok && csum_ok) { |
2291 | ASSERT(fa.fa_smbuf == smbuf); |
				agg_ok = flow_agg_is_ok(&fa, pkt, fsws);
2293 | } |
2294 | } |
2295 | |
2296 | if (agg_ok) { |
2297 | ASSERT(is_wake_pkt == false); |
2298 | ASSERT(fa.fa_smbuf == smbuf); |
2299 | ASSERT(!fa.fa_sobj_is_pkt); |
2300 | if (__improbable(is_mbuf)) { |
2301 | bytes += (m_pktlen(m) - l2len); |
2302 | /* adjust mbuf by l2, l3 and l4 hdr */ |
2303 | m_adj(m, l2len + thlen); |
2304 | } else { |
2305 | bytes += m_pktlen(m); |
2306 | } |
2307 | |
2308 | m->m_flags &= ~M_PKTHDR; |
			flow_agg_merge_hdr(&fa, pkt, data_csum, fsws);
2310 | while (curr_m->m_next != NULL) { |
2311 | curr_m = curr_m->m_next; |
2312 | } |
2313 | curr_m->m_next = m; |
2314 | curr_m = m; |
2315 | m = NULL; |
2316 | } else { |
2317 | if ((uint32_t) m->m_len < l2len) { |
2318 | m = m_pullup(m, l2len); |
2319 | if (m == NULL) { |
2320 | STATS_INC(fsws, |
2321 | FSW_STATS_RX_DROP_NOMEM_BUF); |
2322 | SK_ERR("mbuf pullup failed (err %d)" , |
2323 | err); |
2324 | __RX_AGG_HOST_DROP_SOURCE_PACKET(pkt); |
2325 | continue; |
2326 | } |
2327 | m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *); |
2328 | } |
2329 | |
2330 | /* copy l2 header for native */ |
2331 | if (__probable(!is_mbuf)) { |
2332 | uint16_t llhoff = pkt->pkt_headroom; |
2333 | uint8_t *baddr; |
2334 | MD_BUFLET_ADDR_ABS(pkt, baddr); |
2335 | ASSERT(baddr != NULL); |
2336 | baddr += llhoff; |
				pkt_copy(baddr, m_mtod_current(m), l2len);
2338 | } |
2339 | /* adjust mbuf by l2 hdr */ |
2340 | m_adj(m, l2len); |
2341 | bytes += m_pktlen(m); |
2342 | |
2343 | /* |
2344 | * aggregated packets can be skipped by pktap because |
2345 | * the original pre-aggregated chain already passed through |
2346 | * pktap (see fsw_snoop()) before entering this function. |
2347 | */ |
2348 | m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP; |
2349 | |
2350 | if (m_chain == NULL) { |
2351 | /* this is the start of the chain */ |
2352 | m_chain = m; |
2353 | smbuf = m; |
2354 | curr_m = m; |
2355 | } else if (smbuf != NULL) { |
2356 | /* |
2357 | * set m to be next packet |
2358 | */ |
2359 | mbuf_agg_log(smbuf, kernproc, is_mbuf); |
2360 | smbuf->m_nextpkt = m; |
2361 | /* |
2362 | * Clean up (finalize) a smbuf only if it pre-allocated >1 segments, |
2363 | * which only happens when mhead_bufsize > M16KCLBYTES |
2364 | */ |
2365 | if (_finalize_smbuf(smbuf)) { |
2366 | FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF); |
2367 | } |
2368 | smbuf_finalized++; |
2369 | smbuf = m; |
2370 | curr_m = m; |
2371 | } else { |
2372 | VERIFY(0); |
2373 | } |
2374 | |
2375 | smbufs++; |
2376 | m = NULL; |
2377 | |
			flow_agg_init_smbuf(fsw, &fa, smbuf, pkt);
2379 | /* |
2380 | * if the super packet is an mbuf which can't accomodate |
2381 | * (sizeof(struct ip6_tcp_mask) in a single buffer then |
2382 | * do the aggregation check in slow path. |
2383 | * Note that an mbuf without cluster has only 80 bytes |
2384 | * available for data, sizeof(struct ip6_tcp_mask) is |
2385 | * also 80 bytes, so if the packet contains an |
2386 | * ethernet header, this mbuf won't be able to fully |
2387 | * contain "struct ip6_tcp_mask" data in a single |
2388 | * buffer. |
2389 | */ |
2390 | if (pkt->pkt_flow_ip_ver == IPV6_VERSION) { |
2391 | if (__improbable(smbuf->m_len < |
2392 | ((m_mtod_current(smbuf) - |
2393 | (caddr_t)(smbuf->m_pkthdr.pkt_hdr)) + |
2394 | MASK_SIZE))) { |
2395 | fa.fa_sobj_is_short = true; |
2396 | } |
2397 | } |
2398 | } |
2399 | next: |
2400 | pkt_agg_log(pkt, kernproc, true); |
2401 | prev_csum_ok = csum_ok; |
2402 | KPKTQ_ENQUEUE(&disposed_pkts, pkt); |
2403 | } |
2404 | |
2405 | KPKTQ_FINI(&fe->fe_rx_pktq); |
2406 | |
2407 | /* Free any leftover mbufs, true only for native */ |
2408 | if (__improbable(mhead != NULL)) { |
2409 | ASSERT(mhead_cnt != 0); |
2410 | STATS_ADD(fsws, FSW_STATS_RX_WASTED_MBUF, mhead_cnt); |
2411 | (void) m_freem_list(mhead); |
2412 | mhead = NULL; |
2413 | mhead_cnt = 0; |
2414 | } |
2415 | |
	converge_aggregation_size(fe, largest_smbuf);
2417 | |
2418 | if (smbufs > 0) { |
2419 | /* Last smbuf */ |
2420 | mbuf_agg_log(smbuf, kernproc, is_mbuf); |
2421 | SK_DF(logflags, "smbuf count %u" , smbufs); |
2422 | |
2423 | ASSERT(m_chain != NULL); |
2424 | ASSERT(smbuf != NULL); |
2425 | |
2426 | /* |
2427 | * If the last mbuf needs to be finalized (mhead_bufsize > M16KCLBYTES) |
2428 | * but is not (smbuf_finalized < smbuf), do it now. |
2429 | */ |
2430 | if (smbuf_finalized < smbufs && |
2431 | _finalize_smbuf(smbuf)) { |
2432 | FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF); |
2433 | } |
2434 | |
2435 | /* |
2436 | * Call fsw_host_sendup() with mbuf chain |
2437 | * directly. |
2438 | */ |
2439 | mchain_agg_log(m_chain, kernproc, is_mbuf); |
2440 | fsw_host_sendup(fsw->fsw_ifp, m_chain, smbuf, smbufs, bytes); |
2441 | |
2442 | if (__improbable(is_mbuf)) { |
2443 | STATS_ADD(fsws, FSW_STATS_RX_AGG_MBUF2MBUF, smbufs); |
2444 | } else { |
2445 | STATS_ADD(fsws, FSW_STATS_RX_AGG_PKT2MBUF, smbufs); |
2446 | } |
2447 | FLOW_STATS_IN_ADD(fe, spackets, smbufs); |
2448 | |
2449 | ASSERT((fe->fe_flags & FLOWENTF_TRACK) == 0); |
2450 | } |
2451 | |
2452 | /* record (raw) number of packets and bytes */ |
2453 | ASSERT((int)(rcvd_bytes - drop_bytes) >= 0); |
2454 | ASSERT((int)(rcvd_packets - drop_packets) >= 0); |
2455 | flow_track_stats(fe, (rcvd_bytes - drop_bytes), |
2456 | (rcvd_packets - drop_packets), (rcvd_ulen != 0), true); |
2457 | |
2458 | pp_free_pktq(&disposed_pkts); |
2459 | } |
2460 | |
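/*
 * Entry point for TCP Rx aggregation on a flow: dispatches to the host
 * (mbuf chain) or channel (super packet) variant, and falls back to the
 * regular Rx path when fragments are pending or host-path aggregation
 * is disabled.
 */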
2461 | void |
2462 | flow_rx_agg_tcp(struct nx_flowswitch *fsw, struct flow_entry *fe, |
2463 | uint32_t flags) |
2464 | { |
2465 | #pragma unused(flags) |
2466 | struct pktq dropped_pkts; |
2467 | bool is_mbuf; |
2468 | |
2469 | if (__improbable(fe->fe_rx_frag_count > 0)) { |
		dp_flow_rx_process(fsw, fe, 0);
2471 | return; |
2472 | } |
2473 | |
2474 | KPKTQ_INIT(&dropped_pkts); |
2475 | |
2476 | if (!dp_flow_rx_route_process(fsw, fe)) { |
2477 | SK_ERR("Rx route bad" ); |
2478 | fsw_snoop_and_dequeue(fe, target: &dropped_pkts, true); |
2479 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_RX_FLOW_NONVIABLE, |
2480 | KPKTQ_LEN(&dropped_pkts)); |
2481 | goto done; |
2482 | } |
2483 | |
2484 | is_mbuf = !!(PKT_IS_MBUF(KPKTQ_FIRST(&fe->fe_rx_pktq))); |
2485 | |
2486 | if (fe->fe_nx_port == FSW_VP_HOST) { |
2487 | boolean_t do_rx_agg; |
2488 | |
2489 | /* BSD flow */ |
2490 | if (sk_fsw_rx_agg_tcp_host != SK_FSW_RX_AGG_TCP_HOST_AUTO) { |
2491 | do_rx_agg = (sk_fsw_rx_agg_tcp_host == |
2492 | SK_FSW_RX_AGG_TCP_HOST_ON); |
2493 | } else { |
2494 | do_rx_agg = !dlil_has_ip_filter() && |
2495 | !dlil_has_if_filter(fsw->fsw_ifp); |
2496 | } |
2497 | if (__improbable(!do_rx_agg)) { |
2498 | fsw_host_rx(fsw, &fe->fe_rx_pktq); |
2499 | return; |
2500 | } |
2501 | if (__improbable(pktap_total_tap_count != 0)) { |
2502 | fsw_snoop(fsw, fe, true); |
2503 | } |
		flow_rx_agg_host(fsw, fe, &dropped_pkts, is_mbuf);
2505 | } else { |
2506 | /* channel flow */ |
2507 | if (__improbable(pktap_total_tap_count != 0)) { |
2508 | fsw_snoop(fsw, fe, true); |
2509 | } |
		flow_rx_agg_channel(fsw, fe, &dropped_pkts, is_mbuf);
2511 | } |
2512 | |
2513 | done: |
2514 | pp_free_pktq(&dropped_pkts); |
2515 | } |
2516 | |