/*
 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#define CL_SKIP_ON(t) \
	if (__improbable(t)) { \
		SK_ERR("%d: skip " #t, __LINE__); \
		SK_ERR("%s %s", if_name(ifp), sk_dump("buf", \
		    pkt_buf + pkt->pkt_headroom, pkt->pkt_length, \
		    MIN(128, bdlen), NULL, 0)); \
		error = ENOTSUP; \
		goto done; \
	}
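
/*
 * CL_SKIP_ON() abandons classification: it logs the failed check along
 * with the head of the packet, then bails out with ENOTSUP, which (per
 * the function comment below) sends the packet down the lookup path
 * unclassified instead of dropping it.
 */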

#define CL_SKIP_L4() \
	do { \
		pkt->pkt_flow_ip_hlen = l3hlen; \
		pkt->pkt_flow_tcp_src = 0; \
		pkt->pkt_flow_tcp_dst = 0; \
		error = 0; \
		goto done; \
	} while (0)
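
/*
 * CL_SKIP_L4() ends parsing early with success once it is known that
 * no L4 header can be parsed: it records the L3 header length, zeroes
 * the TCP ports (which, per the _CASSERTs below, also clears the
 * aliased UDP ports) so no stale data leaks into __flow, and jumps
 * straight to done. Used for non-first fragments, whose L4 header
 * travels in the first fragment.
 */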

/*
 * Packet flow parser
 *
 * Parse a contiguous chunk of packet header fields.
 *
 * The idea here is that while we have the headers in the CPU cache,
 * do as much parsing as necessary and store the results in __flow.
 *
 * We assume that outbound packets from the host (BSD) stack never
 * get here, i.e. we only handle channel-based outbound traffic.
 *
 * @param pkt
 *      packet to be classified
 * @param ifp
 *      associated network interface
 * @param af
 *      address family
 * @param input
 *      true if the packet is inbound, false if outbound
 *
 * @return
 *      We return ENOTSUP to indicate that we can't classify the packet,
 *      and that the packet should still be forwarded to the lookup path.
 *      Any other non-zero value will cause the packet to be dropped.
 */
int
flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af,
    bool input)
{
	/* these begin at the same offset in the packet, hence the unions */
	union {
		volatile struct ip *_iph;
		volatile struct ip6_hdr *_ip6;
	} _l3;
#define iph _l3._iph
#define ip6 _l3._ip6
	union {
		volatile struct tcphdr *_tcph;
		volatile struct udphdr *_udph;
	} _l4;
#define tcph _l4._tcph
#define udph _l4._udph
	uint32_t mtu = ifp->if_mtu;

	size_t pkt_len;         /* remaining packet length left for parsing */
	uint32_t cls_len;

	/*
	 * These are lengths parsed from the packet header; they need to
	 * be validated incrementally, from L3 up to L4.
	 */
	uint8_t l3hlen = 0;     /* IP header length */
	uint16_t l3tlen = 0;    /* total length of IP packet */
	uint8_t l4hlen = 0;     /* TCP/UDP header length */
	uint16_t ulen = 0;      /* user data length */

	int error = 0;

	/* must be 16-byte aligned due to the use of sk_copy* below */
	_CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0);

	_CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8);
	_CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv4_addrs, _src)) ==
	    (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src)));

	_CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32);
	_CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv6_addrs, _src)) ==
	    (offsetof(struct ip6_hdr, ip6_dst) -
	    offsetof(struct ip6_hdr, ip6_src)));

	/* __flow_l4_tcp must mirror tcphdr for the first 16 bytes */
	_CASSERT(sizeof(struct __flow_l4_tcp) == 16);
	_CASSERT((offsetof(struct __flow_l4_tcp, _dst) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_dport) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _seq) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_seq) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _ack) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_ack) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _flags) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_flags) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _win) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_win) -
	    offsetof(struct tcphdr, th_sport)));

	/* ensure the same offsets are used for TCP and UDP */
	_CASSERT(sizeof(struct __flow_l4_udp) == 8);
	_CASSERT(offsetof(struct __flow, flow_tcp_src) ==
	    offsetof(struct __flow, flow_udp_src));
	_CASSERT(offsetof(struct __flow, flow_tcp_dst) ==
	    offsetof(struct __flow, flow_udp_dst));

	/* parsing starts at L3; count the SDU length after the L2 header */
	ASSERT(pkt->pkt_l2_len <= pkt->pkt_length);
	pkt_len = pkt->pkt_length - pkt->pkt_l2_len;

	/*
	 * We restrict the data length available for classification to
	 * the portion of the L3 datagram available in the first buflet.
	 *
	 * Note that compat netif sets the packet length and buflet data
	 * length metadata to the original length of the packet, although
	 * the actual buffer is limited to NETIF_COMPAT_BUF_SIZE (128 bytes).
	 */
	uint8_t *pkt_buf, *l3_hdr;
	uint32_t bdlen, bdlim, bdoff;

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	cls_len = bdlim - bdoff;
	cls_len -= pkt->pkt_l2_len;
	cls_len = (uint16_t)MIN(cls_len, pkt_len);
	VERIFY(pkt_len >= cls_len);
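	/*
	 * Worked example (sizes illustrative): with a 2048-byte buflet
	 * (bdlim 2048, bdoff 0) and a 14-byte Ethernet header, up to
	 * 2034 bytes of the L3 datagram are visible for classification.
	 * A 60-byte TCP/IPv4 SYN parses in full, while headers that
	 * extend past the first buflet of a multi-buflet packet cannot
	 * be classified.
	 */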

	/* takes care of the ip6 assignment too (see the l3 union) */
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
	iph = (volatile struct ip *)(void *)l3_hdr;

	VERIFY(af != AF_UNSPEC);

	pkt->pkt_flow_ip_ver = 0;

	/*
	 * This code is in the hot data path, so we try to be as efficient
	 * as possible, and hence the use of unrolled loads/stores.
	 */

	/***************** L3 header (IP/IPv6) *****************/
	switch (af) {
	case AF_INET:
		CL_SKIP_ON(cls_len < sizeof(struct ip));
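		/*
		 * ip_hl counts 32-bit words, hence the "<< 2" below:
		 * 5 is the minimum (a 20-byte header, no options) and
		 * 15 the maximum (60 bytes).
		 */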
		l3hlen = (uint8_t)(iph->ip_hl << 2);
		CL_SKIP_ON(l3hlen < sizeof(struct ip));
		CL_SKIP_ON(cls_len < l3hlen);

		/* don't allow outgoing channel-based packets with IP option(s) */
		CL_SKIP_ON(!input && l3hlen != sizeof(struct ip));

		l3tlen = ntohs(iph->ip_len);

		CL_SKIP_ON(l3tlen < l3hlen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON(iph->ip_v != IPVERSION);

		if (__probable(IS_P2ALIGNED(&iph->ip_src, 8))) {
			sk_copy64_8(__DECONST(uint64_t *, &iph->ip_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else if (IS_P2ALIGNED(&iph->ip_src, 4)) {
			sk_copy32_8(__DECONST(uint32_t *, &iph->ip_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else {
			bcopy(__DECONST(void *, &iph->ip_src),
			    (void *)&pkt->pkt_flow_ipv4_addrs,
			    sizeof(struct __flow_l3_ipv4_addrs));
		}
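		/*
		 * The tiers above copy ip_src and ip_dst (8 contiguous
		 * bytes, per the _CASSERTs at the top) in one shot:
		 * 64-bit loads when 8-byte aligned, 32-bit loads when
		 * only 4-byte aligned, bcopy() otherwise.
		 */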

		pkt->pkt_flow_ip_ver = IPVERSION;
		pkt->pkt_flow_ip_proto = iph->ip_p;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iph;

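		/*
		 * ip_off packs the flags RF (0x8000), DF (0x4000) and
		 * MF (0x2000) together with a 13-bit fragment offset in
		 * 8-byte units (IP_OFFMASK). Masking off DF and RF
		 * leaves MF plus the offset, so a non-zero result means
		 * this is a fragment; a non-zero offset then marks a
		 * non-first fragment.
		 */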
		if (__improbable(ntohs(iph->ip_off) & ~(IP_DF | IP_RF))) {
			pkt->pkt_flow_ip_is_frag = TRUE;
			pkt->pkt_flow_ip_frag_id = iph->ip_id;
			/* we only parse l4 in the 1st frag */
			if ((ntohs(iph->ip_off) & IP_OFFMASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
		}
		break;

	case AF_INET6:
		l3hlen = sizeof(struct ip6_hdr);
		CL_SKIP_ON(cls_len < l3hlen);

		l3tlen = l3hlen + ntohs(ip6->ip6_plen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION);

		if (__probable(IS_P2ALIGNED(&ip6->ip6_src, 8))) {
			sk_copy64_32(__DECONST(uint64_t *, &ip6->ip6_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else if (IS_P2ALIGNED(&ip6->ip6_src, 4)) {
			sk_copy32_32(__DECONST(uint32_t *, &ip6->ip6_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else {
			bcopy(__DECONST(void *, &ip6->ip6_src),
			    (void *)&pkt->pkt_flow_ipv6_addrs,
			    sizeof(struct __flow_l3_ipv6_addrs));
		}

		pkt->pkt_flow_ip_ver = IPV6_VERSION;
		pkt->pkt_flow_ip_proto = ip6->ip6_nxt;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)ip6;

		/* only parse the next immediate extension header for frags */
		if (__improbable(ip6->ip6_nxt == IPPROTO_FRAGMENT)) {
			volatile struct ip6_frag *ip6f;
			ip6f = (volatile struct ip6_frag *)(ip6 + 1);
			CL_SKIP_ON(cls_len < l3hlen + sizeof(struct ip6_frag));
			pkt->pkt_flow_ip_is_frag = TRUE;
			pkt->pkt_flow_ip_frag_id = ip6f->ip6f_ident;
			pkt->pkt_flow_ip_proto = ip6f->ip6f_nxt;
			l3hlen += sizeof(struct ip6_frag);
			CL_SKIP_ON(l3tlen < l3hlen);
			/* we only parse l4 in the 1st frag */
			if ((ip6f->ip6f_offlg & IP6F_OFF_MASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
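			/*
			 * ip6f_offlg holds the 13-bit fragment offset,
			 * two reserved bits, and the M ("more
			 * fragments") flag. Masking off the reserved
			 * bits leaves offset and M; if both are zero,
			 * this is an "atomic fragment" (RFC 6946): a
			 * packet carrying a fragment header without
			 * actually being fragmented.
			 */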
			/* process an atomic frag as a non-frag */
			if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
				pkt->pkt_flow_ip_is_frag = FALSE;
			}
		}
		break;

	default:
		error = ENOTSUP;
		goto done;
	}

	pkt->pkt_flow_ip_hlen = l3hlen;
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_TCP &&
	    pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		error = 0;
		goto done;
	}
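	/*
	 * Protocols other than TCP and UDP (e.g. ESP, ICMP) get no L4
	 * parsing here: they are classified on L3 information alone,
	 * and we return success with only the L3 fields populated.
	 */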

	/**************** L4 header (TCP/UDP) *****************/

	/* this takes care of the UDP header as well (see the l4 union) */
	tcph = __DECONST(volatile struct tcphdr *,
	    (volatile uint8_t *)iph + l3hlen);
	ulen = (l3tlen - l3hlen);
	if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*tcph)) ||
		    (ulen < sizeof(*tcph)));
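		/*
		 * Like ip_hl, th_off counts 32-bit words: 5 means a
		 * bare 20-byte TCP header, 15 the 60-byte maximum
		 * with options.
		 */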
		l4hlen = (uint8_t)(tcph->th_off << 2);
		CL_SKIP_ON(l4hlen < sizeof(*tcph));
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_tcp_hlen = l4hlen;
		pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcph;
	} else {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*udph)) ||
		    (ulen < sizeof(*udph)));
		l4hlen = sizeof(*udph);
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_udp_hlen = l4hlen;
		pkt->pkt_flow_udp_hdr = (mach_vm_address_t)udph;
	}

	if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		ulen -= l4hlen;
		pkt->pkt_flow_ulen = ulen;
	} else {
		/*
		 * We can't determine the user data length for a fragment
		 * until it is reassembled.
		 */
		pkt->pkt_flow_ulen = 0;
	}

	if (__probable(IS_P2ALIGNED(&tcph->th_sport, 4))) {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			sk_copy32_16(__DECONST(uint32_t *, &tcph->th_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_tcp_src);
		} else {
			sk_copy32_8(__DECONST(uint32_t *, &udph->uh_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_udp_src);
		}
	} else {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			bcopy(__DECONST(void *, &tcph->th_sport),
			    (void *)&pkt->pkt_flow_tcp,
			    sizeof(struct __flow_l4_tcp));
		} else {
			bcopy(__DECONST(void *, &udph->uh_sport),
			    (void *)&pkt->pkt_flow_udp,
			    sizeof(struct __flow_l4_udp));
		}
	}
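	/*
	 * Per the _CASSERTs at the top, __flow_l4_tcp mirrors the first
	 * 16 bytes of tcphdr (ports, sequence and ACK numbers, data
	 * offset/flags, window), so a single 16-byte copy captures them
	 * all; UDP needs only the 8 bytes covering its ports, length
	 * and checksum. bcopy() handles the unaligned case.
	 */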

	if (!input && pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
	    pkt->pkt_flow_ulen != 0) {
		/*
		 * Following the logic in tcp_output(), we mark
		 * this if the payload is non-zero; note that
		 * pkt_flow_tcp_seq is in network byte order.
		 */
		pkt->pkt_pflags |= PKT_F_START_SEQ;
	}
done:
	if (__probable(error == 0)) {
		SK_DF(SK_VERB_FLOW_CLASSIFY, "pkt_length %u l3_ip_len %u "
		    "l3_ip_ver 0x%x l3_proto %u l4_sport %u l4_dport %u",
		    pkt->pkt_length, l3tlen, pkt->pkt_flow_ip_ver,
		    pkt->pkt_flow_ip_proto, ntohs(pkt->pkt_flow_tcp_src),
		    ntohs(pkt->pkt_flow_tcp_dst));
		/* on output, trim metadata length if not the same as IP length */
		if (!input) {
			if (__improbable(pkt->pkt_length !=
			    (l3tlen + pkt->pkt_l2_len))) {
				SK_ERR("packet is too long (%u), trimming to "
				    "IP + L2 length (%d)", pkt->pkt_length,
				    l3tlen + pkt->pkt_l2_len);
				METADATA_SET_LEN(pkt, l3tlen + pkt->pkt_l2_len,
				    bdoff);
			}
			if (__improbable(((pkt->pkt_length > mtu) &&
			    (pkt->pkt_proto_seg_sz == 0)) ||
			    (pkt->pkt_proto_seg_sz > mtu))) {
				SK_ERR("dropped; length (%u) exceeds MTU (%d), "
				    "proto_seg_sz %d",
				    pkt->pkt_length, mtu,
				    pkt->pkt_proto_seg_sz);
				SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len,
				    128, NULL, 0));
				error = EMSGSIZE;
				goto fail;
			}
		}
		/*
		 * Mark QUM_F_FLOW_CLASSIFIED on the packet to indicate
		 * that the __flow structure now has valid info.
		 */
		pkt->pkt_qum_qflags |= QUM_F_FLOW_CLASSIFIED;
		return 0;
	}

fail:
	ASSERT(error != 0 && !(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED));
	KPKT_CLEAR_FLOW_ALL(pkt->pkt_flow);

	return error;
}