/*
 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

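/*
 * CL_SKIP_ON(t): if the sanity check `t' fails, log the condition along
 * with a hex dump of the packet, then bail out with ENOTSUP so that the
 * packet is still handed to the regular flow lookup path rather than
 * being dropped outright (see the @return note on flow_pkt_classify()).
 */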
#define CL_SKIP_ON(t)                                                   \
	if (__improbable(t)) {                                          \
	        SK_ERR("%d: skip " #t, __LINE__);                       \
	        SK_ERR("%s %s", if_name(ifp), sk_dump("buf",            \
	            pkt_buf + pkt->pkt_headroom, pkt->pkt_length,       \
	            MIN(128, bdlen), NULL, 0));                         \
	        error = ENOTSUP;                                        \
	        goto done;                                              \
	}

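/*
 * CL_SKIP_L4(): stop classifying at L3 and return success; record the
 * IP header length and zero the L4 ports (the TCP and UDP port fields
 * share the same offsets in __flow, per the _CASSERTs below).  Used for
 * fragments other than the first, which carry no L4 header.
 */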
#define CL_SKIP_L4()                                                    \
	do {                                                            \
	        pkt->pkt_flow_ip_hlen = l3hlen;                         \
	        pkt->pkt_flow_tcp_src = 0;                              \
	        pkt->pkt_flow_tcp_dst = 0;                              \
	        error = 0;                                              \
	        goto done;                                              \
	} while (0)

/*
 * Packet flow parser
 *
 * Parse a contiguous chunk of packet header fields.
 *
 * The idea here is that while we have the headers in the CPU cache,
 * do as much parsing as necessary and store the results in __flow.
 *
 * We assume that outbound packets from the host (BSD) stack never
 * get here, i.e. we only handle channel-based outbound traffic.
 *
 * @param pkt
 *	packet to be classified
 * @param ifp
 *	associated network interface
 * @param af
 *	address family
 * @param input
 *	true if the packet is inbound, false if it is outbound
 *
 * @return
 *	We return ENOTSUP to indicate that we can't classify the packet,
 *	and that the packet should still be forwarded to the lookup path.
 *	Any other non-zero value will cause the packet to be dropped.
 */
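/*
 * A sketch of how a caller might act on the return value (hypothetical,
 * for illustration only; not taken from the flowswitch sources):
 *
 *	int err = flow_pkt_classify(pkt, ifp, af, input);
 *	if (err == ENOTSUP) {
 *		... continue to the flow lookup path, unclassified ...
 *	} else if (err != 0) {
 *		... drop the packet ...
 *	}
 */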
int
flow_pkt_classify(struct __kern_packet *pkt, struct ifnet *ifp, sa_family_t af,
    bool input)
{
	/* these begin at the same offset in the packet, hence the unions */
	union {
		volatile struct ip *_iph;
		volatile struct ip6_hdr *_ip6;
	} _l3;
#define iph _l3._iph
#define ip6 _l3._ip6
	union {
		volatile struct tcphdr *_tcph;
		volatile struct udphdr *_udph;
	} _l4;
#define tcph _l4._tcph
#define udph _l4._udph
	uint32_t mtu = ifp->if_mtu;

	size_t pkt_len;         /* remaining packet length to parse */
	uint32_t cls_len;

	/*
	 * These lengths are parsed from the packet header and need to
	 * be validated incrementally from L3 to L4.
	 */
	uint8_t l3hlen = 0;     /* IP header length */
	uint16_t l3tlen = 0;    /* total length of IP packet */
	uint8_t l4hlen = 0;     /* TCP/UDP header length */
	uint16_t ulen = 0;      /* user data length */

	int error = 0;

	/* must be 16-byte aligned due to the use of sk_copy* below */
	_CASSERT((offsetof(struct __flow, flow_l3) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv4_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_ipv6_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_l4) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_tcp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_udp_src) % 16) == 0);
	_CASSERT((offsetof(struct __flow, flow_esp_spi) % 16) == 0);

	_CASSERT(sizeof(struct __flow_l3_ipv4_addrs) == 8);
	_CASSERT((offsetof(struct __flow_l3_ipv4_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv4_addrs, _src)) ==
	    (offsetof(struct ip, ip_dst) - offsetof(struct ip, ip_src)));

	_CASSERT(sizeof(struct __flow_l3_ipv6_addrs) == 32);
	_CASSERT((offsetof(struct __flow_l3_ipv6_addrs, _dst) -
	    offsetof(struct __flow_l3_ipv6_addrs, _src)) ==
	    (offsetof(struct ip6_hdr, ip6_dst) -
	    offsetof(struct ip6_hdr, ip6_src)));

	/* __flow_l4_tcp must mirror tcphdr for the first 16 bytes */
	_CASSERT(sizeof(struct __flow_l4_tcp) == 16);
	_CASSERT((offsetof(struct __flow_l4_tcp, _dst) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_dport) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _seq) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_seq) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _ack) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_ack) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _flags) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_flags) -
	    offsetof(struct tcphdr, th_sport)));
	_CASSERT((offsetof(struct __flow_l4_tcp, _win) -
	    offsetof(struct __flow_l4_tcp, _src)) ==
	    (offsetof(struct tcphdr, th_win) -
	    offsetof(struct tcphdr, th_sport)));

	/* ensure the same offsets are used for TCP and UDP */
	_CASSERT(sizeof(struct __flow_l4_udp) == 8);
	_CASSERT(offsetof(struct __flow, flow_tcp_src) ==
	    offsetof(struct __flow, flow_udp_src));
	_CASSERT(offsetof(struct __flow, flow_tcp_dst) ==
	    offsetof(struct __flow, flow_udp_dst));

	/* parsing starts at L3; count the SDU length after the L2 header */
	ASSERT(pkt->pkt_l2_len <= pkt->pkt_length);
	pkt_len = pkt->pkt_length - pkt->pkt_l2_len;

	/*
	 * We restrict the data length available for classification to the
	 * portion of the L3 datagram available in the first buflet.
	 *
	 * Note that compat netif sets the packet length and buflet data
	 * length metadata to the original length of the packet, although
	 * the actual buffer is limited to NETIF_COMPAT_BUF_SIZE (128 bytes).
	 */
	uint8_t *pkt_buf, *l3_hdr;
	uint32_t bdlen, bdlim, bdoff;

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	cls_len = bdlim - bdoff;
	cls_len -= pkt->pkt_l2_len;
	cls_len = (uint16_t)MIN(cls_len, pkt_len);
	VERIFY(pkt_len >= cls_len);

	/* this takes care of the ip6 assignment too (see the l3 union) */
	l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
	iph = (volatile struct ip *)(void *)l3_hdr;

	VERIFY(af != AF_UNSPEC);

	pkt->pkt_flow_ip_ver = 0;

	/*
	 * This code is in the hot data path, so we try to be as efficient
	 * as possible, hence the use of unrolled loads/stores.
	 */

	/***************** L3 header (IP/IPv6) *****************/
	switch (af) {
	case AF_INET:
		CL_SKIP_ON(cls_len < sizeof(struct ip));
		l3hlen = (uint8_t)(iph->ip_hl << 2);
		CL_SKIP_ON(l3hlen < sizeof(struct ip));
		CL_SKIP_ON(cls_len < l3hlen);

		/* don't allow outgoing channel-based packets with option(s) */
		CL_SKIP_ON(!input && l3hlen != sizeof(struct ip));

		l3tlen = ntohs(iph->ip_len);

		CL_SKIP_ON(l3tlen < l3hlen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON(iph->ip_v != IPVERSION);

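		/*
		 * Copy the IPv4 source/destination pair (8 bytes, per the
		 * _CASSERTs above) into __flow using the widest loads the
		 * source alignment permits; fall back to bcopy() only for
		 * unaligned headers.
		 */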
		if (__probable(IS_P2ALIGNED(&iph->ip_src, 8))) {
			sk_copy64_8(__DECONST(uint64_t *, &iph->ip_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else if (IS_P2ALIGNED(&iph->ip_src, 4)) {
			sk_copy32_8(__DECONST(uint32_t *, &iph->ip_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv4_src);
		} else {
			bcopy(__DECONST(void *, &iph->ip_src),
			    (void *)&pkt->pkt_flow_ipv4_addrs,
			    sizeof(struct __flow_l3_ipv4_addrs));
		}

		pkt->pkt_flow_ip_ver = IPVERSION;
		pkt->pkt_flow_ip_proto = iph->ip_p;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iph;

		if (__improbable(ntohs(iph->ip_off) & ~(IP_DF | IP_RF))) {
			pkt->pkt_flow_ip_is_frag = TRUE;
			pkt->pkt_flow_ip_frag_id = iph->ip_id;
			/* we only parse L4 in the first fragment */
			if ((ntohs(iph->ip_off) & IP_OFFMASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
		}
		break;

	case AF_INET6:
		l3hlen = sizeof(struct ip6_hdr);
		CL_SKIP_ON(cls_len < l3hlen);

		l3tlen = l3hlen + ntohs(ip6->ip6_plen);
		CL_SKIP_ON(pkt_len < l3tlen);
		CL_SKIP_ON((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION);

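		/*
		 * As in the IPv4 case, copy the 32 bytes of IPv6
		 * source/destination addresses with the widest aligned
		 * copy routine available.
		 */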
		if (__probable(IS_P2ALIGNED(&ip6->ip6_src, 8))) {
			sk_copy64_32(__DECONST(uint64_t *, &ip6->ip6_src),
			    (uint64_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else if (IS_P2ALIGNED(&ip6->ip6_src, 4)) {
			sk_copy32_32(__DECONST(uint32_t *, &ip6->ip6_src),
			    (uint32_t *)(void *)&pkt->pkt_flow_ipv6_src);
		} else {
			bcopy(__DECONST(void *, &ip6->ip6_src),
			    (void *)&pkt->pkt_flow_ipv6_addrs,
			    sizeof(struct __flow_l3_ipv6_addrs));
		}

		pkt->pkt_flow_ip_ver = IPV6_VERSION;
		pkt->pkt_flow_ip_proto = ip6->ip6_nxt;
		pkt->pkt_flow_ip_hdr = (mach_vm_address_t)ip6;

		/* only parse the next immediate extension header for frags */
		if (__improbable(ip6->ip6_nxt == IPPROTO_FRAGMENT)) {
			volatile struct ip6_frag *ip6f;
			ip6f = (volatile struct ip6_frag *)(ip6 + 1);
			CL_SKIP_ON(cls_len < l3hlen + sizeof(struct ip6_frag));
			pkt->pkt_flow_ip_is_frag = 1;
			pkt->pkt_flow_ip_frag_id = ip6f->ip6f_ident;
			pkt->pkt_flow_ip_proto = ip6f->ip6f_nxt;
			l3hlen += sizeof(struct ip6_frag);
			CL_SKIP_ON(l3tlen < l3hlen);
			/* we only parse L4 in the first fragment */
			if ((ip6f->ip6f_offlg & IP6F_OFF_MASK) != 0) {
				pkt->pkt_flow_ip_is_first_frag = FALSE;
				CL_SKIP_L4();
			} else {
				pkt->pkt_flow_ip_is_first_frag = TRUE;
			}
			/* process an atomic fragment as a non-fragment */
			if ((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) == 0) {
				pkt->pkt_flow_ip_is_frag = 0;
			}
		}
		break;

	default:
		error = ENOTSUP;
		goto done;
	}

	pkt->pkt_flow_ip_hlen = l3hlen;
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_TCP &&
	    pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		error = 0;
		goto done;
	}

	/**************** L4 header (TCP/UDP) *****************/

	/* this takes care of the UDP header as well (see the l4 union) */
	tcph = __DECONST(volatile struct tcphdr *,
	    (volatile uint8_t *)iph + l3hlen);
	ulen = (l3tlen - l3hlen);
	if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*tcph)) ||
		    (ulen < sizeof(*tcph)));
		l4hlen = (uint8_t)(tcph->th_off << 2);
		CL_SKIP_ON(l4hlen < sizeof(*tcph));
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_tcp_hlen = l4hlen;
		pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcph;
	} else {
		CL_SKIP_ON((cls_len < l3hlen + sizeof(*udph)) ||
		    (ulen < sizeof(*udph)));
		l4hlen = sizeof(*udph);
		CL_SKIP_ON(l4hlen > ulen);
		pkt->pkt_flow_udp_hlen = l4hlen;
		pkt->pkt_flow_udp_hdr = (mach_vm_address_t)udph;
	}

	if (__probable(!pkt->pkt_flow_ip_is_frag)) {
		ulen -= l4hlen;
		pkt->pkt_flow_ulen = ulen;
	} else {
		/*
		 * We can't determine the user data length for a fragment
		 * until it is reassembled.
		 */
		pkt->pkt_flow_ulen = 0;
	}

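	/*
	 * Mirror the leading L4 fields into __flow: the first 16 bytes of
	 * the TCP header (ports, sequence, ack, flags, window) or the
	 * 8 bytes of the UDP header, per the _CASSERTed layouts above.
	 */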
	if (__probable(IS_P2ALIGNED(&tcph->th_sport, 4))) {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			sk_copy32_16(__DECONST(uint32_t *, &tcph->th_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_tcp_src);
		} else {
			sk_copy32_8(__DECONST(uint32_t *, &udph->uh_sport),
			    (uint32_t *)(void *)&pkt->pkt_flow_udp_src);
		}
	} else {
		if (__probable(pkt->pkt_flow_ip_proto == IPPROTO_TCP)) {
			bcopy(__DECONST(void *, &tcph->th_sport),
			    (void *)&pkt->pkt_flow_tcp,
			    sizeof(struct __flow_l4_tcp));
		} else {
			bcopy(__DECONST(void *, &udph->uh_sport),
			    (void *)&pkt->pkt_flow_udp,
			    sizeof(struct __flow_l4_udp));
		}
	}

	if (!input && pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
	    pkt->pkt_flow_ulen != 0) {
		/*
		 * Following the logic in tcp_output(), we mark
		 * this if the payload is non-zero; note that
		 * pkt_flow_tcp_seq is in network byte order.
		 */
		pkt->pkt_pflags |= PKT_F_START_SEQ;
	}
done:
	if (__probable(error == 0)) {
		SK_DF(SK_VERB_FLOW_CLASSIFY, "pkt_length %u l3_ip_len %u "
		    "l3_ip_ver 0x%x l3_proto %u l4_sport %u l4_dport %u",
		    pkt->pkt_length, l3tlen, pkt->pkt_flow_ip_ver,
		    pkt->pkt_flow_ip_proto, ntohs(pkt->pkt_flow_tcp_src),
		    ntohs(pkt->pkt_flow_tcp_dst));
		/* on output, trim metadata length if not same as IP length */
		if (!input) {
			if (__improbable(pkt->pkt_length !=
			    (l3tlen + pkt->pkt_l2_len))) {
				SK_ERR("packet is too long (%u), trimming to "
				    "IP + L2 length (%d)", pkt->pkt_length,
				    l3tlen + pkt->pkt_l2_len);
				METADATA_SET_LEN(pkt,
				    l3tlen + pkt->pkt_l2_len, bdoff);
			}
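			/*
			 * Enforce the MTU on output: reject a packet whose
			 * length exceeds the MTU unless it carries a
			 * protocol segment size (i.e. it will be segmented
			 * later), and reject any segment size that itself
			 * exceeds the MTU.
			 */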
			if (__improbable(((pkt->pkt_length > mtu) &&
			    (pkt->pkt_proto_seg_sz == 0)) ||
			    (pkt->pkt_proto_seg_sz > mtu))) {
				SK_ERR("dropped; length (%u) exceeds MTU (%d)"
				    " proto_seg_sz %d",
				    pkt->pkt_length, mtu,
				    pkt->pkt_proto_seg_sz);
				SK_ERR("%s", sk_dump("buf", l3_hdr, cls_len,
				    128, NULL, 0));
				error = EMSGSIZE;
				goto fail;
			}
		}
		/*
		 * Mark QUM_F_FLOW_CLASSIFIED on the packet to indicate
		 * that the __flow structure has valid info now.
		 */
		pkt->pkt_qum_qflags |= QUM_F_FLOW_CLASSIFIED;
		return 0;
	}

fail:
	ASSERT(error != 0 && !(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED));
	KPKT_CLEAR_FLOW_ALL(pkt->pkt_flow);

	return error;
}