| 1 | /* |
| 2 | * Copyright (c) 2012-2021 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | /* |
| 30 | * Flow Control and Feedback Advisory |
| 31 | * |
| 32 | * Each mbuf that is being sent out through an interface is tagged with a |
| 33 | * unique 32-bit ID which will help to identify all the packets that belong |
| 34 | * to a particular flow at the interface layer. Packets carrying such ID |
| 35 | * would need to be marked with PKTF_FLOW_ID. Normally, this ID is computed |
| 36 | * by the module that generates the flow. There are 3 kinds of flow sources |
| 37 | * that are currently recognized: |
| 38 | * |
| 39 | * a. INPCB (INET/INET6 Protocol Control Block). When a socket is |
| 40 | * connected, the flow hash for the socket is computed and stored in |
| 41 | * the PCB. Further transmissions on the socket will cause the hash |
| 42 | * value to be carried within the mbuf as the flow ID. |
| 43 | * |
| 44 | * b. Interface. When an interface is attached, the flow hash for the |
| 45 | * interface is computed and stored in the ifnet. This value is |
| 46 | * normally ignored for most network drivers, except for those that |
| 47 | * reside atop another driver, e.g. a virtual interface performing |
| 48 | * encapsulation/encryption on the original packet and sending the |
| 49 | * newly-generated packet to another interface. Such interface needs |
| 50 | * to associate all generated packets with the interface flow hash |
| 51 | * value as the flow ID. |
| 52 | * |
| 53 | * c. PF (Packet Filter). When a packet goes through PF and it is not |
| 54 | * already associated with a flow ID, PF will compute a flow hash and |
| 55 | * store it in the packet as flow ID. When the packet is associated |
| 56 | * with a PF state, the state record will have the flow ID stored |
| 57 | * within, in order to avoid recalculating the flow hash. Although PF |
| 58 | * is capable of generating flow IDs, it does not participate in flow |
| 59 | * advisory, and therefore packets whose IDs are computed by PF will |
| 60 | * not have their PKTF_FLOW_ADV packet flag set. |
| 61 | * |
| 62 | * Activation of flow advisory mechanism is done by setting the PKTF_FLOW_ADV |
| 63 | * packet flag; because a flow ID is required, the mechanism will not take |
| 64 | * place unless PKTF_FLOW_ID is set as well. The packet must also carry one |
| 65 | * of the flow source types FLOWSRC_{INPCB,IFNET} in order to identify where |
| 66 | * the flow advisory notification should be delivered to. As noted above, |
| 67 | * FLOWSRC_PF does not participate in this mechanism. |
| 68 | * |
| 69 | * The classq module configured on the interface is responsible for exerting |
| 70 | * flow control to the upper layers. This occurs when the number of packets |
| 71 | * queued for a flow reaches a limit. The module generating the flow will |
| 72 | * cease transmission until further flow advisory notice, and the flow will |
| 73 | * be inserted into the classq's flow control list. |
| 74 | * |
| 75 | * When packets are dequeued from the classq and the number of packets for |
| 76 | * a flow goes below a limit, the classq will transfer its flow control list |
| 77 | * to the global fadv_list. This will then trigger the flow advisory thread |
| 78 | * to run, which will cause the flow source modules to be notified that data |
| 79 | * can now be generated for those previously flow-controlled flows. |
| 80 | */ |
| 81 | |
| 82 | #include <sys/param.h> |
| 83 | #include <sys/systm.h> |
| 84 | #include <sys/kernel.h> |
| 85 | #include <sys/mcache.h> /* for VERIFY() */ |
| 86 | #include <sys/mbuf.h> |
| 87 | #include <sys/proc_internal.h> |
| 88 | #include <sys/socketvar.h> |
| 89 | |
| 90 | #include <kern/assert.h> |
| 91 | #include <kern/thread.h> |
| 92 | #include <kern/locks.h> |
| 93 | #include <kern/zalloc.h> |
| 94 | |
| 95 | #include <netinet/in_pcb.h> |
| 96 | #include <net/flowadv.h> |
| 97 | #if SKYWALK |
| 98 | #include <skywalk/os_channel.h> |
| 99 | #endif /* SKYWALK */ |
| 100 | |
| 101 | /* Lock group and attribute for fadv_lock */ |
| 102 | static LCK_GRP_DECLARE(fadv_lock_grp, "fadv_lock" ); |
| 103 | static LCK_MTX_DECLARE(fadv_lock, &fadv_lock_grp); |
| 104 | |
| 105 | /* protected by fadv_lock */ |
| 106 | static STAILQ_HEAD(fadv_head, flowadv_fcentry) fadv_list = |
| 107 | STAILQ_HEAD_INITIALIZER(fadv_list); |
| 108 | static thread_t fadv_thread = THREAD_NULL; |
| 109 | static uint32_t fadv_active; |
| 110 | |
| 111 | #define FADV_CACHE_NAME "flowadv" /* cache name */ |
| 112 | |
| 113 | static int flowadv_thread_cont(int); |
| 114 | static void flowadv_thread_func(void *, wait_result_t); |
| 115 | |
| 116 | void |
| 117 | flowadv_init(void) |
| 118 | { |
| 119 | if (kernel_thread_start(continuation: flowadv_thread_func, NULL, new_thread: &fadv_thread) != |
| 120 | KERN_SUCCESS) { |
| 121 | panic("%s: couldn't create flow event advisory thread" , |
| 122 | __func__); |
| 123 | /* NOTREACHED */ |
| 124 | } |
| 125 | thread_deallocate(thread: fadv_thread); |
| 126 | } |
| 127 | |
| 128 | struct flowadv_fcentry * |
| 129 | flowadv_alloc_entry(int how) |
| 130 | { |
| 131 | return kalloc_type(struct flowadv_fcentry, how | Z_ZERO); |
| 132 | } |
| 133 | |
| 134 | void |
| 135 | flowadv_free_entry(struct flowadv_fcentry *fce) |
| 136 | { |
| 137 | kfree_type(struct flowadv_fcentry, fce); |
| 138 | } |
| 139 | |
| 140 | void |
| 141 | flowadv_add(struct flowadv_fclist *fcl) |
| 142 | { |
| 143 | if (STAILQ_EMPTY(fcl)) { |
| 144 | return; |
| 145 | } |
| 146 | |
| 147 | lck_mtx_lock_spin(lck: &fadv_lock); |
| 148 | |
| 149 | STAILQ_CONCAT(&fadv_list, fcl); |
| 150 | VERIFY(!STAILQ_EMPTY(&fadv_list)); |
| 151 | |
| 152 | if (!fadv_active && fadv_thread != THREAD_NULL) { |
| 153 | wakeup_one(chan: (caddr_t)&fadv_list); |
| 154 | } |
| 155 | |
| 156 | lck_mtx_unlock(lck: &fadv_lock); |
| 157 | } |
| 158 | |
| 159 | void |
| 160 | flowadv_add_entry(struct flowadv_fcentry *fce) |
| 161 | { |
| 162 | lck_mtx_lock_spin(lck: &fadv_lock); |
| 163 | STAILQ_INSERT_HEAD(&fadv_list, fce, fce_link); |
| 164 | VERIFY(!STAILQ_EMPTY(&fadv_list)); |
| 165 | |
| 166 | if (!fadv_active && fadv_thread != THREAD_NULL) { |
| 167 | wakeup_one(chan: (caddr_t)&fadv_list); |
| 168 | } |
| 169 | |
| 170 | lck_mtx_unlock(lck: &fadv_lock); |
| 171 | } |
| 172 | |
| 173 | static int |
| 174 | flowadv_thread_cont(int err) |
| 175 | { |
| 176 | #pragma unused(err) |
| 177 | for (;;) { |
| 178 | LCK_MTX_ASSERT(&fadv_lock, LCK_MTX_ASSERT_OWNED); |
| 179 | while (STAILQ_EMPTY(&fadv_list)) { |
| 180 | VERIFY(!fadv_active); |
| 181 | (void) msleep0(chan: &fadv_list, mtx: &fadv_lock, pri: (PSOCK | PSPIN), |
| 182 | wmesg: "flowadv_cont" , timo: 0, continuation: flowadv_thread_cont); |
| 183 | /* NOTREACHED */ |
| 184 | } |
| 185 | |
| 186 | fadv_active = 1; |
| 187 | for (;;) { |
| 188 | struct flowadv_fcentry *fce; |
| 189 | |
| 190 | VERIFY(!STAILQ_EMPTY(&fadv_list)); |
| 191 | fce = STAILQ_FIRST(&fadv_list); |
| 192 | STAILQ_REMOVE(&fadv_list, fce, |
| 193 | flowadv_fcentry, fce_link); |
| 194 | STAILQ_NEXT(fce, fce_link) = NULL; |
| 195 | |
| 196 | lck_mtx_unlock(lck: &fadv_lock); |
| 197 | |
| 198 | if (fce->fce_event_type == FCE_EVENT_TYPE_CONGESTION_EXPERIENCED) { |
| 199 | switch (fce->fce_flowsrc_type) { |
| 200 | case FLOWSRC_CHANNEL: |
| 201 | kern_channel_flowadv_report_ce_event(fce, fce->fce_ce_cnt, |
| 202 | fce->fce_pkts_since_last_report); |
| 203 | break; |
| 204 | case FLOWSRC_INPCB: |
| 205 | case FLOWSRC_IFNET: |
| 206 | case FLOWSRC_PF: |
| 207 | default: |
| 208 | break; |
| 209 | } |
| 210 | |
| 211 | goto next; |
| 212 | } |
| 213 | |
| 214 | switch (fce->fce_flowsrc_type) { |
| 215 | case FLOWSRC_INPCB: |
| 216 | inp_flowadv(fce->fce_flowid); |
| 217 | break; |
| 218 | |
| 219 | case FLOWSRC_IFNET: |
| 220 | #if SKYWALK |
| 221 | /* |
| 222 | * when using the flowID allocator, IPSec |
| 223 | * driver uses the "pkt_flowid" field in mbuf |
| 224 | * packet header for the globally unique flowID |
| 225 | * and the "pkt_mpriv_srcid" field carries the |
| 226 | * interface flow control id (if_flowhash). |
| 227 | * For IPSec flows, it is the IPSec driver |
| 228 | * network interface which is flow controlled, |
| 229 | * instead of the IPSec SA flow. |
| 230 | */ |
| 231 | ifnet_flowadv(fce->fce_flowsrc_token); |
| 232 | #else /* !SKYWALK */ |
| 233 | ifnet_flowadv(fce->fce_flowid); |
| 234 | #endif /* !SKYWALK */ |
| 235 | break; |
| 236 | |
| 237 | #if SKYWALK |
| 238 | case FLOWSRC_CHANNEL: |
| 239 | kern_channel_flowadv_clear(fce); |
| 240 | break; |
| 241 | #endif /* SKYWALK */ |
| 242 | |
| 243 | case FLOWSRC_PF: |
| 244 | default: |
| 245 | break; |
| 246 | } |
| 247 | next: |
| 248 | flowadv_free_entry(fce); |
| 249 | lck_mtx_lock_spin(lck: &fadv_lock); |
| 250 | |
| 251 | /* if there's no pending request, we're done */ |
| 252 | if (STAILQ_EMPTY(&fadv_list)) { |
| 253 | break; |
| 254 | } |
| 255 | } |
| 256 | fadv_active = 0; |
| 257 | } |
| 258 | } |
| 259 | |
| 260 | __dead2 |
| 261 | static void |
| 262 | flowadv_thread_func(void *v, wait_result_t w) |
| 263 | { |
| 264 | #pragma unused(v, w) |
| 265 | lck_mtx_lock(lck: &fadv_lock); |
| 266 | (void) msleep0(chan: &fadv_list, mtx: &fadv_lock, pri: (PSOCK | PSPIN), |
| 267 | wmesg: "flowadv" , timo: 0, continuation: flowadv_thread_cont); |
| 268 | /* |
| 269 | * msleep0() shouldn't have returned as PCATCH was not set; |
| 270 | * therefore assert in this case. |
| 271 | */ |
| 272 | lck_mtx_unlock(lck: &fadv_lock); |
| 273 | VERIFY(0); |
| 274 | } |
| 275 | |