/*
 * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <kern/zalloc.h>
#include <netinet/in.h>

#include <net/classq/classq.h>
#include <net/classq/if_classq.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <net/classq/classq_fq_codel.h>

static uint32_t flowq_size;			/* size of flowq */
static struct mcache *flowq_cache = NULL;	/* mcache for flowq */

#define	FQ_ZONE_MAX	(32 * 1024)	/* across all interfaces */

#define	DTYPE_NODROP	0	/* no drop */
#define	DTYPE_FORCED	1	/* a "forced" drop */
#define	DTYPE_EARLY	2	/* an "unforced" (early) drop */

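/*
 * One-time initialization for the fq_codel classq discipline: create the
 * mcache that backs all flow queue (fq_t) allocations. Subsequent calls
 * return early once the cache exists.
 */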
void
fq_codel_init(void)
{
	if (flowq_cache != NULL)
		return;

	flowq_size = sizeof (fq_t);
	flowq_cache = mcache_create("fq.flowq", flowq_size, sizeof (uint64_t),
	    0, MCR_SLEEP);
	if (flowq_cache == NULL) {
		panic("%s: failed to allocate flowq_cache", __func__);
		/* NOTREACHED */
	}
}

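/*
 * Give memory held by the flowq cache back to the system; a TRUE 'purge'
 * requests a full purge rather than a normal reap.
 */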
void
fq_codel_reap_caches(boolean_t purge)
{
	mcache_reap_now(flowq_cache, purge);
}

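/*
 * Allocate and zero-fill a flow queue of the given packet type. Returns
 * NULL if the allocation fails; for mbuf-based queues the embedded packet
 * queue is initialized here as well.
 */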
fq_t *
fq_alloc(classq_pkt_type_t ptype)
{
	fq_t *fq = NULL;
	fq = mcache_alloc(flowq_cache, MCR_SLEEP);
	if (fq == NULL) {
		log(LOG_ERR, "%s: unable to allocate from flowq_cache\n",
		    __func__);
		return (NULL);
	}

	bzero(fq, flowq_size);
	fq->fq_ptype = ptype;
	if (ptype == QP_MBUF) {
		MBUFQ_INIT(&fq->fq_mbufq);
	}
	return (fq);
}

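/*
 * Free a flow queue back to the cache. The caller must have already
 * drained it: it may not hold any packets or bytes, nor be linked on the
 * new/old flows lists.
 */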
void
fq_destroy(fq_t *fq)
{
	VERIFY(fq_empty(fq));
	VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)));
	VERIFY(fq->fq_bytes == 0);
	mcache_free(flowq_cache, fq);
}

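/*
 * Dequeue-stall detection: if a backlogged flow (at least
 * FQ_MIN_FC_THRESHOLD_BYTES queued) has not been serviced for a full
 * update interval since its last dequeue, mark it delay-high so that the
 * enqueue path can apply flow control or early drops.
 */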
static void
fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
    u_int64_t *now)
{
	u_int64_t maxgetqtime;
	if (FQ_IS_DELAYHIGH(flowq) || flowq->fq_getqtime == 0 ||
	    fq_empty(flowq) ||
	    flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES)
		return;
	maxgetqtime = flowq->fq_getqtime + fqs->fqs_update_interval;
	if ((*now) > maxgetqtime) {
		/*
		 * There was no dequeue for an entire update interval;
		 * the queue is considered stalled.
		 */
		FQ_SET_DELAY_HIGH(flowq);
		fq_cl->fcl_stat.fcl_dequeue_stall++;
	}
}

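/*
 * Drop a single packet from the head of the given flow queue, charging
 * the drop to the interface's classq statistics and freeing the packet.
 */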
void
fq_head_drop(fq_if_t *fqs, fq_t *fq)
{
	pktsched_pkt_t pkt;
	uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;
	struct ifclassq *ifq = fqs->fqs_ifq;

	_PKTSCHED_PKT_INIT(&pkt);
	if (fq_getq_flow_internal(fqs, fq, &pkt) == NULL)
		return;

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL);

	*pkt_timestamp = 0;
	if (pkt.pktsched_ptype == QP_MBUF)
		*pkt_flags &= ~PKTF_PRIV_GUARDED;

	IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
	IFCQ_CONVERT_LOCK(ifq);
	pktsched_free_pkt(&pkt);
}

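/*
 * Enqueue path. The packet is hashed to its flow queue and, when that
 * queue has been flagged delay-high, the code either issues a flow
 * control advisory (for flow-advisory capable TCP flows) or drops from
 * the head of the queue. When the classq is at its drop limit, a packet
 * is shed from the largest flow instead of tail-dropping. Returns a
 * CLASSQEQ_* code describing what happened to the packet.
 */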
int
fq_addq(fq_if_t *fqs, pktsched_pkt_t *pkt, fq_if_classq_t *fq_cl)
{
	int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS;
	u_int64_t now;
	fq_t *fq = NULL;
	uint64_t *pkt_timestamp;
	uint32_t *pkt_flags;
	uint32_t pkt_flowid, pkt_tx_start_seq;
	uint8_t pkt_proto, pkt_flowsrc;

	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
	    &pkt_flowsrc, &pkt_proto, &pkt_tx_start_seq);

	if (pkt->pktsched_ptype == QP_MBUF) {
		/* See comments in <rdar://problem/14040693> */
		VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
		*pkt_flags |= PKTF_PRIV_GUARDED;
	}

	if (*pkt_timestamp > 0) {
		now = *pkt_timestamp;
	} else {
		struct timespec now_ts;
		nanouptime(&now_ts);
		now = (now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;
		*pkt_timestamp = now;
	}

	/* find the flowq for this packet */
	fq = fq_if_hash_pkt(fqs, pkt_flowid, pktsched_get_pkt_svc(pkt),
	    now, TRUE, pkt->pktsched_ptype);
	if (fq == NULL) {
		/* drop the packet if we could not allocate a flow queue */
		fq_cl->fcl_stat.fcl_drop_memfailure++;
		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		return (CLASSQEQ_DROP);
	}
	VERIFY(fq->fq_ptype == pkt->pktsched_ptype);

	fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);

	if (FQ_IS_DELAYHIGH(fq)) {
		if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
		    (*pkt_flags & PKTF_FLOW_ADV)) {
			fc_adv = 1;
			/*
			 * If the flow is suspended or it is not
			 * TCP, drop the packet
			 */
			if (pkt_proto != IPPROTO_TCP) {
				droptype = DTYPE_EARLY;
				fq_cl->fcl_stat.fcl_drop_early++;
			}
		} else {
			/*
			 * Need to drop a packet; instead of dropping this
			 * one, try to drop from the head of the queue.
			 */
			if (!fq_empty(fq)) {
				fq_head_drop(fqs, fq);
				droptype = DTYPE_NODROP;
			} else {
				droptype = DTYPE_EARLY;
			}
			fq_cl->fcl_stat.fcl_drop_early++;
		}
	}

	/* Set the return code correctly */
	if (fc_adv == 1 && droptype != DTYPE_FORCED) {
		if (fq_if_add_fcentry(fqs, pkt, pkt_flowid, pkt_flowsrc,
		    fq_cl)) {
			fq->fq_flags |= FQF_FLOWCTL_ON;
			/* deliver flow control advisory error */
			if (droptype == DTYPE_NODROP) {
				ret = CLASSQEQ_SUCCESS_FC;
			} else {
				/* dropped due to flow control */
				ret = CLASSQEQ_DROP_FC;
			}
		} else {
			/*
			 * If we could not flow control the flow, it is
			 * better to drop.
			 */
			droptype = DTYPE_FORCED;
			ret = CLASSQEQ_DROP_FC;
			fq_cl->fcl_stat.fcl_flow_control_fail++;
		}
	}

	/*
	 * If the queue length hits the queue limit, drop a packet from the
	 * front of the queue of the flow with the maximum number of queued
	 * bytes. This penalizes heavy and unresponsive flows and avoids a
	 * tail drop.
	 */
	if (droptype == DTYPE_NODROP && fq_if_at_drop_limit(fqs)) {
		if (fqs->fqs_large_flow == fq) {
			/*
			 * Drop from the head of the current fq. Since a
			 * new packet will be added to the tail, it is ok
			 * to leave fq in place.
			 */
			fq_head_drop(fqs, fq);
		} else {
			if (fqs->fqs_large_flow == NULL) {
				droptype = DTYPE_FORCED;
				fq_cl->fcl_stat.fcl_drop_overflow++;
				ret = CLASSQEQ_DROP;

				/*
				 * If this fq was freshly created and there
				 * is nothing to enqueue, free it.
				 */
				if (fq_empty(fq) && !(fq->fq_flags &
				    (FQF_NEW_FLOW | FQF_OLD_FLOW))) {
					fq_if_destroy_flow(fqs, fq_cl, fq);
					fq = NULL;
				}
			} else {
				fq_if_drop_packet(fqs);
			}
		}
	}

	if (droptype == DTYPE_NODROP) {
		uint32_t pkt_len = pktsched_get_pkt_len(pkt);
		fq_enqueue(fq, pkt->pktsched_pkt);
		fq->fq_bytes += pkt_len;
		fq_cl->fcl_stat.fcl_byte_cnt += pkt_len;
		fq_cl->fcl_stat.fcl_pkt_cnt++;

		/*
		 * check if this queue will qualify to be the next
		 * victim queue
		 */
		fq_if_is_flow_heavy(fqs, fq);
	} else {
		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP);
	}

	/*
	 * If the queue is not currently active, add it to the end of new
	 * flows list for that service class.
	 */
	if ((fq->fq_flags & (FQF_NEW_FLOW|FQF_OLD_FLOW)) == 0) {
		VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL);
		STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink);
		fq->fq_flags |= FQF_NEW_FLOW;

		fq_cl->fcl_stat.fcl_newflows_cnt++;

		fq->fq_deficit = fq_cl->fcl_quantum;
	}
	return (ret);
}

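/*
 * Dequeue one packet from the flow queue and adjust the flow, class and
 * interface counters. Does not update any delay state; callers that need
 * CoDel-style delay tracking go through fq_getq_flow() instead.
 */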
void *
fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
	void *p;
	uint32_t plen;
	fq_if_classq_t *fq_cl;
	struct ifclassq *ifq = fqs->fqs_ifq;

	fq_dequeue(fq, p);
	if (p == NULL)
		return (NULL);

	pktsched_pkt_encap(pkt, fq->fq_ptype, p);
	plen = pktsched_get_pkt_len(pkt);

	VERIFY(fq->fq_bytes >= plen);
	fq->fq_bytes -= plen;

	fq_cl = &fqs->fqs_classq[fq->fq_sc_index];
	fq_cl->fcl_stat.fcl_byte_cnt -= plen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, plen);

	/* Reset getqtime so that we don't count idle times */
	if (fq_empty(fq))
		fq->fq_getqtime = 0;

	return (p);
}

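/*
 * Dequeue one packet and update the flow's delay state, CoDel style: the
 * packet's enqueue timestamp gives its queueing delay, the minimum delay
 * seen over the current update interval is compared against the target
 * delay, and the flow is marked or cleared delay-high accordingly. When
 * the delay-high condition clears (or the queue empties), any pending
 * flow control on the flow is released via fq_if_flow_feedback().
 */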
void *
fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
	void *p;
	fq_if_classq_t *fq_cl;
	u_int64_t now;
	int64_t qdelay = 0;
	struct timespec now_ts;
	uint32_t *pkt_flags, pkt_tx_start_seq;
	uint64_t *pkt_timestamp;

	p = fq_getq_flow_internal(fqs, fq, pkt);
	if (p == NULL)
		return (NULL);

	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, &pkt_tx_start_seq);

	nanouptime(&now_ts);
	now = (now_ts.tv_sec * NSEC_PER_SEC) + now_ts.tv_nsec;

	/* this will compute qdelay in nanoseconds */
	if (now > *pkt_timestamp)
		qdelay = now - *pkt_timestamp;
	fq_cl = &fqs->fqs_classq[fq->fq_sc_index];

	if (fq->fq_min_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay < fq->fq_min_qdelay))
		fq->fq_min_qdelay = qdelay;
	if (now >= fq->fq_updatetime) {
		if (fq->fq_min_qdelay > fqs->fqs_target_qdelay) {
			if (!FQ_IS_DELAYHIGH(fq))
				FQ_SET_DELAY_HIGH(fq);
		} else {
			FQ_CLEAR_DELAY_HIGH(fq);
		}

		/* Reset measured queue delay and update time */
		fq->fq_updatetime = now + fqs->fqs_update_interval;
		fq->fq_min_qdelay = 0;
	}
	if (!FQ_IS_DELAYHIGH(fq) || fq_empty(fq)) {
		FQ_CLEAR_DELAY_HIGH(fq);
		if (fq->fq_flags & FQF_FLOWCTL_ON) {
			fq_if_flow_feedback(fqs, fq, fq_cl);
		}
	}

	if (fq_empty(fq)) {
		/* Reset getqtime so that we don't count idle times */
		fq->fq_getqtime = 0;
	} else {
		fq->fq_getqtime = now;
	}
	fq_if_is_flow_heavy(fqs, fq);

	*pkt_timestamp = 0;
	if (pkt->pktsched_ptype == QP_MBUF)
		*pkt_flags &= ~PKTF_PRIV_GUARDED;

	return (p);
}