1/*
2 * Copyright (c) 2009-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/systm.h>
30#include <sys/kernel.h>
31#include <sys/types.h>
32#include <sys/filedesc.h>
33#include <sys/file_internal.h>
34#include <sys/proc.h>
35#include <sys/socket.h>
36#include <sys/socketvar.h>
37#include <sys/errno.h>
38#include <sys/protosw.h>
39#include <sys/domain.h>
40#include <sys/mbuf.h>
41#include <sys/queue.h>
42#include <sys/sysctl.h>
43#include <sys/sysproto.h>
44
45#include <net/if.h>
46#include <net/if_var.h>
47#include <net/route.h>
48
49#include <netinet/in.h>
50#include <netinet/in_var.h>
51#include <netinet/in_pcb.h>
52#include <netinet/ip.h>
53#include <netinet/ip_var.h>
54#include <netinet/ip6.h>
55#include <netinet6/ip6_var.h>
56#include <netinet/udp.h>
57#include <netinet/udp_var.h>
58#include <netinet/tcp.h>
59#include <netinet/tcp_var.h>
60#include <netinet/tcp_cc.h>
61#include <netinet/lro_ext.h>
62#include <netinet/in_tclass.h>
63
/*
 * One entry of a DSCP to mbuf service class table.
 * (The "dcsp" spelling in the tag is historical; it is referenced by name
 * elsewhere in this file.)
 */
struct dcsp_msc_map {
	u_int8_t		dscp;	/* DSCP code point */
	mbuf_svc_class_t	msc;	/* mbuf service class paired with it */
};
/* Forward declarations of file-local helpers */
static inline int so_throttle_best_effort(struct socket *, struct ifnet *);
static void set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *, int);
static errno_t dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *, size_t,
    struct dcsp_msc_map *);
72
/*
 * Mutex used by this file. In the DEVELOPMENT/DEBUG code below it is
 * taken around every walk or modification of the per-process traffic
 * class list. The group/attribute pointers start out NULL here; they are
 * presumably set up by an init routine elsewhere in the file -- TODO confirm.
 */
static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */
static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */
decl_lck_mtx_data(static, tclass_lock_data);
static lck_mtx_t *tclass_lock = &tclass_lock_data;
78
/* Root of the net.qos sysctl subtree */
SYSCTL_NODE(_net, OID_AUTO, qos,
    CTLFLAG_RW|CTLFLAG_LOCKED, 0, "QoS");

/* net.qos.default_netsvctype_to_dscp_map: read/write structure served by a handler */
static int sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos, OID_AUTO, default_netsvctype_to_dscp_map,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, sysctl_default_netsvctype_to_dscp_map, "S", "");

/* net.qos.dscp_to_wifi_ac_map: read/write structure served by a handler */
static int sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos, OID_AUTO, dscp_to_wifi_ac_map,
    CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, sysctl_dscp_to_wifi_ac_map, "S", "");

/* net.qos.reset_dscp_to_wifi_ac_map: write-only integer served by a handler */
static int sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos, OID_AUTO, reset_dscp_to_wifi_ac_map,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
    0, 0, sysctl_reset_dscp_to_wifi_ac_map, "I", "");

/* net.qos.verbose: read/write integer, 0 by default */
int net_qos_verbose = 0;
SYSCTL_INT(_net_qos, OID_AUTO, verbose,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_verbose, 0, "");

/*
 * Fastlane QoS policy (net.qos.policy subtree):
 * By default, all apps are allowed to get the traffic class to DSCP mapping.
 */
SYSCTL_NODE(_net_qos, OID_AUTO, policy,
    CTLFLAG_RW|CTLFLAG_LOCKED, 0, "");

/*
 * net.qos.policy.restricted: while this is 0, so_set_default_traffic_class()
 * marks new PF_INET/PF_INET6 sockets with SOF1_QOSMARKING_ALLOWED.
 */
int net_qos_policy_restricted = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, restricted,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restricted, 0, "");

/* net.qos.policy.restrict_avapps: read/write integer, 0 by default */
int net_qos_policy_restrict_avapps = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, restrict_avapps,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restrict_avapps, 0, "");

/* net.qos.policy.wifi_enabled: read/write integer, 0 by default */
int net_qos_policy_wifi_enabled = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, "");

/* net.qos.policy.none_wifi_enabled: read/write integer, 0 by default */
int net_qos_policy_none_wifi_enabled = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, none_wifi_enabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_none_wifi_enabled, 0, "");

/* net.qos.policy.capable_enabled: read/write integer, 0 by default */
int net_qos_policy_capable_enabled = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, "");
127
/*
 * Socket traffic class from network service type.
 *
 * The table is indexed by the NET_SERVICE_TYPE_* value; callers in this
 * file validate the index with IS_VALID_NET_SERVICE_TYPE() before the
 * lookup. Note that both the SIG and VI service types map to SO_TC_VI.
 */
const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = {
	SO_TC_BE,	/* NET_SERVICE_TYPE_BE */
	SO_TC_BK_SYS,	/* NET_SERVICE_TYPE_BK */
	SO_TC_VI,	/* NET_SERVICE_TYPE_SIG */
	SO_TC_VI,	/* NET_SERVICE_TYPE_VI */
	SO_TC_VO,	/* NET_SERVICE_TYPE_VO */
	SO_TC_RV,	/* NET_SERVICE_TYPE_RV */
	SO_TC_AV,	/* NET_SERVICE_TYPE_AV */
	SO_TC_OAM,	/* NET_SERVICE_TYPE_OAM */
	SO_TC_RD	/* NET_SERVICE_TYPE_RD */
};
142
/*
 * DSCP mappings used for QoS Fastlane, keyed by network service type.
 * Each entry pairs a NET_SERVICE_TYPE_* value with the DSCP code point
 * assigned to it; there is one entry per network service type.
 */
static const
struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
	{ NET_SERVICE_TYPE_BE,		_DSCP_DF },
	{ NET_SERVICE_TYPE_BK,		_DSCP_AF11 },
	{ NET_SERVICE_TYPE_SIG,		_DSCP_CS3 },
	{ NET_SERVICE_TYPE_VI,		_DSCP_AF41 },
	{ NET_SERVICE_TYPE_VO,		_DSCP_EF },
	{ NET_SERVICE_TYPE_RV,		_DSCP_CS4 },
	{ NET_SERVICE_TYPE_AV,		_DSCP_AF31 },
	{ NET_SERVICE_TYPE_OAM,		_DSCP_CS2 },
	{ NET_SERVICE_TYPE_RD,		_DSCP_AF21 },
};
158
/*
 * Default DSCP map state for this file.
 * NOTE(review): not referenced in the portion of the file reviewed here;
 * presumably filled in from a netsvctype/DSCP map at init time -- confirm.
 */
static struct net_qos_dscp_map default_net_qos_dscp_map;

/*
 * Number of slots needed for an array indexed by DSCP value: one more
 * than the maximum because DSCP values start at zero.
 */
#define DSCP_ARRAY_SIZE (_MAX_DSCP + 1)
165
/*
 * The DSCP to UP mapping (via mbuf service class) for Wi-Fi that follows
 * is the mapping that is implemented at the 802.11 driver level when the
 * mbuf service class is MBUF_SC_BE.
 *
 * This clashes with the mapping recommended by the IETF document
 * draft-szigeti-tsvwg-ieee-802-11e-01.txt, but we keep it to maintain
 * binary compatibility. Applications should use the network service type
 * socket option to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS.
 *
 * The table lists DSCP values 0 through 63 in ascending order and is
 * terminated by an entry whose DSCP is 255 (not a valid DSCP) and whose
 * service class is MBUF_SC_UNSPEC.
 */
static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = {
	{ _DSCP_DF,	MBUF_SC_BE },	/* RFC 2474 Standard */
	{ 1,		MBUF_SC_BE },	/* */
	{ 2,		MBUF_SC_BE },	/* */
	{ 3,		MBUF_SC_BE },	/* */
	{ 4,		MBUF_SC_BE },	/* */
	{ 5,		MBUF_SC_BE },	/* */
	{ 6,		MBUF_SC_BE },	/* */
	{ 7,		MBUF_SC_BE },	/* */

	{ _DSCP_CS1,	MBUF_SC_BK },	/* RFC 3662 Low-Priority Data */
	{ 9,		MBUF_SC_BK },	/* */
	{ _DSCP_AF11,	MBUF_SC_BK },	/* RFC 2597 High-Throughput Data */
	{ 11,		MBUF_SC_BK },	/* */
	{ _DSCP_AF12,	MBUF_SC_BK },	/* RFC 2597 High-Throughput Data */
	{ 13,		MBUF_SC_BK },	/* */
	{ _DSCP_AF13,	MBUF_SC_BK },	/* RFC 2597 High-Throughput Data */
	{ 15,		MBUF_SC_BK },	/* */

	{ _DSCP_CS2,	MBUF_SC_BK },	/* RFC 4594 OAM */
	{ 17,		MBUF_SC_BK },	/* */
	{ _DSCP_AF21,	MBUF_SC_BK },	/* RFC 2597 Low-Latency Data */
	{ 19,		MBUF_SC_BK },	/* */
	{ _DSCP_AF22,	MBUF_SC_BK },	/* RFC 2597 Low-Latency Data */
	{ 21,		MBUF_SC_BK },	/* */
	{ _DSCP_AF23,	MBUF_SC_BK },	/* RFC 2597 Low-Latency Data */
	{ 23,		MBUF_SC_BK },	/* */

	{ _DSCP_CS3,	MBUF_SC_BE },	/* RFC 2474 Broadcast Video */
	{ 25,		MBUF_SC_BE },	/* */
	{ _DSCP_AF31,	MBUF_SC_BE },	/* RFC 2597 Multimedia Streaming */
	{ 27,		MBUF_SC_BE },	/* */
	{ _DSCP_AF32,	MBUF_SC_BE },	/* RFC 2597 Multimedia Streaming */
	{ 29,		MBUF_SC_BE },	/* */
	{ _DSCP_AF33,	MBUF_SC_BE },	/* RFC 2597 Multimedia Streaming */
	{ 31,		MBUF_SC_BE },	/* */

	{ _DSCP_CS4,	MBUF_SC_VI },	/* RFC 2474 Real-Time Interactive */
	{ 33,		MBUF_SC_VI },	/* */
	{ _DSCP_AF41,	MBUF_SC_VI },	/* RFC 2597 Multimedia Conferencing */
	{ 35,		MBUF_SC_VI },	/* */
	{ _DSCP_AF42,	MBUF_SC_VI },	/* RFC 2597 Multimedia Conferencing */
	{ 37,		MBUF_SC_VI },	/* */
	{ _DSCP_AF43,	MBUF_SC_VI },	/* RFC 2597 Multimedia Conferencing */
	{ 39,		MBUF_SC_VI },	/* */

	{ _DSCP_CS5,	MBUF_SC_VI },	/* RFC 2474 Signaling */
	{ 41,		MBUF_SC_VI },	/* */
	{ 42,		MBUF_SC_VI },	/* */
	{ 43,		MBUF_SC_VI },	/* */
	{ _DSCP_VA,	MBUF_SC_VI },	/* RFC 5865 VOICE-ADMIT */
	{ 45,		MBUF_SC_VI },	/* */
	{ _DSCP_EF,	MBUF_SC_VI },	/* RFC 3246 Telephony */
	{ 47,		MBUF_SC_VI },	/* */

	{ _DSCP_CS6,	MBUF_SC_VO },	/* Wi-Fi WMM Certification: Chariot */
	{ 49,		MBUF_SC_VO },	/* */
	{ 50,		MBUF_SC_VO },	/* */
	{ 51,		MBUF_SC_VO },	/* */
	{ 52,		MBUF_SC_VO },	/* Wi-Fi WMM Certification: Sigma */
	{ 53,		MBUF_SC_VO },	/* */
	{ 54,		MBUF_SC_VO },	/* */
	{ 55,		MBUF_SC_VO },	/* */

	{ _DSCP_CS7,	MBUF_SC_VO },	/* Wi-Fi WMM Certification: Chariot */
	{ 57,		MBUF_SC_VO },	/* */
	{ 58,		MBUF_SC_VO },	/* */
	{ 59,		MBUF_SC_VO },	/* */
	{ 60,		MBUF_SC_VO },	/* */
	{ 61,		MBUF_SC_VO },	/* */
	{ 62,		MBUF_SC_VO },	/* */
	{ 63,		MBUF_SC_VO },	/* */

	{ 255,		MBUF_SC_UNSPEC } /* invalid DSCP to mark last entry */
};
251
/*
 * mbuf service class per DSCP value for Wi-Fi; sized to hold one slot
 * for every DSCP value (see DSCP_ARRAY_SIZE).
 * NOTE(review): presumably populated from a dcsp_msc_map table by
 * set_dscp_to_wifi_ac_map() -- confirm against its definition.
 */
mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE];

/*
 * If there is no foreground activity on the interface for
 * TCP_BG_SWITCH_TIME seconds, the background connections can switch to
 * foreground TCP congestion control. The same window is used when looking
 * for recent realtime activity before throttling best-effort traffic.
 */
#define TCP_BG_SWITCH_TIME 2 /* seconds */
260
261#if (DEVELOPMENT || DEBUG)
262
/* Number of entries currently linked on tfp_head */
static int tfp_count = 0;

/*
 * List of per-process traffic class overrides. Entries keyed by pid are
 * inserted at the head and entries keyed by process name at the tail, so
 * a walk from the head meets pid entries first.
 */
static TAILQ_HEAD(, tclass_for_proc) tfp_head =
    TAILQ_HEAD_INITIALIZER(tfp_head);

/*
 * Traffic class / QoS marking override for a process, identified either
 * by pid or, when tfp_pid is -1, by process name.
 */
struct tclass_for_proc {
	TAILQ_ENTRY(tclass_for_proc)	tfp_link;	/* linkage on tfp_head */
	int		tfp_class;	/* SO_TC_* value; SO_TC_UNSPEC leaves the socket's class alone */
	pid_t		tfp_pid;	/* pid, or -1 for a name based entry */
	char		tfp_pname[(2 * MAXCOMLEN) + 1];	/* process name, compared when tfp_pid is -1 */
	u_int32_t	tfp_qos_mode;	/* QOS_MODE_MARKING_POLICY_* value */
};

/* Handlers for the SO_TCDBG_* debugging commands and their helpers */
static int get_pid_tclass(struct so_tcdbg *);
static int get_pname_tclass(struct so_tcdbg *);
static int set_pid_tclass(struct so_tcdbg *);
static int set_pname_tclass(struct so_tcdbg *);
static int flush_pid_tclass(struct so_tcdbg *);
static int purge_tclass_for_proc(void);
static int flush_tclass_for_proc(void);
static void set_tclass_for_curr_proc(struct socket *);
284
285/*
286 * Must be called with tclass_lock held
287 */
288static struct tclass_for_proc *
289find_tfp_by_pid(pid_t pid)
290{
291 struct tclass_for_proc *tfp;
292
293 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
294 if (tfp->tfp_pid == pid)
295 break;
296 }
297 return (tfp);
298}
299
300/*
301 * Must be called with tclass_lock held
302 */
303static struct tclass_for_proc *
304find_tfp_by_pname(const char *pname)
305{
306 struct tclass_for_proc *tfp;
307
308 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
309 if (strncmp(pname, tfp->tfp_pname,
310 sizeof (tfp->tfp_pname)) == 0)
311 break;
312 }
313 return (tfp);
314}
315
316__private_extern__ void
317set_tclass_for_curr_proc(struct socket *so)
318{
319 struct tclass_for_proc *tfp = NULL;
320 proc_t p = current_proc(); /* Not ref counted */
321 pid_t pid = proc_pid(p);
322 char *pname = proc_best_name(p);
323
324 lck_mtx_lock(tclass_lock);
325
326 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
327 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
328 strncmp(pname, tfp->tfp_pname,
329 sizeof (tfp->tfp_pname)) == 0)) {
330 if (tfp->tfp_class != SO_TC_UNSPEC)
331 so->so_traffic_class = tfp->tfp_class;
332
333 if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE)
334 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
335 else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE)
336 so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
337 break;
338 }
339 }
340
341 lck_mtx_unlock(tclass_lock);
342}
343
344/*
345 * Purge entries with PIDs of exited processes
346 */
347int
348purge_tclass_for_proc(void)
349{
350 int error = 0;
351 struct tclass_for_proc *tfp, *tvar;
352
353 lck_mtx_lock(tclass_lock);
354
355 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
356 proc_t p;
357
358 if (tfp->tfp_pid == -1)
359 continue;
360 if ((p = proc_find(tfp->tfp_pid)) == NULL) {
361 tfp_count--;
362 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
363
364 _FREE(tfp, M_TEMP);
365 } else {
366 proc_rele(p);
367 }
368 }
369
370 lck_mtx_unlock(tclass_lock);
371
372 return (error);
373}
374
375/*
376 * Remove one entry
377 * Must be called with tclass_lock held
378 */
379static void
380free_tclass_for_proc(struct tclass_for_proc *tfp)
381{
382 if (tfp == NULL)
383 return;
384 tfp_count--;
385 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
386 _FREE(tfp, M_TEMP);
387}
388
389/*
390 * Remove all entries
391 */
392int
393flush_tclass_for_proc(void)
394{
395 int error = 0;
396 struct tclass_for_proc *tfp, *tvar;
397
398 lck_mtx_lock(tclass_lock);
399
400 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
401 free_tclass_for_proc(tfp);
402 }
403
404 lck_mtx_unlock(tclass_lock);
405
406 return (error);
407
408}
409
410/*
411 * Must be called with tclass_lock held
412 */
413static struct tclass_for_proc *
414alloc_tclass_for_proc(pid_t pid, const char *pname)
415{
416 struct tclass_for_proc *tfp;
417
418 if (pid == -1 && pname == NULL)
419 return (NULL);
420
421 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
422 if (tfp == NULL)
423 return (NULL);
424
425 tfp->tfp_pid = pid;
426 /*
427 * Add per pid entries before per proc name so we can find
428 * a specific instance of a process before the general name base entry.
429 */
430 if (pid != -1) {
431 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
432 } else {
433 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
434 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
435 }
436
437 tfp_count++;
438
439 return (tfp);
440}
441
442/*
443 * SO_TC_UNSPEC for tclass means to remove the entry
444 */
445int
446set_pid_tclass(struct so_tcdbg *so_tcdbg)
447{
448 int error = EINVAL;
449 proc_t p = NULL;
450 struct filedesc *fdp;
451 struct fileproc *fp;
452 struct tclass_for_proc *tfp;
453 int i;
454 pid_t pid = so_tcdbg->so_tcdbg_pid;
455 int tclass = so_tcdbg->so_tcdbg_tclass;
456 int netsvctype = so_tcdbg->so_tcdbg_netsvctype;
457
458 p = proc_find(pid);
459 if (p == NULL) {
460 printf("%s proc_find(%d) failed\n", __func__, pid);
461 goto done;
462 }
463
464 /* Need a tfp */
465 lck_mtx_lock(tclass_lock);
466
467 tfp = find_tfp_by_pid(pid);
468 if (tfp == NULL) {
469 tfp = alloc_tclass_for_proc(pid, NULL);
470 if (tfp == NULL) {
471 lck_mtx_unlock(tclass_lock);
472 error = ENOBUFS;
473 goto done;
474 }
475 }
476 tfp->tfp_class = tclass;
477 tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
478
479 lck_mtx_unlock(tclass_lock);
480
481 if (tfp != NULL) {
482 proc_fdlock(p);
483
484 fdp = p->p_fd;
485 for (i = 0; i < fdp->fd_nfiles; i++) {
486 struct socket *so;
487
488 fp = fdp->fd_ofiles[i];
489 if (fp == NULL ||
490 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
491 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
492 continue;
493
494 so = (struct socket *)fp->f_fglob->fg_data;
495 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
496 continue;
497
498 socket_lock(so, 1);
499 if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE)
500 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
501 else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE)
502 so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
503 socket_unlock(so, 1);
504
505 if (netsvctype != _NET_SERVICE_TYPE_UNSPEC)
506 error = sock_setsockopt(so, SOL_SOCKET,
507 SO_NET_SERVICE_TYPE, &netsvctype, sizeof(int));
508 if (tclass != SO_TC_UNSPEC)
509 error = sock_setsockopt(so, SOL_SOCKET,
510 SO_TRAFFIC_CLASS, &tclass, sizeof(int));
511
512 }
513
514 proc_fdunlock(p);
515 }
516
517 error = 0;
518done:
519 if (p != NULL)
520 proc_rele(p);
521
522 return (error);
523}
524
525int
526set_pname_tclass(struct so_tcdbg *so_tcdbg)
527{
528 int error = EINVAL;
529 struct tclass_for_proc *tfp;
530
531 lck_mtx_lock(tclass_lock);
532
533 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
534 if (tfp == NULL) {
535 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
536 if (tfp == NULL) {
537 lck_mtx_unlock(tclass_lock);
538 error = ENOBUFS;
539 goto done;
540 }
541 }
542 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
543 tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
544
545 lck_mtx_unlock(tclass_lock);
546
547 error = 0;
548done:
549
550 return (error);
551}
552
553static int
554flush_pid_tclass(struct so_tcdbg *so_tcdbg)
555{
556 pid_t pid = so_tcdbg->so_tcdbg_pid;
557 int tclass = so_tcdbg->so_tcdbg_tclass;
558 struct filedesc *fdp;
559 int error = EINVAL;
560 proc_t p;
561 int i;
562
563 p = proc_find(pid);
564 if (p == PROC_NULL) {
565 printf("%s proc_find(%d) failed\n", __func__, pid);
566 goto done;
567 }
568
569 proc_fdlock(p);
570 fdp = p->p_fd;
571 for (i = 0; i < fdp->fd_nfiles; i++) {
572 struct socket *so;
573 struct fileproc *fp;
574
575 fp = fdp->fd_ofiles[i];
576 if (fp == NULL ||
577 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
578 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
579 continue;
580
581 so = (struct socket *)fp->f_fglob->fg_data;
582 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
583 sizeof (tclass));
584 if (error != 0) {
585 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
586 "tclass=%d) failed %d\n", __func__,
587 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass,
588 error);
589 error = 0;
590 }
591 }
592 proc_fdunlock(p);
593
594 error = 0;
595done:
596 if (p != PROC_NULL)
597 proc_rele(p);
598
599 return (error);
600}
601
602int
603get_pid_tclass(struct so_tcdbg *so_tcdbg)
604{
605 int error = EINVAL;
606 proc_t p = NULL;
607 struct tclass_for_proc *tfp;
608 pid_t pid = so_tcdbg->so_tcdbg_pid;
609
610 so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
611
612 p = proc_find(pid);
613 if (p == NULL) {
614 printf("%s proc_find(%d) failed\n", __func__, pid);
615 goto done;
616 }
617
618 /* Need a tfp */
619 lck_mtx_lock(tclass_lock);
620
621 tfp = find_tfp_by_pid(pid);
622 if (tfp != NULL) {
623 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
624 so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
625 error = 0;
626 }
627 lck_mtx_unlock(tclass_lock);
628done:
629 if (p != NULL)
630 proc_rele(p);
631
632 return (error);
633}
634
635int
636get_pname_tclass(struct so_tcdbg *so_tcdbg)
637{
638 int error = EINVAL;
639 struct tclass_for_proc *tfp;
640
641 so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
642
643 /* Need a tfp */
644 lck_mtx_lock(tclass_lock);
645
646 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
647 if (tfp != NULL) {
648 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
649 so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
650 error = 0;
651 }
652 lck_mtx_unlock(tclass_lock);
653
654 return (error);
655}
656
657static int
658delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
659{
660 int error = EINVAL;
661 pid_t pid = so_tcdbg->so_tcdbg_pid;
662 struct tclass_for_proc *tfp = NULL;
663
664 lck_mtx_lock(tclass_lock);
665
666 if (pid != -1)
667 tfp = find_tfp_by_pid(pid);
668 else
669 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
670
671 if (tfp != NULL) {
672 free_tclass_for_proc(tfp);
673 error = 0;
674 }
675
676 lck_mtx_unlock(tclass_lock);
677
678 return (error);
679}
680
681/*
682 * Setting options requires privileges
683 */
684__private_extern__ int
685so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
686{
687 int error = 0;
688
689 if ((so->so_state & SS_PRIV) == 0)
690 return (EPERM);
691
692 socket_unlock(so, 0);
693
694 switch (so_tcdbg->so_tcdbg_cmd) {
695 case SO_TCDBG_PID:
696 error = set_pid_tclass(so_tcdbg);
697 break;
698
699 case SO_TCDBG_PNAME:
700 error = set_pname_tclass(so_tcdbg);
701 break;
702
703 case SO_TCDBG_PURGE:
704 error = purge_tclass_for_proc();
705 break;
706
707 case SO_TCDBG_FLUSH:
708 error = flush_tclass_for_proc();
709 break;
710
711 case SO_TCDBG_DELETE:
712 error = delete_tclass_for_pid_pname(so_tcdbg);
713 break;
714
715 case SO_TCDBG_TCFLUSH_PID:
716 error = flush_pid_tclass(so_tcdbg);
717 break;
718
719 default:
720 error = EINVAL;
721 break;
722 }
723
724 socket_lock(so, 0);
725
726 return (error);
727}
728
/*
 * Handle a "get" style SO_TCDBG_* command.
 *
 * Not required to be privileged to get.
 *
 * The request is copied in from `sopt' as a struct so_tcdbg and its
 * so_tcdbg_cmd field selects what is reported:
 *  - SO_TCDBG_PID / SO_TCDBG_PNAME: the entry for one pid / process name;
 *  - SO_TCDBG_COUNT: the number of entries;
 *  - SO_TCDBG_LIST: an array of struct so_tcdbg, one per entry.
 *
 * The socket lock is dropped while the command runs and re-taken before
 * the result is copied out. Returns 0 or an errno value (EINVAL for an
 * unknown command, a missing entry or an empty list; ENOBUFS when the
 * list buffer cannot be allocated; or the copyin/copyout error).
 */
__private_extern__ int
sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
{
	int error = 0;
	struct so_tcdbg so_tcdbg;
	void *buf = NULL;
	size_t len = sopt->sopt_valsize;

	error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
	    sizeof (struct so_tcdbg));
	if (error != 0)
		return (error);

	/* Put back the caller's buffer size saved before the copyin */
	sopt->sopt_valsize = len;

	socket_unlock(so, 0);

	switch (so_tcdbg.so_tcdbg_cmd) {
	case SO_TCDBG_PID:
		error = get_pid_tclass(&so_tcdbg);
		break;

	case SO_TCDBG_PNAME:
		error = get_pname_tclass(&so_tcdbg);
		break;

	case SO_TCDBG_COUNT:
		lck_mtx_lock(tclass_lock);
		so_tcdbg.so_tcdbg_count = tfp_count;
		lck_mtx_unlock(tclass_lock);
		break;

	case SO_TCDBG_LIST: {
		struct tclass_for_proc *tfp;
		int n, alloc_count;
		struct so_tcdbg *ptr;

		/* Sample the entry count to size the output buffer */
		lck_mtx_lock(tclass_lock);
		if ((alloc_count = tfp_count) == 0) {
			lck_mtx_unlock(tclass_lock);
			error = EINVAL;
			break;
		}
		len = alloc_count * sizeof (struct so_tcdbg);
		lck_mtx_unlock(tclass_lock);

		/* Allocate (may block) without holding the list lock */
		buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
		if (buf == NULL) {
			error = ENOBUFS;
			break;
		}

		/*
		 * The list may have changed while the lock was dropped:
		 * never write more than alloc_count records; if the list
		 * shrank, the remaining records stay zero-filled.
		 */
		lck_mtx_lock(tclass_lock);
		n = 0;
		ptr = (struct so_tcdbg *)buf;
		TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
			if (++n > alloc_count)
				break;
			if (tfp->tfp_pid != -1) {
				ptr->so_tcdbg_cmd = SO_TCDBG_PID;
				ptr->so_tcdbg_pid = tfp->tfp_pid;
			} else {
				ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
				ptr->so_tcdbg_pid = -1;
				strlcpy(ptr->so_tcdbg_pname,
				    tfp->tfp_pname,
				    sizeof (ptr->so_tcdbg_pname));
			}
			ptr->so_tcdbg_tclass = tfp->tfp_class;
			ptr->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
			ptr++;
		}

		lck_mtx_unlock(tclass_lock);
		}
		break;

	default:
		error = EINVAL;
		break;
	}

	socket_lock(so, 0);

	/*
	 * Copy out either the single updated request or the list buffer.
	 * buf is only non-NULL when the list was built successfully, so it
	 * is always released here.
	 */
	if (error == 0) {
		if (buf == NULL) {
			error = sooptcopyout(sopt, &so_tcdbg,
			    sizeof (struct so_tcdbg));
		} else {
			error = sooptcopyout(sopt, buf, len);
			_FREE(buf, M_TEMP);
		}
	}
	return (error);
}
827
828#endif /* (DEVELOPMENT || DEBUG) */
829
830int
831so_get_netsvc_marking_level(struct socket *so)
832{
833 int marking_level = NETSVC_MRKNG_UNKNOWN;
834 struct ifnet *ifp = NULL;
835
836 switch (SOCK_DOM(so)) {
837 case PF_INET: {
838 struct inpcb *inp = sotoinpcb(so);
839
840 if (inp != NULL)
841 ifp = inp->inp_last_outifp;
842 break;
843 }
844 case PF_INET6: {
845 struct in6pcb *in6p = sotoin6pcb(so);
846
847 if (in6p != NULL)
848 ifp = in6p->in6p_last_outifp;
849 break;
850 }
851 default:
852 break;
853 }
854 if (ifp != NULL) {
855 if ((ifp->if_eflags &
856 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
857 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
858 if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED))
859 marking_level = NETSVC_MRKNG_LVL_L3L2_ALL;
860 else
861 marking_level = NETSVC_MRKNG_LVL_L3L2_BK;
862 } else {
863 marking_level = NETSVC_MRKNG_LVL_L2;
864 }
865 }
866 return (marking_level);
867}
868
869__private_extern__ int
870so_set_traffic_class(struct socket *so, int optval)
871{
872 int error = 0;
873
874 if (optval < SO_TC_BE || optval > SO_TC_CTL) {
875 error = EINVAL;
876 } else {
877 switch (optval) {
878 case _SO_TC_BK:
879 optval = SO_TC_BK;
880 break;
881 case _SO_TC_VI:
882 optval = SO_TC_VI;
883 break;
884 case _SO_TC_VO:
885 optval = SO_TC_VO;
886 break;
887 default:
888 if (!SO_VALID_TC(optval))
889 error = EINVAL;
890 break;
891 }
892
893 if (error == 0) {
894 int oldval = so->so_traffic_class;
895
896 VERIFY(SO_VALID_TC(optval));
897 so->so_traffic_class = optval;
898
899 if ((SOCK_DOM(so) == PF_INET ||
900 SOCK_DOM(so) == PF_INET6) &&
901 SOCK_TYPE(so) == SOCK_STREAM)
902 set_tcp_stream_priority(so);
903
904 if ((SOCK_DOM(so) == PF_INET ||
905 SOCK_DOM(so) == PF_INET6) &&
906 optval != oldval && (optval == SO_TC_BK_SYS ||
907 oldval == SO_TC_BK_SYS)) {
908 /*
909 * If the app switches from BK_SYS to something
910 * else, resume the socket if it was suspended.
911 */
912 if (oldval == SO_TC_BK_SYS)
913 inp_reset_fc_state(so->so_pcb);
914
915 SOTHROTTLELOG("throttle[%d]: so 0x%llx "
916 "[%d,%d] opportunistic %s\n", so->last_pid,
917 (uint64_t)VM_KERNEL_ADDRPERM(so),
918 SOCK_DOM(so), SOCK_TYPE(so),
919 (optval == SO_TC_BK_SYS) ? "ON" : "OFF");
920 }
921 }
922 }
923 return (error);
924}
925
926__private_extern__ int
927so_set_net_service_type(struct socket *so, int netsvctype)
928{
929 int sotc;
930 int error;
931
932 if (!IS_VALID_NET_SERVICE_TYPE(netsvctype))
933 return (EINVAL);
934
935 sotc = sotc_by_netservicetype[netsvctype];
936 error = so_set_traffic_class(so, sotc);
937 if (error != 0)
938 return (error);
939 so->so_netsvctype = netsvctype;
940 so->so_flags1 |= SOF1_TC_NET_SERV_TYPE;
941
942 return (0);
943}
944
945__private_extern__ void
946so_set_default_traffic_class(struct socket *so)
947{
948 so->so_traffic_class = SO_TC_BE;
949
950 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
951 if (net_qos_policy_restricted == 0)
952 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
953#if (DEVELOPMENT || DEBUG)
954 if (tfp_count > 0)
955 set_tclass_for_curr_proc(so);
956#endif /* (DEVELOPMENT || DEBUG) */
957 }
958}
959
960__private_extern__ int
961so_set_opportunistic(struct socket *so, int optval)
962{
963 return (so_set_traffic_class(so, (optval == 0) ?
964 SO_TC_BE : SO_TC_BK_SYS));
965}
966
967__private_extern__ int
968so_get_opportunistic(struct socket *so)
969{
970 return (so->so_traffic_class == SO_TC_BK_SYS);
971}
972
973__private_extern__ int
974so_tc_from_control(struct mbuf *control, int *out_netsvctype)
975{
976 struct cmsghdr *cm;
977 int sotc = SO_TC_UNSPEC;
978
979 *out_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
980
981 for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
982 cm = M_NXT_CMSGHDR(control, cm)) {
983 int val;
984
985 if (cm->cmsg_len < sizeof (struct cmsghdr))
986 break;
987 if (cm->cmsg_level != SOL_SOCKET ||
988 cm->cmsg_len != CMSG_LEN(sizeof(int)))
989 continue;
990 val = *(int *)(void *)CMSG_DATA(cm);
991 /*
992 * The first valid option wins
993 */
994 switch (cm->cmsg_type) {
995 case SO_TRAFFIC_CLASS:
996 if (SO_VALID_TC(val)) {
997 sotc = val;
998 return (sotc);
999 /* NOT REACHED */
1000 } else if (val < SO_TC_NET_SERVICE_OFFSET) {
1001 break;
1002 }
1003 /*
1004 * Handle the case SO_NET_SERVICE_TYPE values are
1005 * passed using SO_TRAFFIC_CLASS
1006 */
1007 val = val - SO_TC_NET_SERVICE_OFFSET;
1008 /* FALLTHROUGH */
1009 case SO_NET_SERVICE_TYPE:
1010 if (!IS_VALID_NET_SERVICE_TYPE(val))
1011 break;
1012 *out_netsvctype = val;
1013 sotc = sotc_by_netservicetype[val];
1014 return (sotc);
1015 /* NOT REACHED */
1016 default:
1017 break;
1018 }
1019 }
1020
1021 return (sotc);
1022}
1023
1024__private_extern__ void
1025so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
1026{
1027 uint32_t mtc = m_get_traffic_class(m);
1028
1029 if (mtc >= SO_TC_STATS_MAX)
1030 mtc = MBUF_TC_BE;
1031
1032 so->so_tc_stats[mtc].rxpackets += 1;
1033 so->so_tc_stats[mtc].rxbytes +=
1034 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
1035}
1036
1037__private_extern__ void
1038so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes,
1039 uint32_t mtc)
1040{
1041 if (mtc >= SO_TC_STATS_MAX)
1042 mtc = MBUF_TC_BE;
1043
1044 so->so_tc_stats[mtc].rxpackets += pkts;
1045 so->so_tc_stats[mtc].rxbytes += bytes;
1046}
1047
1048static inline int
1049so_throttle_best_effort(struct socket *so, struct ifnet *ifp)
1050{
1051 u_int32_t uptime = net_uptime();
1052 return (soissrcbesteffort(so) &&
1053 net_io_policy_throttle_best_effort == 1 &&
1054 ifp->if_rt_sendts > 0 &&
1055 (int)(uptime - ifp->if_rt_sendts) <= TCP_BG_SWITCH_TIME);
1056}
1057
/*
 * Re-evaluate whether a TCP socket should use the background or the
 * foreground congestion control algorithm on the send side, and whether
 * its receive side is flagged as background.
 *
 * The socket must be a PF_INET/PF_INET6 SOCK_STREAM TCP socket (asserted).
 * Nothing is changed for a socket whose inpcb is in the DEAD state. A
 * change of either setting is logged through SOTHROTTLELOG.
 *
 * NOTE(review): tp is dereferenced in the initializer of old_cc, before
 * the VERIFY and before the INPCB_STATE_DEAD check -- confirm that every
 * caller guarantees a valid tcpcb at this point.
 */
__private_extern__ void
set_tcp_stream_priority(struct socket *so)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);
	struct ifnet *outifp;
	u_char old_cc = tp->tcp_cc_index;	/* CC algorithm on entry */
	int recvbg = IS_TCP_RECV_BG(so);	/* receive bg state on entry */
	bool is_local = false, fg_active = false;
	u_int32_t uptime;

	VERIFY((SOCK_CHECK_DOM(so, PF_INET) ||
	    SOCK_CHECK_DOM(so, PF_INET6)) &&
	    SOCK_CHECK_TYPE(so, SOCK_STREAM) &&
	    SOCK_CHECK_PROTO(so, IPPROTO_TCP));

	/* Return if the socket is in a terminal state */
	if (inp->inp_state == INPCB_STATE_DEAD)
		return;

	outifp = inp->inp_last_outifp;
	uptime = net_uptime();

	/*
	 * If the socket was marked as a background socket or if the
	 * traffic class is set to background with traffic class socket
	 * option then make both send and recv side of the stream to be
	 * background. The variable sotcdb which can be set with sysctl
	 * is used to disable these settings for testing.
	 *
	 * A socket with no known output interface, or whose output
	 * interface is loopback, is considered local.
	 */
	if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK))
		is_local = true;

	/* Check if there has been recent foreground activity */
	if (outifp != NULL) {
		/*
		 * If the traffic source is background, check whether
		 * it can be switched to foreground. This can
		 * happen when there is no indication of foreground
		 * activity. Here, a foreground send on this interface
		 * within the last TCP_BG_SWITCH_TIME seconds counts as
		 * foreground activity.
		 */
		if (soissrcbackground(so) && outifp->if_fg_sendts > 0 &&
		    (int)(uptime - outifp->if_fg_sendts) <= TCP_BG_SWITCH_TIME)
			fg_active = true;

		/*
		 * The traffic source is best-effort -- check if
		 * the policy to throttle best effort is enabled
		 * and there was realtime activity on this
		 * interface recently. If this is true, enable
		 * algorithms that respond to increased latency
		 * on best-effort traffic.
		 */
		if (so_throttle_best_effort(so, outifp))
			fg_active = true;
	}

	/*
	 * System initiated background traffic like cloud uploads should
	 * always use background delay sensitive algorithms. This will
	 * make the stream more responsive to other streams on the user's
	 * network and it will minimize latency induced.
	 *
	 * NOTE(review): as written, a socket whose class is not
	 * background-system ends up with foreground congestion control and
	 * a cleared receive background flag in both arms of this test, so
	 * fg_active does not appear to change the outcome -- confirm that
	 * this is intended.
	 */
	if (fg_active || IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
		/*
		 * If the interface that the connection is using is
		 * loopback, do not use background congestion
		 * control algorithm.
		 *
		 * If there has been recent foreground activity or if
		 * there was an indication that a foreground application
		 * is going to use networking (net_io_policy_throttled),
		 * switch the background streams to use background
		 * congestion control algorithm. Otherwise, even background
		 * flows can move into foreground.
		 */
		if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || is_local ||
		    !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
			if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
				tcp_set_foreground_cc(so);
		} else {
			if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
				tcp_set_background_cc(so);
		}

		/* Set receive side background flags */
		if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || is_local ||
		    !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
			tcp_clear_recv_bg(so);
		} else {
			tcp_set_recv_bg(so);
		}
	} else {
		/* Foreground on both the receive and the send side */
		tcp_clear_recv_bg(so);
		if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
			tcp_set_foreground_cc(so);
	}

	/* Log only when the send or the receive side actually changed */
	if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
		SOTHROTTLELOG("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
		    "%s recv\n", so->last_pid,
		    (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
		    "background" : "foreground",
		    IS_TCP_RECV_BG(so) ? "background" : "foreground");
	}
}
1166
1167/*
1168 * Set traffic class to an IPv4 or IPv6 packet
1169 * - mark the mbuf
1170 * - set the DSCP code following the WMM mapping
1171 */
1172__private_extern__ void
1173set_packet_service_class(struct mbuf *m, struct socket *so,
1174 int sotc, u_int32_t flags)
1175{
1176 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */
1177 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
1178
1179 if (!(m->m_flags & M_PKTHDR))
1180 return;
1181
1182 /*
1183 * Here is the precedence:
1184 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
1185 * 2) Traffic class passed via ancillary data to sendmsdg(2)
1186 * 3) Traffic class socket option last
1187 */
1188 if (sotc != SO_TC_UNSPEC) {
1189 VERIFY(SO_VALID_TC(sotc));
1190 msc = so_tc2msc(sotc);
1191 /* Assert because tc must have been valid */
1192 VERIFY(MBUF_VALID_SC(msc));
1193 }
1194
1195 /*
1196 * If TRAFFIC_MGT_SO_BACKGROUND is set or policy to throttle
1197 * best effort is set, depress the priority.
1198 */
1199 if (!IS_MBUF_SC_BACKGROUND(msc) && soisthrottled(so))
1200 msc = MBUF_SC_BK;
1201
1202 if (IS_MBUF_SC_BESTEFFORT(msc) && inp->inp_last_outifp != NULL &&
1203 so_throttle_best_effort(so, inp->inp_last_outifp))
1204 msc = MBUF_SC_BK;
1205
1206 if (soissrcbackground(so))
1207 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
1208
1209 if (soissrcrealtime(so) || IS_MBUF_SC_REALTIME(msc))
1210 m->m_pkthdr.pkt_flags |= PKTF_SO_REALTIME;
1211 /*
1212 * Set the traffic class in the mbuf packet header svc field
1213 */
1214 if (sotcdb & SOTCDB_NO_MTC)
1215 goto no_mbtc;
1216
1217 /*
1218 * Elevate service class if the packet is a pure TCP ACK.
1219 * We can do this only when the flow is not a background
1220 * flow and the outgoing interface supports
1221 * transmit-start model.
1222 */
1223 if (!IS_MBUF_SC_BACKGROUND(msc) &&
1224 (flags & (PKT_SCF_TCP_ACK | PKT_SCF_TCP_SYN)) != 0)
1225 msc = MBUF_SC_CTL;
1226
1227 (void) m_set_service_class(m, msc);
1228
1229 /*
1230 * Set the privileged traffic auxiliary flag if applicable,
1231 * or clear it.
1232 */
1233 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
1234 msc != MBUF_SC_UNSPEC)
1235 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
1236 else
1237 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;
1238
1239no_mbtc:
1240 /*
1241 * For TCP with background traffic class switch CC algo based on sysctl
1242 */
1243 if (so->so_type == SOCK_STREAM)
1244 set_tcp_stream_priority(so);
1245
1246 so_tc_update_stats(m, so, msc);
1247}
1248
1249__private_extern__ void
1250so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
1251{
1252 mbuf_traffic_class_t mtc;
1253
1254 /*
1255 * Assume socket and mbuf traffic class values are the same
1256 * Also assume the socket lock is held. Note that the stats
1257 * at the socket layer are reduced down to the legacy traffic
1258 * classes; we could/should potentially expand so_tc_stats[].
1259 */
1260 mtc = MBUF_SC2TC(msc);
1261 VERIFY(mtc < SO_TC_STATS_MAX);
1262 so->so_tc_stats[mtc].txpackets += 1;
1263 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
1264}
1265
1266__private_extern__ void
1267socket_tclass_init(void)
1268{
1269 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
1270
1271 tclass_lck_grp_attr = lck_grp_attr_alloc_init();
1272 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
1273 tclass_lck_attr = lck_attr_alloc_init();
1274 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
1275}
1276
1277__private_extern__ mbuf_svc_class_t
1278so_tc2msc(int tc)
1279{
1280 mbuf_svc_class_t msc;
1281
1282 switch (tc) {
1283 case SO_TC_BK_SYS:
1284 msc = MBUF_SC_BK_SYS;
1285 break;
1286 case SO_TC_BK:
1287 case _SO_TC_BK:
1288 msc = MBUF_SC_BK;
1289 break;
1290 case SO_TC_BE:
1291 msc = MBUF_SC_BE;
1292 break;
1293 case SO_TC_RD:
1294 msc = MBUF_SC_RD;
1295 break;
1296 case SO_TC_OAM:
1297 msc = MBUF_SC_OAM;
1298 break;
1299 case SO_TC_AV:
1300 msc = MBUF_SC_AV;
1301 break;
1302 case SO_TC_RV:
1303 msc = MBUF_SC_RV;
1304 break;
1305 case SO_TC_VI:
1306 case _SO_TC_VI:
1307 msc = MBUF_SC_VI;
1308 break;
1309 case SO_TC_NETSVC_SIG:
1310 msc = MBUF_SC_SIG;
1311 break;
1312 case SO_TC_VO:
1313 case _SO_TC_VO:
1314 msc = MBUF_SC_VO;
1315 break;
1316 case SO_TC_CTL:
1317 msc = MBUF_SC_CTL;
1318 break;
1319 case SO_TC_ALL:
1320 default:
1321 msc = MBUF_SC_UNSPEC;
1322 break;
1323 }
1324
1325 return (msc);
1326}
1327
1328__private_extern__ int
1329so_svc2tc(mbuf_svc_class_t svc)
1330{
1331 switch (svc) {
1332 case MBUF_SC_BK_SYS:
1333 return (SO_TC_BK_SYS);
1334 case MBUF_SC_BK:
1335 return (SO_TC_BK);
1336 case MBUF_SC_BE:
1337 return (SO_TC_BE);
1338 case MBUF_SC_RD:
1339 return (SO_TC_RD);
1340 case MBUF_SC_OAM:
1341 return (SO_TC_OAM);
1342 case MBUF_SC_AV:
1343 return (SO_TC_AV);
1344 case MBUF_SC_RV:
1345 return (SO_TC_RV);
1346 case MBUF_SC_VI:
1347 return (SO_TC_VI);
1348 case MBUF_SC_SIG:
1349 return (SO_TC_NETSVC_SIG);
1350 case MBUF_SC_VO:
1351 return (SO_TC_VO);
1352 case MBUF_SC_CTL:
1353 return (SO_TC_CTL);
1354 case MBUF_SC_UNSPEC:
1355 default:
1356 return (SO_TC_BE);
1357 }
1358}
1359
1360/*
1361 * LRO is turned on for AV streaming class.
1362 */
1363void
1364so_set_lro(struct socket *so, int optval)
1365{
1366 if (optval == SO_TC_AV) {
1367 so->so_flags |= SOF_USELRO;
1368 } else {
1369 if (so->so_flags & SOF_USELRO) {
1370 /* transition to non LRO class */
1371 so->so_flags &= ~SOF_USELRO;
1372 struct inpcb *inp = sotoinpcb(so);
1373 struct tcpcb *tp = NULL;
1374 if (inp) {
1375 tp = intotcpcb(inp);
1376 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) {
1377 tcp_lro_remove_state(inp->inp_laddr,
1378 inp->inp_faddr,
1379 inp->inp_lport,
1380 inp->inp_fport);
1381 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1382 }
1383 }
1384 }
1385 }
1386}
1387
1388static size_t
1389sotc_index(int sotc)
1390{
1391 switch (sotc) {
1392 case SO_TC_BK_SYS:
1393 return (SOTCIX_BK_SYS);
1394 case _SO_TC_BK:
1395 case SO_TC_BK:
1396 return (SOTCIX_BK);
1397
1398 case SO_TC_BE:
1399 return (SOTCIX_BE);
1400 case SO_TC_RD:
1401 return (SOTCIX_RD);
1402 case SO_TC_OAM:
1403 return (SOTCIX_OAM);
1404
1405 case SO_TC_AV:
1406 return (SOTCIX_AV);
1407 case SO_TC_RV:
1408 return (SOTCIX_RV);
1409 case _SO_TC_VI:
1410 case SO_TC_VI:
1411 return (SOTCIX_VI);
1412
1413 case _SO_TC_VO:
1414 case SO_TC_VO:
1415 return (SOTCIX_VO);
1416 case SO_TC_CTL:
1417 return (SOTCIX_CTL);
1418
1419 default:
1420 break;
1421 }
1422 /*
1423 * Unknown traffic class value
1424 */
1425 return (SIZE_T_MAX);
1426}
1427
1428/*
1429 * Pass NULL ifp for default map
1430 */
1431static errno_t
1432set_netsvctype_dscp_map(size_t in_count,
1433 const struct netsvctype_dscp_map *netsvctype_dscp_map)
1434{
1435 size_t i;
1436 struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1437 int netsvctype;
1438
1439 /*
1440 * Do not accept more that max number of distinct DSCPs
1441 */
1442 if (in_count > _MAX_DSCP || netsvctype_dscp_map == NULL)
1443 return (EINVAL);
1444
1445 /*
1446 * Validate input parameters
1447 */
1448 for (i = 0; i < in_count; i++) {
1449 if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype))
1450 return (EINVAL);
1451 if (netsvctype_dscp_map[i].dscp > _MAX_DSCP)
1452 return (EINVAL);
1453 }
1454
1455 net_qos_dscp_map = &default_net_qos_dscp_map;
1456
1457 for (i = 0; i < in_count; i++) {
1458 netsvctype = netsvctype_dscp_map[i].netsvctype;
1459
1460 net_qos_dscp_map->netsvctype_to_dscp[netsvctype] =
1461 netsvctype_dscp_map[i].dscp;
1462 }
1463 for (netsvctype = 0; netsvctype < _NET_SERVICE_TYPE_COUNT; netsvctype++) {
1464 switch (netsvctype) {
1465 case NET_SERVICE_TYPE_BE:
1466 case NET_SERVICE_TYPE_BK:
1467 case NET_SERVICE_TYPE_VI:
1468 case NET_SERVICE_TYPE_VO:
1469 case NET_SERVICE_TYPE_RV:
1470 case NET_SERVICE_TYPE_AV:
1471 case NET_SERVICE_TYPE_OAM:
1472 case NET_SERVICE_TYPE_RD: {
1473 size_t sotcix;
1474
1475 sotcix = sotc_index(sotc_by_netservicetype[netsvctype]);
1476 if (sotcix != SIZE_T_MAX) {
1477 net_qos_dscp_map->sotc_to_dscp[sotcix] =
1478 netsvctype_dscp_map[netsvctype].dscp;
1479 }
1480 break;
1481 }
1482 case NET_SERVICE_TYPE_SIG:
1483 /* Signaling does not have its own traffic class */
1484 break;
1485 default:
1486 /* We should not be here */
1487 ASSERT(0);
1488 }
1489 }
1490 /* Network control socket traffic class is always best effort */
1491 net_qos_dscp_map->sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1492
1493 /* Backround socket traffic class DSCP same as backround system */
1494 net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK] =
1495 net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK_SYS];
1496
1497 return (0);
1498}
1499
1500/*
1501 * out_count is an input/ouput parameter
1502 */
1503static errno_t
1504get_netsvctype_dscp_map(size_t *out_count,
1505 struct netsvctype_dscp_map *netsvctype_dscp_map)
1506{
1507 size_t i;
1508 struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1509
1510 /*
1511 * Do not accept more that max number of distinct DSCPs
1512 */
1513 if (out_count == NULL || netsvctype_dscp_map == NULL)
1514 return (EINVAL);
1515 if (*out_count > _MAX_DSCP)
1516 return (EINVAL);
1517
1518 net_qos_dscp_map = &default_net_qos_dscp_map;
1519
1520 for (i = 0; i < MIN(_NET_SERVICE_TYPE_COUNT, *out_count); i++) {
1521 netsvctype_dscp_map[i].netsvctype = i;
1522 netsvctype_dscp_map[i].dscp = net_qos_dscp_map->netsvctype_to_dscp[i];
1523
1524 }
1525 *out_count = i;
1526
1527 return (0);
1528}
1529
1530void
1531net_qos_map_init()
1532{
1533 errno_t error;
1534
1535 /*
1536 * By default use the Fastlane DSCP mappngs
1537 */
1538 error = set_netsvctype_dscp_map(_NET_SERVICE_TYPE_COUNT,
1539 fastlane_netsvctype_dscp_map);
1540 ASSERT(error == 0);
1541
1542 /*
1543 * No DSCP mapping for network control
1544 */
1545 default_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1546
1547 set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1548}
1549
1550int
1551sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS
1552{
1553#pragma unused(oidp, arg1, arg2)
1554 int error = 0;
1555 const size_t max_netsvctype_to_dscp_map_len =
1556 _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1557 size_t len;
1558 struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {};
1559 size_t count;
1560
1561 if (req->oldptr == USER_ADDR_NULL) {
1562 req->oldidx =
1563 _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1564 } else if (req->oldlen > 0) {
1565 count = _NET_SERVICE_TYPE_COUNT;
1566 error = get_netsvctype_dscp_map(&count, netsvctype_dscp_map);
1567 if (error != 0)
1568 goto done;
1569 len = count * sizeof(struct netsvctype_dscp_map);
1570 error = SYSCTL_OUT(req, netsvctype_dscp_map,
1571 MIN(len, req->oldlen));
1572 if (error != 0)
1573 goto done;
1574 }
1575
1576 if (req->newptr == USER_ADDR_NULL)
1577 goto done;
1578
1579 error = proc_suser(current_proc());
1580 if (error != 0)
1581 goto done;
1582
1583 /*
1584 * Check input length
1585 */
1586 if (req->newlen > max_netsvctype_to_dscp_map_len) {
1587 error = EINVAL;
1588 goto done;
1589 }
1590 /*
1591 * Cap the number of entries to copy from input buffer
1592 */
1593 error = SYSCTL_IN(req, netsvctype_dscp_map, req->newlen);
1594 if (error != 0)
1595 goto done;
1596
1597 count = req->newlen / sizeof(struct netsvctype_dscp_map);
1598 error = set_netsvctype_dscp_map(count, netsvctype_dscp_map);
1599done:
1600 return (error);
1601}
1602
1603__private_extern__ errno_t
1604set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
1605 int sotc, int netsvctype, u_int8_t *dscp_inout)
1606{
1607 if (ifp == NULL || dscp_inout == NULL)
1608 return (EINVAL);
1609
1610 if ((ifp->if_eflags &
1611 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
1612 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
1613 u_int8_t dscp;
1614
1615 /*
1616 * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops
1617 */
1618 dscp = _DSCP_DF;
1619
1620 /*
1621 * For DSCP use the network service type is specified, otherwise
1622 * use the socket traffic class
1623 *
1624 * When not whitelisted by the policy, set DSCP only for best
1625 * effort and background, and set the mbuf service class to
1626 * best effort as well so the packet will be queued and
1627 * scheduled at a lower priority.
1628 * We still want to prioritize control traffic on the interface
1629 * so we do not change the mbuf service class for SO_TC_CTL
1630 */
1631 if (IS_VALID_NET_SERVICE_TYPE(netsvctype) &&
1632 netsvctype != NET_SERVICE_TYPE_BE) {
1633 dscp = default_net_qos_dscp_map.netsvctype_to_dscp[netsvctype];
1634
1635 if (qos_allowed == FALSE &&
1636 netsvctype != NET_SERVICE_TYPE_BE &&
1637 netsvctype != NET_SERVICE_TYPE_BK) {
1638 dscp = _DSCP_DF;
1639 if (sotc != SO_TC_CTL)
1640 m_set_service_class(m, MBUF_SC_BE);
1641 }
1642 } else if (sotc != SO_TC_UNSPEC) {
1643 size_t sotcix = sotc_index(sotc);
1644 if (sotcix != SIZE_T_MAX) {
1645 dscp = default_net_qos_dscp_map.sotc_to_dscp[sotcix];
1646
1647 if (qos_allowed == FALSE && sotc != SO_TC_BE &&
1648 sotc != SO_TC_BK && sotc != SO_TC_BK_SYS &&
1649 sotc != SO_TC_CTL) {
1650 dscp = _DSCP_DF;
1651 if (sotc != SO_TC_CTL)
1652 m_set_service_class(m, MBUF_SC_BE);
1653 }
1654 }
1655 }
1656 if (net_qos_verbose != 0)
1657 printf("%s qos_allowed %d sotc %u netsvctype %u dscp %u\n",
1658 __func__, qos_allowed, sotc, netsvctype, dscp);
1659
1660 if (*dscp_inout != dscp) {
1661 *dscp_inout = dscp;
1662 }
1663 } else if (*dscp_inout != _DSCP_DF && IFNET_IS_WIFI_INFRA(ifp)) {
1664 mbuf_svc_class_t msc = m_get_service_class(m);
1665
1666 /*
1667 * For WiFi infra, when the mbuf service class is best effort
1668 * and the DSCP is not default, set the service class based
1669 * on DSCP
1670 */
1671 if (msc == MBUF_SC_BE) {
1672 msc = wifi_dscp_to_msc_array[*dscp_inout];
1673
1674 if (msc != MBUF_SC_BE) {
1675 m_set_service_class(m, msc);
1676
1677 if (net_qos_verbose != 0)
1678 printf("%s set msc %u for dscp %u\n",
1679 __func__, msc, *dscp_inout);
1680 }
1681 }
1682 }
1683
1684 return (0);
1685}
1686
1687static void
1688set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *map, int clear)
1689{
1690 int i;
1691
1692 if (clear)
1693 bzero(wifi_dscp_to_msc_array, sizeof(wifi_dscp_to_msc_array));
1694
1695 for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1696 const struct dcsp_msc_map *elem = map + i;
1697
1698 if (elem->dscp > _MAX_DSCP || elem->msc == MBUF_SC_UNSPEC)
1699 break;
1700 switch (elem->msc) {
1701 case MBUF_SC_BK_SYS:
1702 case MBUF_SC_BK:
1703 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BK;
1704 break;
1705 default:
1706 case MBUF_SC_BE:
1707 case MBUF_SC_RD:
1708 case MBUF_SC_OAM:
1709 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BE;
1710 break;
1711 case MBUF_SC_AV:
1712 case MBUF_SC_RV:
1713 case MBUF_SC_VI:
1714 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VI;
1715 break;
1716 case MBUF_SC_VO:
1717 case MBUF_SC_CTL:
1718 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VO;
1719 break;
1720 }
1721 }
1722}
1723
1724static errno_t
1725dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *netsvctype_dscp_map,
1726 size_t count, struct dcsp_msc_map *dcsp_msc_map)
1727{
1728 errno_t error = 0;
1729 u_int32_t i;
1730
1731 /*
1732 * Validate input parameters
1733 */
1734 for (i = 0; i < count; i++) {
1735 if (!SO_VALID_TC(netsvctype_dscp_map[i].netsvctype)) {
1736 error = EINVAL;
1737 goto done;
1738 }
1739 if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) {
1740 error = EINVAL;
1741 goto done;
1742 }
1743 }
1744
1745 bzero(dcsp_msc_map, DSCP_ARRAY_SIZE * sizeof(struct dcsp_msc_map));
1746
1747 for (i = 0; i < count; i++) {
1748 dcsp_msc_map[i].dscp = netsvctype_dscp_map[i].dscp;
1749 dcsp_msc_map[i].msc = so_tc2msc(netsvctype_dscp_map[i].netsvctype);
1750 }
1751done:
1752 return (error);
1753}
1754
1755int
1756sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1757{
1758#pragma unused(oidp, arg1, arg2)
1759 int error = 0;
1760 size_t len = DSCP_ARRAY_SIZE * sizeof(struct netsvctype_dscp_map);
1761 struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE] = {};
1762 struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE];
1763 size_t count;
1764 u_int32_t i;
1765
1766 if (req->oldptr == USER_ADDR_NULL) {
1767 req->oldidx = len;
1768 } else if (req->oldlen > 0) {
1769 for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1770 netsvctype_dscp_map[i].dscp = i;
1771 netsvctype_dscp_map[i].netsvctype =
1772 so_svc2tc(wifi_dscp_to_msc_array[i]);
1773 }
1774 error = SYSCTL_OUT(req, netsvctype_dscp_map,
1775 MIN(len, req->oldlen));
1776 if (error != 0)
1777 goto done;
1778 }
1779
1780 if (req->newptr == USER_ADDR_NULL)
1781 goto done;
1782
1783 error = proc_suser(current_proc());
1784 if (error != 0)
1785 goto done;
1786
1787 /*
1788 * Check input length
1789 */
1790 if (req->newlen > len) {
1791 error = EINVAL;
1792 goto done;
1793 }
1794 /*
1795 * Cap the number of entries to copy from input buffer
1796 */
1797 if (len > req->newlen)
1798 len = req->newlen;
1799 error = SYSCTL_IN(req, netsvctype_dscp_map, len);
1800 if (error != 0) {
1801 goto done;
1802 }
1803 count = len / sizeof(struct netsvctype_dscp_map);
1804 bzero(dcsp_msc_map, sizeof(dcsp_msc_map));
1805 error = dscp_msc_map_from_netsvctype_dscp_map(netsvctype_dscp_map, count,
1806 dcsp_msc_map);
1807 if (error != 0) {
1808 goto done;
1809 }
1810 set_dscp_to_wifi_ac_map(dcsp_msc_map, 0);
1811done:
1812 return (error);
1813}
1814
1815int
1816sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1817{
1818#pragma unused(oidp, arg1, arg2)
1819 int error = 0;
1820 int val = 0;
1821
1822 error = sysctl_handle_int(oidp, &val, 0, req);
1823 if (error || !req->newptr)
1824 return (error);
1825
1826 set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1827
1828 return (0);
1829}
1830
1831/*
1832 * Returns whether a large upload or download transfer should be marked as
1833 * BK service type for network activity. This is a system level
1834 * hint/suggestion to classify application traffic based on statistics
1835 * collected from the current network attachment
1836 *
1837 * Returns 1 for BK and 0 for default
1838 */
1839
1840int
1841net_qos_guideline(struct proc *p, struct net_qos_guideline_args *arg,
1842 int *retval)
1843{
1844#pragma unused(p)
1845#define RETURN_USE_BK 1
1846#define RETURN_USE_DEFAULT 0
1847 struct net_qos_param qos_arg;
1848 struct ifnet *ipv4_primary, *ipv6_primary;
1849 int err = 0;
1850
1851 if (arg->param == USER_ADDR_NULL || retval == NULL ||
1852 arg->param_len != sizeof (qos_arg)) {
1853 return (EINVAL);
1854 }
1855 err = copyin(arg->param, (caddr_t) &qos_arg, sizeof (qos_arg));
1856 if (err != 0)
1857 return (err);
1858
1859 *retval = RETURN_USE_DEFAULT;
1860 ipv4_primary = ifindex2ifnet[get_primary_ifscope(AF_INET)];
1861 ipv6_primary = ifindex2ifnet[get_primary_ifscope(AF_INET6)];
1862
1863 /*
1864 * If either of the interfaces is in Low Internet mode, enable
1865 * background delay based algorithms on this transfer
1866 */
1867 if (qos_arg.nq_uplink) {
1868 if ((ipv4_primary != NULL &&
1869 (ipv4_primary->if_xflags & IFXF_LOW_INTERNET_UL)) ||
1870 (ipv6_primary != NULL &&
1871 (ipv6_primary->if_xflags & IFXF_LOW_INTERNET_UL))) {
1872 *retval = RETURN_USE_BK;
1873 return (0);
1874 }
1875 } else {
1876 if ((ipv4_primary != NULL &&
1877 (ipv4_primary->if_xflags & IFXF_LOW_INTERNET_DL)) ||
1878 (ipv6_primary != NULL &&
1879 (ipv6_primary->if_xflags & IFXF_LOW_INTERNET_DL))) {
1880 *retval = RETURN_USE_BK;
1881 return (0);
1882 }
1883 }
1884
1885 /*
1886 * Some times IPv4 and IPv6 primary interfaces can be different.
1887 * In this case, if either of them is non-cellular, we should mark
1888 * the transfer as BK as it can potentially get used based on
1889 * the host name resolution
1890 */
1891 if (ipv4_primary != NULL && IFNET_IS_EXPENSIVE(ipv4_primary) &&
1892 ipv6_primary != NULL && IFNET_IS_EXPENSIVE(ipv6_primary)) {
1893 if (qos_arg.nq_use_expensive) {
1894 return (0);
1895 } else {
1896 *retval = RETURN_USE_BK;
1897 return (0);
1898 }
1899 }
1900 if (qos_arg.nq_transfer_size >= 5 * 1024 * 1024) {
1901 *retval = RETURN_USE_BK;
1902 return (0);
1903 }
1904
1905
1906#undef RETURN_USE_BK
1907#undef RETURN_USE_DEFAULT
1908 return (0);
1909}
1910