tcp_output.c source code [xnu/bsd/netinet/tcp_output.c]

1	/*
2	* Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30	* The Regents of the University of California. All rights reserved.
31	*
32	* Redistribution and use in source and binary forms, with or without
33	* modification, are permitted provided that the following conditions
34	* are met:
35	* 1. Redistributions of source code must retain the above copyright
36	* notice, this list of conditions and the following disclaimer.
37	* 2. Redistributions in binary form must reproduce the above copyright
38	* notice, this list of conditions and the following disclaimer in the
39	* documentation and/or other materials provided with the distribution.
40	* 3. All advertising materials mentioning features or use of this software
41	* must display the following acknowledgement:
42	* This product includes software developed by the University of
43	* California, Berkeley and its contributors.
44	* 4. Neither the name of the University nor the names of its contributors
45	* may be used to endorse or promote products derived from this software
46	* without specific prior written permission.
47	*
48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58	* SUCH DAMAGE.
59	*
60	* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
61	* $FreeBSD: src/sys/netinet/tcp_output.c,v 1.39.2.10 2001/07/07 04:30:38 silby Exp $
62	*/
63	/*
64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65	* support for mandatory and extensible security protections. This notice
66	* is included in support of clause 2.2 (b) of the Apple Public License,
67	* Version 2.0.
68	*/
69
70	#define _IP_VHL
71
72	#include "tcp_includes.h"
73
74	#include <sys/param.h>
75	#include <sys/systm.h>
76	#include <sys/kernel.h>
77	#include <sys/sysctl.h>
78	#include <sys/mbuf.h>
79	#include <sys/domain.h>
80	#include <sys/protosw.h>
81	#include <sys/socket.h>
82	#include <sys/socketvar.h>
83
84	#include <net/route.h>
85	#include <net/ntstat.h>
86	#include <net/if_var.h>
87	#include <net/if.h>
88	#include <net/if_types.h>
89	#include <net/dlil.h>
90
91	#include <netinet/in.h>
92	#include <netinet/in_systm.h>
93	#include <netinet/in_var.h>
94	#include <netinet/in_tclass.h>
95	#include <netinet/ip.h>
96	#include <netinet/in_pcb.h>
97	#include <netinet/ip_var.h>
98	#include <mach/sdt.h>
99	#include <netinet6/in6_pcb.h>
100	#include <netinet/ip6.h>
101	#include <netinet6/ip6_var.h>
102	#include <netinet/tcp.h>
103	#include <netinet/tcp_cache.h>
104	#include <netinet/tcp_fsm.h>
105	#include <netinet/tcp_seq.h>
106	#include <netinet/tcp_timer.h>
107	#include <netinet/tcp_var.h>
108	#include <netinet/tcpip.h>
109	#include <netinet/tcp_cc.h>
110	#if TCPDEBUG
111	#include <netinet/tcp_debug.h>
112	#endif
113	#include <netinet/tcp_log.h>
114	#include <sys/kdebug.h>
115	#include <mach/sdt.h>
116
117	#if IPSEC
118	#include <netinet6/ipsec.h>
#endif /* IPSEC */
120
121	#if MPTCP
122	#include <netinet/mptcp_var.h>
123	#include <netinet/mptcp.h>
124	#include <netinet/mptcp_opt.h>
125	#include <netinet/mptcp_seq.h>
126	#endif
127
128	#include <corecrypto/ccaes.h>
129
/*
 * Debug trace codes built with NETDBG_CODE() for the DBG_NETTCP class.
 * DBG_LAYER_BEG and DBG_FNC_TCP_OUTPUT are handed to KERNEL_DEBUG() at the
 * top of each tcp_output() pass.
 */
#define DBG_LAYER_BEG           NETDBG_CODE(DBG_NETTCP, 1)
#define DBG_LAYER_END           NETDBG_CODE(DBG_NETTCP, 3)
#define DBG_FNC_TCP_OUTPUT      NETDBG_CODE(DBG_NETTCP, (4 << 8) | 1)
133
134	SYSCTL_SKMEM_TCP_INT(OID_AUTO, path_mtu_discovery,
135	CTLFLAG_RW \| CTLFLAG_LOCKED, int, path_mtu_discovery, `1`,
136	"Enable Path MTU Discovery");
137
138	SYSCTL_SKMEM_TCP_INT(OID_AUTO, local_slowstart_flightsize,
139	CTLFLAG_RW \| CTLFLAG_LOCKED, int, ss_fltsz_local, `8`,
140	"Slow start flight size for local networks");
141
142	SYSCTL_SKMEM_TCP_INT(OID_AUTO, tso, CTLFLAG_RW \| CTLFLAG_LOCKED,
143	int, tcp_do_tso, `1`, "Enable TCP Segmentation Offload");
144
145	SYSCTL_SKMEM_TCP_INT(OID_AUTO, ecn_setup_percentage,
146	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_ecn_setup_percentage, `100`,
147	"Max ECN setup percentage");
148
149	SYSCTL_SKMEM_TCP_INT(OID_AUTO, accurate_ecn,
150	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_acc_ecn, `0`,
151	"Accurate ECN mode (0: disable, 1: enable ACE feedback");
152
153	// TO BE REMOVED
154	SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_ack_compression,
155	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_do_ack_compression, `1`,
156	"Enable TCP ACK compression (on (cell only): 1, off: 0, on (all interfaces): 2)");
157
158	SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_compression_rate,
159	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_ack_compression_rate, TCP_COMP_CHANGE_RATE,
160	"Rate at which we force sending new ACKs (in ms)");
161
162	SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_timestamps,
163	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_randomize_timestamps, `1`,
164	"Randomize TCP timestamps to prevent tracking (on: 1, off: 0)");
165
166	static int
167	sysctl_change_ecn_setting SYSCTL_HANDLER_ARGS
168	{
169	#pragma unused(oidp, arg1, arg2)
170	int i, err = `0`, changed = `0`;
171	struct ifnet *ifp;
172
173	err = sysctl_io_number(req, bigValue: tcp_ecn_outbound, valueSize: sizeof(int32_t),
174	pValue: &i, changed: &changed);
175	if (err != `0` \|\| req->newptr == USER_ADDR_NULL) {
176	return err;
177	}
178
179	if (changed) {
180	if ((tcp_ecn_outbound == `0` \|\| tcp_ecn_outbound == `1`) &&
181	(i == `0` \|\| i == `1`)) {
182	tcp_ecn_outbound = i;
183	SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound);
184	return err;
185	}
186	if (tcp_ecn_outbound == `2` && (i == `0` \|\| i == `1`)) {
187	/*
188	* Reset ECN enable flags on non-cellular
189	* interfaces so that the system default will take
190	* over
191	*/
192	ifnet_head_lock_shared();
193	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
194	if (!IFNET_IS_CELLULAR(ifp)) {
195	if_clear_eflags(ifp,
196	IFEF_ECN_ENABLE \|
197	IFEF_ECN_DISABLE);
198	}
199	}
200	ifnet_head_done();
201	} else {
202	/*
203	* Set ECN enable flags on non-cellular
204	* interfaces
205	*/
206	ifnet_head_lock_shared();
207	TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
208	if (!IFNET_IS_CELLULAR(ifp)) {
209	if_set_eflags(ifp, IFEF_ECN_ENABLE);
210	if_clear_eflags(ifp, IFEF_ECN_DISABLE);
211	}
212	}
213	ifnet_head_done();
214	}
215	tcp_ecn_outbound = i;
216	SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_initiate_out, tcp_ecn_outbound);
217	}
218	/ Change the other one too as the work is done /
219	if (i == `2` \|\| tcp_ecn_inbound == `2`) {
220	tcp_ecn_inbound = i;
221	SYSCTL_SKMEM_UPDATE_FIELD(tcp.ecn_negotiate_in, tcp_ecn_inbound);
222	}
223	return err;
224	}
225
226	int tcp_ecn_outbound = `2`;
227	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_initiate_out,
228	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_ecn_outbound, `0`,
229	sysctl_change_ecn_setting, "IU",
230	"Initiate ECN for outbound connections");
231
232	int tcp_ecn_inbound = `2`;
233	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, ecn_negotiate_in,
234	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_ecn_inbound, `0`,
235	sysctl_change_ecn_setting, "IU",
236	"Initiate ECN for inbound connections");
237
238	SYSCTL_SKMEM_TCP_INT(OID_AUTO, packetchain,
239	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_packet_chaining, `50`,
240	"Enable TCP output packet chaining");
241
242	SYSCTL_SKMEM_TCP_INT(OID_AUTO, socket_unlocked_on_output,
243	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_output_unlocked, `1`,
244	"Unlock TCP when sending packets down to IP");
245
246	SYSCTL_SKMEM_TCP_INT(OID_AUTO, min_iaj_win,
247	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_min_iaj_win, MIN_IAJ_WIN,
248	"Minimum recv win based on inter-packet arrival jitter");
249
250	SYSCTL_SKMEM_TCP_INT(OID_AUTO, acc_iaj_react_limit,
251	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_acc_iaj_react_limit,
252	ACC_IAJ_REACT_LIMIT, "Accumulated IAJ when receiver starts to react");
253
254	SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufinc,
255	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_autosndbuf_inc,
256	`8` * `1024`, "Increment in send socket bufffer size");
257
258	SYSCTL_SKMEM_TCP_INT(OID_AUTO, autosndbufmax,
259	CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_KERN, uint32_t, tcp_autosndbuf_max, `2` * `1024` * `1024`,
260	"Maximum send socket buffer size");
261
262	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_recvbg,
263	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_use_rtt_recvbg, `1`,
264	"Use RTT for bg recv algorithm");
265
266	SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_throttle_minwin,
267	CTLFLAG_RW \| CTLFLAG_LOCKED, uint32_t, tcp_recv_throttle_minwin, `16` * `1024`,
268	"Minimum recv win for throttling");
269
270	SYSCTL_SKMEM_TCP_INT(OID_AUTO, enable_tlp,
271	CTLFLAG_RW \| CTLFLAG_LOCKED,
272	int32_t, tcp_enable_tlp, `1`, "Enable Tail loss probe");
273
274	static int32_t packchain_newlist = `0`;
275	static int32_t packchain_looped = `0`;
276	static int32_t packchain_sent = `0`;
277
278	/ temporary: for testing /
279	#if IPSEC
280	extern int ipsec_bypass;
281	#endif
282
283	extern int slowlink_wsize; / window correction for slow links /
284
285	extern u_int32_t kipf_count;
286
287	static int tcp_ip_output(struct socket , struct* tcpcb , struct* mbuf *,
288	int, struct mbuf , int, int*, boolean_t);
289	static int tcp_recv_throttle(struct tcpcb *tp);
290
291	__attribute__((noinline))
292	static int32_t
293	tcp_tfo_check(struct tcpcb *tp, int32_t len)
294	{
295	struct socket *so = tp->t_inpcb->inp_socket;
296	unsigned int optlen = `0`;
297	unsigned int cookie_len;
298
299	if (tp->t_flags & TF_NOOPT) {
300	goto fallback;
301	}
302
303	if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
304	!tcp_heuristic_do_tfo(tp)) {
305	tp->t_tfo_stats \|= TFO_S_HEURISTICS_DISABLE;
306	tcpstat.tcps_tfo_heuristics_disable++;
307	goto fallback;
308	}
309
310	if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
311	return len;
312	}
313
314	optlen += TCPOLEN_MAXSEG;
315
316	if (tp->t_flags & TF_REQ_SCALE) {
317	optlen += `4`;
318	}
319
320	#if MPTCP
321	if ((so->so_flags & SOF_MP_SUBFLOW) && mptcp_enable &&
322	(tp->t_rxtshift <= mptcp_mpcap_retries \|\|
323	(tptomptp(tp)->mpt_mpte->mpte_flags & MPTE_FORCE_ENABLE))) {
324	optlen += sizeof(struct mptcp_mpcapable_opt_common) + sizeof(mptcp_key_t);
325	}
326	#endif /* MPTCP */
327
328	if (tp->t_flags & TF_REQ_TSTMP) {
329	optlen += TCPOLEN_TSTAMP_APPA;
330	}
331
332	if (SACK_ENABLED(tp)) {
333	optlen += TCPOLEN_SACK_PERMITTED;
334	}
335
336	/ Now, decide whether to use TFO or not /
337
338	/ Don't even bother trying if there is no space at all... /
339	if (MAX_TCPOPTLEN - optlen < TCPOLEN_FASTOPEN_REQ) {
340	goto fallback;
341	}
342
343	cookie_len = tcp_cache_get_cookie_len(tp);
344	if (cookie_len == `0`) {
345	/ No cookie, so we request one /
346	return `0`;
347	}
348
349	/ There is not enough space for the cookie, so we cannot do TFO /
350	if (MAX_TCPOPTLEN - optlen < cookie_len) {
351	goto fallback;
352	}
353
354	/ Do not send SYN+data if there is more in the queue than MSS /
355	if (so->so_snd.sb_cc > (tp->t_maxopd - MAX_TCPOPTLEN)) {
356	goto fallback;
357	}
358
359	/ Ok, everything looks good. We can go on and do TFO /
360	return len;
361
362	fallback:
363	tcp_disable_tfo(tp);
364	return `0`;
365	}
366
367	/ Returns the number of bytes written to the TCP option-space /
368	__attribute__((noinline))
369	static unsigned int
370	tcp_tfo_write_cookie_rep(struct tcpcb tp, unsigned* int optlen, u_char *opt)
371	{
372	u_char out[CCAES_BLOCK_SIZE];
373	unsigned ret = `0`;
374	u_char *bp;
375
376	if (MAX_TCPOPTLEN - optlen <
377	TCPOLEN_FASTOPEN_REQ + TFO_COOKIE_LEN_DEFAULT) {
378	return ret;
379	}
380
381	tcp_tfo_gen_cookie(inp: tp->t_inpcb, out, blk_size: sizeof(out));
382
383	bp = opt + optlen;
384
385	*bp++ = TCPOPT_FASTOPEN;
386	*bp++ = `2` + TFO_COOKIE_LEN_DEFAULT;
387	memcpy(dst: bp, src: out, TFO_COOKIE_LEN_DEFAULT);
388	ret += `2` + TFO_COOKIE_LEN_DEFAULT;
389
390	tp->t_tfo_stats \|= TFO_S_COOKIE_SENT;
391	tcpstat.tcps_tfo_cookie_sent++;
392
393	return ret;
394	}
395
396	__attribute__((noinline))
397	static unsigned int
398	tcp_tfo_write_cookie(struct tcpcb tp, unsigned* int optlen, int32_t len,
399	u_char *opt)
400	{
401	uint8_t tfo_len;
402	struct socket *so = tp->t_inpcb->inp_socket;
403	unsigned ret = `0`;
404	int res;
405	u_char *bp;
406
407	if (TCPOLEN_FASTOPEN_REQ > MAX_TCPOPTLEN - optlen) {
408	return `0`;
409	}
410	tfo_len = (uint8_t)(MAX_TCPOPTLEN - optlen - TCPOLEN_FASTOPEN_REQ);
411
412	if (so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
413	/ If there is some data, let's track it /
414	if (len > `0`) {
415	tp->t_tfo_stats \|= TFO_S_SYN_DATA_SENT;
416	tcpstat.tcps_tfo_syn_data_sent++;
417	}
418
419	return `0`;
420	}
421
422	bp = opt + optlen;
423
424	/*
425	* The cookie will be copied in the appropriate place within the
426	* TCP-option space. That way we avoid the need for an intermediate
427	* variable.
428	*/
429	res = tcp_cache_get_cookie(tp, cookie: bp + TCPOLEN_FASTOPEN_REQ, len: &tfo_len);
430	if (res == `0`) {
431	*bp++ = TCPOPT_FASTOPEN;
432	*bp++ = TCPOLEN_FASTOPEN_REQ;
433	ret += TCPOLEN_FASTOPEN_REQ;
434
435	tp->t_tfo_flags \|= TFO_F_COOKIE_REQ;
436
437	tp->t_tfo_stats \|= TFO_S_COOKIE_REQ;
438	tcpstat.tcps_tfo_cookie_req++;
439	} else {
440	*bp++ = TCPOPT_FASTOPEN;
441	*bp++ = TCPOLEN_FASTOPEN_REQ + tfo_len;
442
443	ret += TCPOLEN_FASTOPEN_REQ + tfo_len;
444
445	tp->t_tfo_flags \|= TFO_F_COOKIE_SENT;
446
447	/ If there is some data, let's track it /
448	if (len > `0`) {
449	tp->t_tfo_stats \|= TFO_S_SYN_DATA_SENT;
450	tcpstat.tcps_tfo_syn_data_sent++;
451	}
452	}
453
454	return ret;
455	}
456
457	static inline bool
458	tcp_send_ecn_flags_on_syn(struct tcpcb *tp)
459	{
460	/ We allow Accurate ECN negotiation on first retransmission as well /
461	bool send_on_first_retrans = (tp->ecn_flags & TE_ACE_SETUPSENT) &&
462	(tp->t_rxtshift <= `1`);
463
464	return !(tp->ecn_flags & (TE_SETUPSENT \| TE_ACE_SETUPSENT)) \|\| send_on_first_retrans;
465	}
466
467	void
468	tcp_set_ecn(struct tcpcb tp, struct* ifnet *ifp)
469	{
470	boolean_t inbound;
471
472	/*
473	* Socket option has precedence
474	*/
475	if (tp->ecn_flags & TE_ECN_MODE_ENABLE) {
476	tp->ecn_flags \|= TE_ENABLE_ECN;
477	goto check_heuristic;
478	}
479
480	if (tp->ecn_flags & TE_ECN_MODE_DISABLE) {
481	tp->ecn_flags &= ~TE_ENABLE_ECN;
482	return;
483	}
484	/*
485	* Per interface setting comes next
486	*/
487	if (ifp != NULL) {
488	if (ifp->if_eflags & IFEF_ECN_ENABLE) {
489	tp->ecn_flags \|= TE_ENABLE_ECN;
490	goto check_heuristic;
491	}
492
493	if (ifp->if_eflags & IFEF_ECN_DISABLE) {
494	tp->ecn_flags &= ~TE_ENABLE_ECN;
495	return;
496	}
497	}
498	/*
499	* System wide settings come last
500	*/
501	inbound = (tp->t_inpcb->inp_socket->so_head != NULL);
502	if ((inbound && tcp_ecn_inbound == `1`) \|\|
503	(!inbound && tcp_ecn_outbound == `1`)) {
504	tp->ecn_flags \|= TE_ENABLE_ECN;
505	goto check_heuristic;
506	} else {
507	tp->ecn_flags &= ~TE_ENABLE_ECN;
508	}
509
510	return;
511
512	check_heuristic:
513	if (TCP_ACC_ECN_ENABLED(tp)) {
514	/ Allow ECN when Accurate ECN is enabled until heuristics are fixed /
515	tp->ecn_flags \|= TE_ENABLE_ECN;
516	/ Set the accurate ECN state /
517	if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_disabled) {
518	tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_enabled;
519	}
520	if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_feature_disabled) {
521	tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_enabled;
522	}
523	}
524	if (!tcp_heuristic_do_ecn(tp) && !TCP_ACC_ECN_ENABLED(tp)) {
525	/ Allow ECN when Accurate ECN is enabled until heuristics are fixed /
526	tp->ecn_flags &= ~TE_ENABLE_ECN;
527	}
528	/*
529	* If the interface setting, system-level setting and heuristics
530	* allow to enable ECN, randomly select 5% of connections to
531	* enable it
532	*/
533	if ((tp->ecn_flags & (TE_ECN_MODE_ENABLE \| TE_ECN_MODE_DISABLE
534	\| TE_ENABLE_ECN)) == TE_ENABLE_ECN) {
535	/*
536	* Use the random value in iss for randomizing
537	* this selection
538	*/
539	if ((tp->iss % `100`) >= tcp_ecn_setup_percentage && !TCP_ACC_ECN_ENABLED(tp)) {
540	/ Don't disable Accurate ECN randomly /
541	tp->ecn_flags &= ~TE_ENABLE_ECN;
542	}
543	}
544	}
545
546	int
547	tcp_flight_size(struct tcpcb *tp)
548	{
549	int ret;
550
551	VERIFY(tp->sackhint.sack_bytes_acked >= `0`);
552	VERIFY(tp->sackhint.sack_bytes_rexmit >= `0`);
553
554	/*
555	* RFC6675, SetPipe (), SACK'd bytes are discounted. All the rest is still in-flight.
556	*/
557	ret = tp->snd_nxt - tp->snd_una - tp->sackhint.sack_bytes_acked;
558
559	if (ret < `0`) {
560	/*
561	* This happens when the RTO-timer fires because snd_nxt gets artificially
562	* decreased. If we then receive some SACK-blogs, sack_bytes_acked is
563	* going to be high.
564	*/
565	ret = `0`;
566	}
567
568	return ret;
569	}
570
571	/*
572	* Either of ECT0 or ECT1 flag should be set
573	* when this function is called
574	*/
575	static void
576	tcp_add_accecn_option(struct tcpcb tp, uint16_t flags, uint32_t lp, uint8_t *optlen)
577	{
578	uint8_t max_len = TCP_MAXOLEN - *optlen;
579	uint8_t len = TCPOLEN_ACCECN_EMPTY;
580
581	uint32_t e1b = (uint32_t)(tp->t_rcv_ect1_bytes & TCP_ACO_MASK);
582	uint32_t e0b = (uint32_t)(tp->t_rcv_ect0_bytes & TCP_ACO_MASK);
583	uint32_t ceb = (uint32_t)(tp->t_rcv_ce_bytes & TCP_ACO_MASK);
584
585	if (max_len < TCPOLEN_ACCECN_EMPTY) {
586	TCP_LOG(tp, "not enough space to add any AccECN option");
587	return;
588	}
589
590	if (!(flags & TH_SYN \|\| (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) \|\|
591	tp->snd_una == tp->iss + `1` \|\|
592	tp->ecn_flags & (TE_ACO_ECT1 \| TE_ACO_ECT0))) {
593	/*
594	* Since this is neither a SYN-ACK packet, nor the final ACK of
595	* the 3WHS (nor the first acked data segment) nor any of the ECT byte
596	* counter flags are set, no need to send the option.
597	*/
598	return;
599	}
600
601	if ((flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK) &&
602	tp->t_rxtshift >= `1`) {
603	/*
604	* If this is a SYN-ACK retransmission (first),
605	* retry without AccECN option and just with ACE fields.
606	* From second retransmission onwards, we don't send any
607	* Accurate ECN state.
608	*/
609	return;
610	}
611
612	if (max_len < (TCPOLEN_ACCECN_EMPTY + `1` * TCPOLEN_ACCECN_COUNTER)) {
613	/ Can carry EMPTY option which can be used to test path in SYN-ACK packet /
614	if (flags & TH_SYN) {
615	*lp++ = htonl((TCPOPT_ACCECN1 << `24`) \| (len << `16`) \|
616	(TCPOPT_NOP << `8`) \| TCPOPT_NOP);
617	optlen += len + `2`; /* 2 NOPs /
618	TCP_LOG(tp, "add empty AccECN option, optlen=%u", *optlen);
619	}
620	} else if (max_len < (TCPOLEN_ACCECN_EMPTY + `2` * TCPOLEN_ACCECN_COUNTER)) {
621	/ Can carry one option /
622	len += `1` * TCPOLEN_ACCECN_COUNTER;
623	if (tp->ecn_flags & TE_ACO_ECT1) {
624	*lp++ = htonl((TCPOPT_ACCECN1 << `24`) \| (len << `16`) \| ((e1b >> `8`) & `0xffff`));
625	*lp++ = htonl(((e1b & `0xff`) << `24`) \| (TCPOPT_NOP << `16`) \| (TCPOPT_NOP << `8`) \| TCPOPT_NOP);
626	} else {
627	*lp++ = htonl((TCPOPT_ACCECN0 << `24`) \| (len << `16`) \| ((e0b >> `8`) & `0xffff`));
628	*lp++ = htonl(((e0b & `0xff`) << `24`) \| (TCPOPT_NOP << `16`) \| (TCPOPT_NOP << `8`) \| TCPOPT_NOP);
629	}
630	optlen += len + `3`; /* 3 NOPs /
631	TCP_LOG(tp, "add single counter for AccECN option, optlen=%u", *optlen);
632	} else if (max_len < (TCPOLEN_ACCECN_EMPTY + `3` * TCPOLEN_ACCECN_COUNTER)) {
633	/ Can carry two options /
634	len += `2` * TCPOLEN_ACCECN_COUNTER;
635	if (tp->ecn_flags & TE_ACO_ECT1) {
636	*lp++ = htonl((TCPOPT_ACCECN1 << `24`) \| (len << `16`) \| ((e1b >> `8`) & `0xffff`));
637	*lp++ = htonl(((e1b & `0xff`) << `24`) \| (ceb & `0xffffff`));
638	} else {
639	*lp++ = htonl((TCPOPT_ACCECN0 << `24`) \| (len << `16`) \| ((e0b >> `8`) & `0xffff`));
640	*lp++ = htonl(((e0b & `0xff`) << `24`) \| (ceb & `0xffffff`));
641	}
642	optlen += len; /* 0 NOPs /
643	TCP_LOG(tp, "add 2 counters for AccECN option, optlen=%u", *optlen);
644	} else {
645	/*
646	* TCP option sufficient to hold full AccECN option
647	* but send counter that changed during the entire connection.
648	*/
649	len += `3` * TCPOLEN_ACCECN_COUNTER;
650	/ Can carry all three options /
651	if (tp->ecn_flags & TE_ACO_ECT1) {
652	*lp++ = htonl((TCPOPT_ACCECN1 << `24`) \| (len << `16`) \| ((e1b >> `8`) & `0xffff`));
653	*lp++ = htonl(((e1b & `0xff`) << `24`) \| (ceb & `0xffffff`));
654	*lp++ = htonl(((e0b & `0xffffff`) << `8`) \| TCPOPT_NOP);
655	} else {
656	*lp++ = htonl((TCPOPT_ACCECN0 << `24`) \| (len << `16`) \| ((e0b >> `8`) & `0xffff`));
657	*lp++ = htonl(((e0b & `0xff`) << `24`) \| (ceb & `0xffffff`));
658	*lp++ = htonl(((e1b & `0xffffff`) << `8`) \| TCPOPT_NOP);
659	}
660	optlen += len + `1`; /* 1 NOP /
661	TCP_LOG(tp, "add all 3 counters for AccECN option, optlen=%u", *optlen);
662	}
663	}
664
665	/*
666	* Tcp output routine: figure out what should be sent and send it.
667	*
668	* Returns: 0 Success
669	* EADDRNOTAVAIL
670	* ENOBUFS
671	* EMSGSIZE
672	* EHOSTUNREACH
673	* ENETDOWN
674	* ip_output_list:ENOMEM
675	* ip_output_list:EADDRNOTAVAIL
676	* ip_output_list:ENETUNREACH
677	* ip_output_list:EHOSTUNREACH
678	* ip_output_list:EACCES
679	* ip_output_list:EMSGSIZE
680	* ip_output_list:ENOBUFS
681	* ip_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
682	* ip6_output_list:EINVAL
683	* ip6_output_list:EOPNOTSUPP
684	* ip6_output_list:EHOSTUNREACH
685	* ip6_output_list:EADDRNOTAVAIL
686	* ip6_output_list:ENETUNREACH
687	* ip6_output_list:EMSGSIZE
688	* ip6_output_list:ENOBUFS
689	* ip6_output_list:??? [ignorable: mostly IPSEC/firewall/DLIL]
690	*/
691	int
692	tcp_output(struct tcpcb *tp)
693	{
694	struct inpcb *inp = tp->t_inpcb;
695	struct socket *so = inp->inp_socket;
696	int32_t len, recwin, sendwin, off;
697	uint32_t max_len = `0`;
698	uint16_t flags;
699	int error;
700	struct mbuf *m;
701	struct ip *ip = NULL;
702	struct ip6_hdr *ip6 = NULL;
703	struct tcphdr *th;
704	u_char opt[TCP_MAXOLEN];
705	unsigned int ipoptlen, optlen, hdrlen;
706	int idle, sendalot, lost = `0`;
707	int sendalot_cnt = `0`;
708	int i, sack_rxmit;
709	int tso = `0`;
710	int sack_bytes_rxmt;
711	tcp_seq old_snd_nxt = `0`;
712	struct sackhole *p;
713	#if IPSEC
714	size_t ipsec_optlen = `0`;
715	#endif /* IPSEC */
716	int idle_time = `0`;
717	struct mbuf *packetlist = NULL;
718	struct mbuf *tp_inp_options = inp->inp_depend4.inp4_options;
719	int isipv6 = inp->inp_vflag & INP_IPV6;
720	int packchain_listadd = `0`;
721	int so_options = so->so_options;
722	struct rtentry *rt;
723	u_int32_t svc_flags = `0`, allocated_len;
724	#if MPTCP
725	boolean_t mptcp_acknow;
726	#endif /* MPTCP */
727	boolean_t cell = FALSE;
728	boolean_t wifi = FALSE;
729	boolean_t wired = FALSE;
730	boolean_t sack_rescue_rxt = FALSE;
731	int sotc = so->so_traffic_class;
732	boolean_t do_not_compress = FALSE;
733	boolean_t sack_rxmted = FALSE;
734
735	/*
736	* Determine length of data that should be transmitted,
737	* and flags that will be used.
738	* If there is some data or critical controls (SYN, RST)
739	* to send, then transmit; otherwise, investigate further.
740	*/
741	idle = (tp->t_flags & TF_LASTIDLE) \|\| (tp->snd_max == tp->snd_una);
742
743	/ Since idle_time is signed integer, the following integer subtraction*
744	* will take care of wrap around of tcp_now
745	*/
746	idle_time = tcp_now - tp->t_rcvtime;
747	if (idle && idle_time >= TCP_IDLETIMEOUT(tp)) {
748	if (CC_ALGO(tp)->after_idle != NULL &&
749	(tp->tcp_cc_index != TCP_CC_ALGO_CUBIC_INDEX \|\|
750	idle_time >= TCP_CC_CWND_NONVALIDATED_PERIOD)) {
751	CC_ALGO(tp)->after_idle(tp);
752	tcp_ccdbg_trace(tp, NULL, event: TCP_CC_IDLE_TIMEOUT);
753	}
754
755	/*
756	* Do some other tasks that need to be done after
757	* idle time
758	*/
759	if (!SLIST_EMPTY(&tp->t_rxt_segments)) {
760	tcp_rxtseg_clean(tp);
761	}
762
763	/ If stretch ack was auto-disabled, re-evaluate it /
764	tcp_cc_after_idle_stretchack(tp);
765	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
766	}
767	tp->t_flags &= ~TF_LASTIDLE;
768	if (idle) {
769	if (tp->t_flags & TF_MORETOCOME) {
770	tp->t_flags \|= TF_LASTIDLE;
771	idle = `0`;
772	}
773	}
774	#if MPTCP
775	if (tp->t_mpflags & TMPF_RESET) {
776	tcp_check_timer_state(tp);
777	/*
778	* Once a RST has been sent for an MPTCP subflow,
779	* the subflow socket stays around until deleted.
780	* No packets such as FINs must be sent after RST.
781	*/
782	return `0`;
783	}
784	#endif /* MPTCP */
785
786	again:
787	#if MPTCP
788	mptcp_acknow = FALSE;
789
790	if (so->so_flags & SOF_MP_SUBFLOW && SEQ_LT(tp->snd_nxt, tp->snd_una)) {
791	os_log_error(mptcp_log_handle, "%s - %lx: snd_nxt is %u and snd_una is %u, cnt %d\n",
792	__func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte),
793	tp->snd_nxt, tp->snd_una, sendalot_cnt);
794	}
795	#endif
796	do_not_compress = FALSE;
797	sendalot_cnt++;
798
799	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_START, `0`, `0`, `0`, `0`, `0`);
800
801	if (isipv6) {
802	KERNEL_DEBUG(DBG_LAYER_BEG,
803	((inp->inp_fport << `16`) \| inp->inp_lport),
804	(((inp->in6p_laddr.s6_addr16[`0`] & `0xffff`) << `16`) \|
805	(inp->in6p_faddr.s6_addr16[`0`] & `0xffff`)),
806	sendalot, `0`, `0`);
807	} else {
808	KERNEL_DEBUG(DBG_LAYER_BEG,
809	((inp->inp_fport << `16`) \| inp->inp_lport),
810	(((inp->inp_laddr.s_addr & `0xffff`) << `16`) \|
811	(inp->inp_faddr.s_addr & `0xffff`)),
812	sendalot, `0`, `0`);
813	}
814	/*
815	* If the route generation id changed, we need to check that our
816	* local (source) IP address is still valid. If it isn't either
817	* return error or silently do nothing (assuming the address will
818	* come back before the TCP connection times out).
819	*/
820	rt = inp->inp_route.ro_rt;
821	if (rt != NULL && ROUTE_UNUSABLE(&tp->t_inpcb->inp_route)) {
822	struct ifnet *ifp;
823	struct in_ifaddr *ia = NULL;
824	struct in6_ifaddr *ia6 = NULL;
825	int found_srcaddr = `0`;
826
827	/ disable multipages at the socket /
828	somultipages(so, FALSE);
829
830	/ Disable TSO for the socket until we know more /
831	tp->t_flags &= ~TF_TSO;
832
833	soif2kcl(so, FALSE);
834
835	if (isipv6) {
836	ia6 = ifa_foraddr6(&inp->in6p_laddr);
837	if (ia6 != NULL) {
838	found_srcaddr = `1`;
839	}
840	} else {
841	ia = ifa_foraddr(inp->inp_laddr.s_addr);
842	if (ia != NULL) {
843	found_srcaddr = `1`;
844	}
845	}
846
847	/ check that the source address is still valid /
848	if (found_srcaddr == `0`) {
849	soevent(so,
850	hint: (SO_FILT_HINT_LOCKED \| SO_FILT_HINT_NOSRCADDR));
851
852	if (tp->t_state >= TCPS_CLOSE_WAIT) {
853	tcp_drop(tp, EADDRNOTAVAIL);
854	return EADDRNOTAVAIL;
855	}
856
857	/*
858	* Set retransmit timer if it wasn't set,
859	* reset Persist timer and shift register as the
860	* advertised peer window may not be valid anymore
861	*/
862	if (tp->t_timer[TCPT_REXMT] == `0`) {
863	tp->t_timer[TCPT_REXMT] =
864	OFFSET_FROM_START(tp, tp->t_rxtcur);
865	if (tp->t_timer[TCPT_PERSIST] != `0`) {
866	tp->t_timer[TCPT_PERSIST] = `0`;
867	tp->t_persist_stop = `0`;
868	TCP_RESET_REXMT_STATE(tp);
869	}
870	}
871
872	if (tp->t_pktlist_head != NULL) {
873	m_freem_list(tp->t_pktlist_head);
874	}
875	TCP_PKTLIST_CLEAR(tp);
876
877	/ drop connection if source address isn't available /
878	if (so->so_flags & SOF_NOADDRAVAIL) {
879	tcp_drop(tp, EADDRNOTAVAIL);
880	return EADDRNOTAVAIL;
881	} else {
882	TCP_LOG_OUTPUT(tp, "no source address silently ignored");
883	tcp_check_timer_state(tp);
884	return `0`; / silently ignore, keep data in socket: address may be back /
885	}
886	}
887	if (ia != NULL) {
888	ifa_remref(ifa: &ia->ia_ifa);
889	}
890
891	if (ia6 != NULL) {
892	ifa_remref(ifa: &ia6->ia_ifa);
893	}
894
895	/*
896	* Address is still valid; check for multipages capability
897	* again in case the outgoing interface has changed.
898	*/
899	RT_LOCK(rt);
900	if ((ifp = rt->rt_ifp) != NULL) {
901	somultipages(so, (ifp->if_hwassist & IFNET_MULTIPAGES));
902	tcp_set_tso(tp, ifp);
903	soif2kcl(so, (ifp->if_eflags & IFEF_2KCL));
904	tcp_set_ecn(tp, ifp);
905	}
906	if (rt->rt_flags & RTF_UP) {
907	RT_GENID_SYNC(rt);
908	}
909	/*
910	* See if we should do MTU discovery. Don't do it if:
911	* 1) it is disabled via the sysctl
912	* 2) the route isn't up
913	* 3) the MTU is locked (if it is, then discovery
914	* has been disabled)
915	*/
916
917	if (!path_mtu_discovery \|\| ((rt != NULL) &&
918	(!(rt->rt_flags & RTF_UP) \|\|
919	(rt->rt_rmx.rmx_locks & RTV_MTU)))) {
920	tp->t_flags &= ~TF_PMTUD;
921	} else {
922	tp->t_flags \|= TF_PMTUD;
923	}
924
925	RT_UNLOCK(rt);
926	}
927
928	if (rt != NULL) {
929	cell = IFNET_IS_CELLULAR(rt->rt_ifp);
930	wifi = (!cell && IFNET_IS_WIFI(rt->rt_ifp));
931	wired = (!wifi && IFNET_IS_WIRED(rt->rt_ifp));
932	}
933
934	/*
935	* If we've recently taken a timeout, snd_max will be greater than
936	* snd_nxt. There may be SACK information that allows us to avoid
937	* resending already delivered data. Adjust snd_nxt accordingly.
938	*/
939	if (SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max)) {
940	max_len = tcp_sack_adjust(tp);
941	}
942	sendalot = `0`;
943	off = tp->snd_nxt - tp->snd_una;
944	sendwin = min(a: tp->snd_wnd, b: tp->snd_cwnd);
945
946	if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > `0`) {
947	sendwin = min(a: sendwin, b: slowlink_wsize);
948	}
949
950	flags = tcp_outflags[tp->t_state];
951	/*
952	* Send any SACK-generated retransmissions. If we're explicitly
953	* trying to send out new data (when sendalot is 1), bypass this
954	* function. If we retransmit in fast recovery mode, decrement
955	* snd_cwnd, since we're replacing a (future) new transmission
956	* with a retransmission now, and we previously incremented
957	* snd_cwnd in tcp_input().
958	*/
959	/*
960	* Still in sack recovery , reset rxmit flag to zero.
961	*/
962	sack_rxmit = `0`;
963	sack_bytes_rxmt = `0`;
964	len = `0`;
965	p = NULL;
966	if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp) &&
967	(p = tcp_sack_output(tp, sack_bytes_rexmt: &sack_bytes_rxmt))) {
968	int32_t cwin;
969
970	if (tcp_do_better_lr) {
971	cwin = min(a: tp->snd_wnd, b: tp->snd_cwnd) - tcp_flight_size(tp);
972	if (cwin <= `0` && sack_rxmted == FALSE) {
973	/ Allow to clock out at least on per period /
974	cwin = tp->t_maxseg;
975	}
976
977	sack_rxmted = TRUE;
978	} else {
979	cwin = min(a: tp->snd_wnd, b: tp->snd_cwnd) - sack_bytes_rxmt;
980	}
981	if (cwin < `0`) {
982	cwin = `0`;
983	}
984	/* Do not retransmit SACK segments beyond snd_recover */
985	if (SEQ_GT(p->end, tp->snd_recover)) {
986	/*
987	* (At least) part of sack hole extends beyond
988	* snd_recover. Check to see if we can rexmit data
989	* for this hole.
990	*/
991	if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
992	/*
993	* Can't rexmit any more data for this hole.
994	* That data will be rexmitted in the next
995	* sack recovery episode, when snd_recover
996	* moves past p->rxmit.
997	*/
998	p = NULL;
999	goto after_sack_rexmit;
1000	} else {
1001	/* Can rexmit part of the current hole */
1002	len = ((int32_t)min(a: cwin,
1003	b: tp->snd_recover - p->rxmit));
1004	}
1005	} else {
1006	len = ((int32_t)min(a: cwin, b: p->end - p->rxmit));
1007	}
1008	if (len > `0`) {
1009	off = p->rxmit - tp->snd_una;
1010	sack_rxmit = `1`;
1011	sendalot = `1`;
1012	/* Everything sent after snd_nxt will allow us to account for fast-retransmit of the retransmitted segment */
1013	tp->send_highest_sack = tp->snd_nxt;
1014	tp->t_new_dupacks = `0`;
1015	tcpstat.tcps_sack_rexmits++;
1016	tcpstat.tcps_sack_rexmit_bytes +=
1017	min(a: len, b: tp->t_maxseg);
1018	} else {
1019	len = `0`;
1020	}
1021	}
1022	after_sack_rexmit:
1023	/*
1024	* Get standard flags, and add SYN or FIN if requested by 'hidden'
1025	* state flags.
1026	*/
1027	if (tp->t_flags & TF_NEEDFIN) {
1028	flags \|= TH_FIN;
1029	}
1030
1031	/*
1032	* If in persist timeout with window of 0, send 1 byte.
1033	* Otherwise, if window is small but nonzero
1034	* and timer expired, we will send what we can
1035	* and go to transmit state.
1036	*/
1037	if (tp->t_flagsext & TF_FORCE) {
1038	if (sendwin == `0`) {
1039	/*
1040	* If we still have some data to send, then
1041	* clear the FIN bit. Usually this would
1042	* happen below when it realizes that we
1043	* aren't sending all the data. However,
1044	* if we have exactly 1 byte of unsent data,
1045	* then it won't clear the FIN bit below,
1046	* and if we are in persist state, we wind
1047	* up sending the packet without recording
1048	* that we sent the FIN bit.
1049	*
1050	* We can't just blindly clear the FIN bit,
1051	* because if we don't have any more data
1052	* to send then the probe will be the FIN
1053	* itself.
1054	*/
1055	if (off < so->so_snd.sb_cc) {
1056	flags &= ~TH_FIN;
1057	}
1058	sendwin = `1`;
1059	} else {
1060	tp->t_timer[TCPT_PERSIST] = `0`;
1061	tp->t_persist_stop = `0`;
1062	TCP_RESET_REXMT_STATE(tp);
1063	}
1064	}
1065
1066	/*
1067	* If snd_nxt == snd_max and we have transmitted a FIN, the
1068	* offset will be > 0 even if so_snd.sb_cc is 0, resulting in
1069	* a negative length. This can also occur when TCP opens up
1070	* its congestion window while receiving additional duplicate
1071	* acks after fast-retransmit because TCP will reset snd_nxt
1072	* to snd_max after the fast-retransmit.
1073	*
1074	* In the normal retransmit-FIN-only case, however, snd_nxt will
1075	* be set to snd_una, the offset will be 0, and the length may
1076	* wind up 0.
1077	*
1078	* If sack_rxmit is true we are retransmitting from the scoreboard
1079	* in which case len is already set.
1080	*/
1081	if (sack_rxmit == `0`) {
1082	if (sack_bytes_rxmt == `0`) {
1083	len = min(a: so->so_snd.sb_cc, b: sendwin) - off;
1084	} else {
1085	int32_t cwin;
1086
1087	if (tcp_do_better_lr) {
1088	cwin = tp->snd_cwnd - tcp_flight_size(tp);
1089	} else {
1090	cwin = tp->snd_cwnd -
1091	(tp->snd_nxt - tp->sack_newdata) -
1092	sack_bytes_rxmt;
1093	}
1094	if (cwin < `0`) {
1095	cwin = `0`;
1096	}
1097	/*
1098	* We are inside of a SACK recovery episode and are
1099	* sending new data, having retransmitted all the
1100	* data possible in the scoreboard.
1101	*/
1102	len = min(a: so->so_snd.sb_cc, b: tp->snd_wnd) - off;
1103	/*
1104	* Don't remove this (len > 0) check !
1105	* We explicitly check for len > 0 here (although it
1106	* isn't really necessary), to work around a gcc
1107	* optimization issue - to force gcc to compute
1108	* len above. Without this check, the computation
1109	* of len is bungled by the optimizer.
1110	*/
1111	if (len > `0`) {
1112	len = imin(a: len, b: cwin);
1113	} else {
1114	len = `0`;
1115	}
1116	/*
1117	* At this point SACK recovery can not send any
1118	* data from scoreboard or any new data. Check
1119	* if we can do a rescue retransmit towards the
1120	* tail end of recovery window.
1121	*/
1122	if (len == `0` && cwin > `0` &&
1123	SEQ_LT(tp->snd_fack, tp->snd_recover) &&
1124	!(tp->t_flagsext & TF_RESCUE_RXT)) {
1125	len = min(a: (tp->snd_recover - tp->snd_fack),
1126	b: tp->t_maxseg);
1127	len = imin(a: len, b: cwin);
1128	old_snd_nxt = tp->snd_nxt;
1129	sack_rescue_rxt = TRUE;
1130	tp->snd_nxt = tp->snd_recover - len;
1131	/*
1132	* If FIN has been sent, snd_max
1133	* must have been advanced to cover it.
1134	*/
1135	if ((tp->t_flags & TF_SENTFIN) &&
1136	tp->snd_max == tp->snd_recover) {
1137	tp->snd_nxt--;
1138	}
1139
1140	off = tp->snd_nxt - tp->snd_una;
1141	sendalot = `0`;
1142	tp->t_flagsext \|= TF_RESCUE_RXT;
1143	}
1144	}
1145	}
1146
1147	if (max_len != `0` && len > `0`) {
1148	len = min(a: len, b: max_len);
1149	}
1150
1151	/*
1152	* Lop off SYN bit if it has already been sent. However, if this
1153	* is SYN-SENT state and if segment contains data and if we don't
1154	* know that foreign host supports TAO, suppress sending segment.
1155	*/
1156	if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
1157	if (tp->t_state == TCPS_SYN_RECEIVED && tfo_enabled(tp) && tp->snd_nxt == tp->snd_una + `1`) {
1158	/* We are sending the SYN again! */
1159	off--;
1160	len++;
1161	} else {
1162	if (tp->t_state != TCPS_SYN_RECEIVED \|\| tfo_enabled(tp)) {
1163	flags &= ~TH_SYN;
1164	}
1165
1166	off--;
1167	len++;
1168	if (len > `0` && tp->t_state == TCPS_SYN_SENT) {
1169	while (inp->inp_sndinprog_cnt == `0` &&
1170	tp->t_pktlist_head != NULL) {
1171	packetlist = tp->t_pktlist_head;
1172	packchain_listadd = tp->t_lastchain;
1173	packchain_sent++;
1174	TCP_PKTLIST_CLEAR(tp);
1175
1176	error = tcp_ip_output(so, tp, packetlist,
1177	packchain_listadd, tp_inp_options,
1178	(so_options & SO_DONTROUTE),
1179	(sack_rxmit \|\| (sack_bytes_rxmt != `0`)),
1180	isipv6);
1181	}
1182
1183	/*
1184	* tcp was closed while we were in ip,
1185	* resume close
1186	*/
1187	if (inp->inp_sndinprog_cnt == `0` &&
1188	(tp->t_flags & TF_CLOSING)) {
1189	tp->t_flags &= ~TF_CLOSING;
1190	(void) tcp_close(tp);
1191	} else {
1192	tcp_check_timer_state(tp);
1193	}
1194	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END,
1195	`0`, `0`, `0`, `0`, `0`);
1196	return `0`;
1197	}
1198	}
1199	}
1200
1201	/*
1202	* Be careful not to send data and/or FIN on SYN segments.
1203	* This measure is needed to prevent interoperability problems
1204	* with not fully conformant TCP implementations.
1205	*
1206	* In case of TFO, we handle the setting of the len in
1207	* tcp_tfo_check. In case TFO is not enabled, never ever send
1208	* SYN+data.
1209	*/
1210	if ((flags & TH_SYN) && !tfo_enabled(tp)) {
1211	len = `0`;
1212	flags &= ~TH_FIN;
1213	}
1214
1215	/*
1216	* Don't send a RST with data.
1217	*/
1218	if (flags & TH_RST) {
1219	len = `0`;
1220	}
1221
1222	if ((flags & TH_SYN) && tp->t_state <= TCPS_SYN_SENT && tfo_enabled(tp)) {
1223	len = tcp_tfo_check(tp, len);
1224	}
1225
1226	/*
1227	* The check here used to be (len < 0). Sometimes len is zero
1228	* when the congestion window is closed and we need to check
1229	* if persist timer has to be set in that case. But don't set
1230	* persist until connection is established.
1231	*/
1232	if (len <= `0` && !(flags & TH_SYN)) {
1233	/*
1234	* If FIN has been sent but not acked,
1235	* but we haven't been called to retransmit,
1236	* len will be < 0. Otherwise, window shrank
1237	* after we sent into it. If window shrank to 0,
1238	* cancel pending retransmit, pull snd_nxt back
1239	* to (closed) window, and set the persist timer
1240	* if it isn't already going. If the window didn't
1241	* close completely, just wait for an ACK.
1242	*/
1243	len = `0`;
1244	if (sendwin == `0`) {
1245	tp->t_timer[TCPT_REXMT] = `0`;
1246	tp->t_timer[TCPT_PTO] = `0`;
1247	TCP_RESET_REXMT_STATE(tp);
1248	tp->snd_nxt = tp->snd_una;
1249	off = `0`;
1250	if (tp->t_timer[TCPT_PERSIST] == `0`) {
1251	tcp_setpersist(tp);
1252	}
1253	}
1254	}
1255
1256	/*
1257	* Automatic sizing of send socket buffer. Increase the send
1258	* socket buffer size if all of the following criteria are met
1259	* 1. the receiver has enough buffer space for this data
1260	* 2. send buffer is filled to 7/8th with data (so we actually
1261	* have data to make use of it);
1262	* 3. our send window (slow start and congestion controlled) is
1263	* larger than sent but unacknowledged data in send buffer.
1264	*/
1265	if (!INP_WAIT_FOR_IF_FEEDBACK(inp) && !IN_FASTRECOVERY(tp) &&
1266	(so->so_snd.sb_flags & (SB_AUTOSIZE \| SB_TRIM)) == SB_AUTOSIZE) {
1267	if ((tp->snd_wnd / `4` * `5`) >= so->so_snd.sb_hiwat &&
1268	so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / `8` * `7`) &&
1269	sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
1270	if (sbreserve(sb: &so->so_snd,
1271	cc: min(a: so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
1272	b: tcp_autosndbuf_max)) == `1`) {
1273	so->so_snd.sb_idealsize = so->so_snd.sb_hiwat;
1274	}
1275	}
1276	}
1277
1278	/*
1279	* Truncate to the maximum segment length or enable TCP Segmentation
1280	* Offloading (if supported by hardware) and ensure that FIN is removed
1281	* if the length no longer contains the last data byte.
1282	*
1283	* TSO may only be used if we are in a pure bulk sending state.
1284	* The presence of TCP-MD5, SACK retransmits, SACK advertisements,
1285	* filters and IP options, as well as disabling hardware checksum
1286	* offload prevent using TSO. With TSO the TCP header is the same
1287	* (except for the sequence number) for all generated packets. This
1288	* makes it impossible to transmit any options which vary per generated
1289	* segment or packet.
1290	*
1291	* The length of TSO bursts is limited to TCP_MAXWIN. That limit and
1292	* removal of FIN (if not already caught here) are handled later after
1293	* the exact length of the TCP options are known.
1294	*/
1295	#if IPSEC
1296	/*
1297	* Pre-calculate here as we save another lookup into the darknesses
1298	* of IPsec that way and can actually decide if TSO is ok.
1299	*/
1300	if (ipsec_bypass == `0`) {
1301	ipsec_optlen = ipsec_hdrsiz_tcp(tp);
1302	}
1303	#endif
1304	if (len > tp->t_maxseg) {
1305	if ((tp->t_flags & TF_TSO) && tcp_do_tso && hwcksum_tx &&
1306	kipf_count == `0` &&
1307	tp->rcv_numsacks == `0` && sack_rxmit == `0` &&
1308	sack_bytes_rxmt == `0` &&
1309	inp->inp_options == NULL &&
1310	inp->in6p_options == NULL
1311	#if IPSEC
1312	&& ipsec_optlen == `0`
1313	#endif
1314	) {
1315	tso = `1`;
1316	sendalot = `0`;
1317	} else {
1318	len = tp->t_maxseg;
1319	sendalot = `1`;
1320	tso = `0`;
1321	}
1322	} else {
1323	tso = `0`;
1324	}
1325
1326	/* Send one segment or less as a tail loss probe */
1327	if (tp->t_flagsext & TF_SENT_TLPROBE) {
1328	len = min(a: len, b: tp->t_maxseg);
1329	sendalot = `0`;
1330	tso = `0`;
1331	}
1332
1333	#if MPTCP
1334	if (so->so_flags & SOF_MP_SUBFLOW && off < `0`) {
1335	os_log_error(mptcp_log_handle, "%s - %lx: offset is negative! len %d off %d\n",
1336	__func__, (unsigned long)VM_KERNEL_ADDRPERM(tp->t_mpsub->mpts_mpte),
1337	len, off);
1338	}
1339
1340	if ((so->so_flags & SOF_MP_SUBFLOW) &&
1341	!(tp->t_mpflags & TMPF_TCP_FALLBACK)) {
1342	int newlen = len;
1343	struct mptcb *mp_tp = tptomptp(tp);
1344	if (tp->t_state >= TCPS_ESTABLISHED &&
1345	(tp->t_mpflags & TMPF_SND_MPPRIO \|\|
1346	tp->t_mpflags & TMPF_SND_REM_ADDR \|\|
1347	tp->t_mpflags & TMPF_SND_MPFAIL \|\|
1348	(tp->t_mpflags & TMPF_SND_KEYS &&
1349	mp_tp->mpt_version == MPTCP_VERSION_0) \|\|
1350	tp->t_mpflags & TMPF_SND_JACK \|\|
1351	tp->t_mpflags & TMPF_MPTCP_ECHO_ADDR)) {
1352	if (len > `0`) {
1353	len = `0`;
1354	tso = `0`;
1355	}
1356	/*
1357	* On a new subflow, don't try to send again, because
1358	* we are still waiting for the fourth ack.
1359	*/
1360	if (!(tp->t_mpflags & TMPF_PREESTABLISHED)) {
1361	sendalot = `1`;
1362	}
1363	mptcp_acknow = TRUE;
1364	} else {
1365	mptcp_acknow = FALSE;
1366	}
1367	/*
1368	* The contiguous bytes in the subflow socket buffer can be
1369	* discontiguous at the MPTCP level. Since only one DSS
1370	* option can be sent in one packet, reduce length to match
1371	* the contiguous MPTCP level. Set sendalot to send remainder.
1372	*/
1373	if (len > `0` && off >= `0`) {
1374	newlen = mptcp_adj_sendlen(so, off);
1375	}
1376
1377	if (newlen < len) {
1378	len = newlen;
1379	if (len <= tp->t_maxseg) {
1380	tso = `0`;
1381	}
1382	}
1383	}
1384	#endif /* MPTCP */
1385
1386	if (sack_rxmit) {
1387	if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) {
1388	flags &= ~TH_FIN;
1389	}
1390	} else {
1391	if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) {
1392	flags &= ~TH_FIN;
1393	}
1394	}
1395	/*
1396	* Compare available window to amount of window
1397	* known to peer (as advertised window less
1398	* next expected input). If the difference is at least two
1399	* max size segments, or at least 25% of the maximum possible
1400	* window, then want to send a window update to peer.
1401	*/
1402	recwin = tcp_sbspace(tp);
1403
1404	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
1405	if (recwin < (int32_t)(so->so_rcv.sb_hiwat / `4`) &&
1406	recwin < (int)tp->t_maxseg) {
1407	recwin = `0`;
1408	}
1409	} else {
1410	struct mptcb *mp_tp = tptomptp(tp);
1411	struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
1412
1413	if (recwin < (int32_t)(mp_so->so_rcv.sb_hiwat / `4`) &&
1414	recwin < (int)tp->t_maxseg) {
1415	recwin = `0`;
1416	}
1417	}
1418
1419	#if TRAFFIC_MGT
1420	if (tcp_recv_bg == `1` \|\| IS_TCP_RECV_BG(so)) {
1421	/*
1422	* Timestamp MUST be supported to use rledbat unless we haven't
1423	* yet negotiated it.
1424	*/
1425	if (TCP_RLEDBAT_ENABLED(tp) \|\| (tcp_rledbat && tp->t_state <
1426	TCPS_ESTABLISHED)) {
1427	if (recwin > `0` && tcp_cc_rledbat.get_rlwin != NULL) {
1428	/* Min of flow control window and rledbat window */
1429	recwin = imin(a: recwin, b: tcp_cc_rledbat.get_rlwin(tp));
1430	}
1431	} else if (recwin > `0` && tcp_recv_throttle(tp)) {
1432	uint32_t min_iaj_win = tcp_min_iaj_win * tp->t_maxseg;
1433	uint32_t bg_rwintop = tp->rcv_adv;
1434	if (SEQ_LT(bg_rwintop, tp->rcv_nxt + min_iaj_win)) {
1435	bg_rwintop = tp->rcv_nxt + min_iaj_win;
1436	}
1437	recwin = imin(a: (int32_t)(bg_rwintop - tp->rcv_nxt),
1438	b: recwin);
1439	if (recwin < `0`) {
1440	recwin = `0`;
1441	}
1442	}
1443	}
1444	#endif /* TRAFFIC_MGT */
1445
1446	if (recwin > (int32_t)(TCP_MAXWIN << tp->rcv_scale)) {
1447	recwin = (int32_t)(TCP_MAXWIN << tp->rcv_scale);
1448	}
1449
1450	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
1451	if (recwin < (int32_t)(tp->rcv_adv - tp->rcv_nxt)) {
1452	recwin = (int32_t)(tp->rcv_adv - tp->rcv_nxt);
1453	}
1454	} else {
1455	struct mptcb *mp_tp = tptomptp(tp);
1456	int64_t recwin_announced = (int64_t)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt);
1457
1458	/* Don't remove what we announced at the MPTCP-layer */
1459	VERIFY(recwin_announced < INT32_MAX && recwin_announced > INT32_MIN);
1460	if (recwin < (int32_t)recwin_announced) {
1461	recwin = (int32_t)recwin_announced;
1462	}
1463	}
1464
1465	/*
1466	* Sender silly window avoidance. We transmit under the following
1467	* conditions when len is non-zero:
1468	*
1469	* - we've timed out (e.g. persist timer)
1470	* - we need to retransmit
1471	* - We have a full segment (or more with TSO)
1472	* - This is the last buffer in a write()/send() and we are
1473	* either idle or running NODELAY
1474	* - we have more than 1/2 the maximum send window's worth of
1475	* data (receiver may have limited the window size)
1476	*/
1477	if (len) {
1478	if (tp->t_flagsext & TF_FORCE) {
1479	goto send;
1480	}
1481	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
1482	goto send;
1483	}
1484	if (sack_rxmit) {
1485	goto send;
1486	}
1487
1488	/*
1489	* If this here is the first segment after SYN/ACK and TFO
1490	* is being used, then we always send it, regardless of Nagle,...
1491	*/
1492	if (tp->t_state == TCPS_SYN_RECEIVED &&
1493	tfo_enabled(tp) &&
1494	(tp->t_tfo_flags & TFO_F_COOKIE_VALID) &&
1495	tp->snd_nxt == tp->iss + `1`) {
1496	goto send;
1497	}
1498
1499	/*
1500	* Send new data on the connection only if it is
1501	* not flow controlled
1502	*/
1503	if (!INP_WAIT_FOR_IF_FEEDBACK(inp) \|\|
1504	tp->t_state != TCPS_ESTABLISHED) {
1505	if (off + len == tp->snd_wnd) {
1506	/* We are limited by the receiver's window... */
1507	if (tp->t_rcvwnd_limited_start_time == `0`) {
1508	tp->t_rcvwnd_limited_start_time = net_uptime_us();
1509	}
1510	} else {
1511	/* We are no longer limited by the receiver's window... */
1512	if (tp->t_rcvwnd_limited_start_time != `0`) {
1513	uint64_t now = net_uptime_us();
1514
1515	ASSERT(now >= tp->t_rcvwnd_limited_start_time);
1516
1517	tp->t_rcvwnd_limited_total_time += (now - tp->t_rcvwnd_limited_start_time);
1518
1519	tp->t_rcvwnd_limited_start_time = `0`;
1520	}
1521	}
1522
1523	if (len >= tp->t_maxseg) {
1524	goto send;
1525	}
1526
1527	if (!(tp->t_flags & TF_MORETOCOME) &&
1528	(idle \|\| tp->t_flags & TF_NODELAY \|\|
1529	(tp->t_flags & TF_MAXSEGSNT) \|\|
1530	ALLOW_LIMITED_TRANSMIT(tp)) &&
1531	(tp->t_flags & TF_NOPUSH) == `0` &&
1532	(len + off >= so->so_snd.sb_cc \|\|
1533	/*
1534	* MPTCP needs to respect the DSS-mappings. So, it
1535	* may be sending data that could have been
1536	* coalesced, but cannot because of
1537	* mptcp_adj_sendlen().
1538	*/
1539	so->so_flags & SOF_MP_SUBFLOW)) {
1540	goto send;
1541	}
1542	if (len >= tp->max_sndwnd / `2` && tp->max_sndwnd > `0`) {
1543	goto send;
1544	}
1545	} else {
1546	tcpstat.tcps_fcholdpacket++;
1547	}
1548	}
1549
1550	if (recwin > `0`) {
1551	/*
1552	* "adv" is the amount we can increase the window,
1553	* taking into account that we are limited by
1554	* TCP_MAXWIN << tp->rcv_scale.
1555	*/
1556	int32_t adv, oldwin = `0`;
1557	adv = imin(a: recwin, b: (int)TCP_MAXWIN << tp->rcv_scale) -
1558	(tp->rcv_adv - tp->rcv_nxt);
1559
1560	if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) {
1561	oldwin = tp->rcv_adv - tp->rcv_nxt;
1562	}
1563
1564	if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
1565	if (adv >= (int32_t) (`2` * tp->t_maxseg)) {
1566	/*
1567	* Update only if the resulting scaled value of
1568	* the window changed, or if there is a change in
1569	* the sequence since the last ack. This avoids
1570	* what appears as dupe ACKS (see rdar://5640997)
1571	*
1572	* If streaming is detected avoid sending too many
1573	* window updates. We will depend on the delack
1574	* timer to send a window update when needed.
1575	*
1576	* If there is more data to read, don't send an ACK.
1577	* Otherwise we will end up sending many ACKs if the
1578	* application is doing micro-reads.
1579	*/
1580	if (!(tp->t_flags & TF_STRETCHACK) &&
1581	(tp->last_ack_sent != tp->rcv_nxt \|\|
1582	((oldwin + adv) >> tp->rcv_scale) >
1583	(oldwin >> tp->rcv_scale))) {
1584	goto send;
1585	}
1586	}
1587	} else {
1588	if (adv >= (int32_t) (`2` * tp->t_maxseg)) {
1589	/*
1590	* ACK every second full-sized segment, if the
1591	* ACK is advancing or the window becomes bigger
1592	*/
1593	if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
1594	(tp->last_ack_sent != tp->rcv_nxt \|\|
1595	((oldwin + adv) >> tp->rcv_scale) >
1596	(oldwin >> tp->rcv_scale))) {
1597	goto send;
1598	}
1599	} else if (tp->t_flags & TF_DELACK) {
1600	/*
1601	* If we delayed the ACK and the window
1602	* is not advancing by a lot (< 2MSS), ACK
1603	* immediately if the last incoming packet had
1604	* the push flag set and we emptied the buffer.
1605	*
1606	* This takes care of a sender doing small
1607	* repeated writes with Nagle enabled.
1608	*/
1609	if (so->so_rcv.sb_cc == `0` &&
1610	tp->last_ack_sent != tp->rcv_nxt &&
1611	(tp->t_flagsext & TF_LAST_IS_PSH)) {
1612	goto send;
1613	}
1614	}
1615	}
1616	if (`4` * adv >= (int32_t) so->so_rcv.sb_hiwat) {
1617	goto send;
1618	}
1619
1620	/*
1621	* Make sure that the delayed ack timer is set if
1622	* we delayed sending a window update because of
1623	* streaming detection.
1624	*/
1625	if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY &&
1626	(tp->t_flags & TF_STRETCHACK) &&
1627	!(tp->t_flags & TF_DELACK)) {
1628	tp->t_flags \|= TF_DELACK;
1629	tp->t_timer[TCPT_DELACK] =
1630	OFFSET_FROM_START(tp, tcp_delack);
1631	}
1632	}
1633
1634	/*
1635	* Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW
1636	* is also a catch-all for the retransmit timer timeout case.
1637	*/
1638	if (tp->t_flags & TF_ACKNOW) {
1639	if (tp->t_forced_acks > `0`) {
1640	tp->t_forced_acks--;
1641	}
1642	goto send;
1643	}
1644	if ((flags & TH_RST) \|\| (flags & TH_SYN)) {
1645	goto send;
1646	}
1647	if (SEQ_GT(tp->snd_up, tp->snd_una)) {
1648	goto send;
1649	}
1650	#if MPTCP
1651	if (mptcp_acknow) {
1652	goto send;
1653	}
1654	#endif /* MPTCP */
1655	/*
1656	* If our state indicates that FIN should be sent
1657	* and we have not yet done so, then we need to send.
1658	*/
1659	if ((flags & TH_FIN) &&
1660	(!(tp->t_flags & TF_SENTFIN) \|\| tp->snd_nxt == tp->snd_una)) {
1661	goto send;
1662	}
1663	/*
1664	* In SACK, it is possible for tcp_output to fail to send a segment
1665	* after the retransmission timer has been turned off. Make sure
1666	* that the retransmission timer is set.
1667	*/
1668	if (SACK_ENABLED(tp) && (tp->t_state >= TCPS_ESTABLISHED) &&
1669	SEQ_GT(tp->snd_max, tp->snd_una) &&
1670	tp->t_timer[TCPT_REXMT] == `0` &&
1671	tp->t_timer[TCPT_PERSIST] == `0`) {
1672	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
1673	tp->t_rxtcur);
1674	goto just_return;
1675	}
1676	/*
1677	* TCP window updates are not reliable, rather a polling protocol
1678	* using ``persist'' packets is used to ensure receipt of window
1679	* updates. The three ``states'' for the output side are:
1680	* idle not doing retransmits or persists
1681	* persisting to move a small or zero window
1682	* (re)transmitting and thereby not persisting
1683	*
1684	* tp->t_timer[TCPT_PERSIST]
1685	* is set when we are in persist state.
1686	* TF_FORCE (in tp->t_flagsext)
1687	* is set when we are called to send a persist packet.
1688	* tp->t_timer[TCPT_REXMT]
1689	* is set when we are retransmitting
1690	* The output side is idle when both timers are zero.
1691	*
1692	* If send window is too small, there is data to transmit, and no
1693	* retransmit or persist is pending, then go to persist state.
1694	* If nothing happens soon, send when timer expires:
1695	* if window is nonzero, transmit what we can,
1696	* otherwise force out a byte.
1697	*/
1698	if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == `0` &&
1699	tp->t_timer[TCPT_PERSIST] == `0`) {
1700	TCP_RESET_REXMT_STATE(tp);
1701	tcp_setpersist(tp);
1702	}
1703	just_return:
1704	/*
1705	* If there is no reason to send a segment, just return.
1706	* but if there is some packets left in the packet list, send them now.
1707	*/
1708	while (inp->inp_sndinprog_cnt == `0` &&
1709	tp->t_pktlist_head != NULL) {
1710	packetlist = tp->t_pktlist_head;
1711	packchain_listadd = tp->t_lastchain;
1712	packchain_sent++;
1713	TCP_PKTLIST_CLEAR(tp);
1714
1715	error = tcp_ip_output(so, tp, packetlist,
1716	packchain_listadd,
1717	tp_inp_options, (so_options & SO_DONTROUTE),
1718	(sack_rxmit \|\| (sack_bytes_rxmt != `0`)), isipv6);
1719	}
1720	/* tcp was closed while we were in ip; resume close */
1721	if (inp->inp_sndinprog_cnt == `0` &&
1722	(tp->t_flags & TF_CLOSING)) {
1723	tp->t_flags &= ~TF_CLOSING;
1724	(void) tcp_close(tp);
1725	} else {
1726	tcp_check_timer_state(tp);
1727	}
1728	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
1729	return `0`;
1730
1731	send:
1732	/*
1733	* Set TF_MAXSEGSNT flag if the segment size is greater than or
1734	* equal to the max segment size; clear it for a smaller segment.
1735	*/
1736	if (len > `0`) {
1737	do_not_compress = TRUE;
1738
1739	if (len >= tp->t_maxseg) {
1740	tp->t_flags \|= TF_MAXSEGSNT;
1741	} else {
1742	tp->t_flags &= ~TF_MAXSEGSNT;
1743	}
1744	}
1745	/*
1746	* If we are connected and no segment has been ACKed or SACKed yet and we
1747	* hit a retransmission timeout, then we should disable AccECN option
1748	* for the rest of the connection.
1749	*/
1750	if (TCP_ACC_ECN_ON(tp) && tp->t_state == TCPS_ESTABLISHED &&
1751	tp->snd_una == tp->iss + `1` && (tp->snd_fack == `0`)
1752	&& tp->t_rxtshift > `0`) {
1753	if ((tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == `0`) {
1754	tp->ecn_flags \|= TE_RETRY_WITHOUT_ACO;
1755	}
1756	}
1757	/*
1758	* Before ESTABLISHED, force sending of initial options
1759	* unless TCP set not to do any options.
1760	* NOTE: we assume that the IP/TCP header plus TCP options
1761	* always fit in a single mbuf, leaving room for a maximum
1762	* link header, i.e.
1763	* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
1764	*/
1765	optlen = `0`;
1766	if (isipv6) {
1767	hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1768	} else {
1769	hdrlen = sizeof(struct tcpiphdr);
1770	}
1771	if (flags & TH_SYN) {
1772	tp->snd_nxt = tp->iss;
1773	if ((tp->t_flags & TF_NOOPT) == `0`) {
1774	u_short mss;
1775
1776	opt[`0`] = TCPOPT_MAXSEG;
1777	opt[`1`] = TCPOLEN_MAXSEG;
1778	mss = htons((u_short) tcp_mssopt(tp));
1779	(void)memcpy(dst: opt + `2`, src: &mss, n: sizeof(mss));
1780	optlen = TCPOLEN_MAXSEG;
1781
1782	if ((tp->t_flags & TF_REQ_SCALE) &&
1783	((flags & TH_ACK) == `0` \|\|
1784	(tp->t_flags & TF_RCVD_SCALE))) {
1785	((u_int32_t )(void *)(opt + optlen)) = htonl(
1786	TCPOPT_NOP << `24` \|
1787	TCPOPT_WINDOW << `16` \|
1788	TCPOLEN_WINDOW << `8` \|
1789	tp->request_r_scale);
1790	optlen += `4`;
1791	}
1792	#if MPTCP
1793	if (mptcp_enable && (so->so_flags & SOF_MP_SUBFLOW)) {
1794	optlen = mptcp_setup_syn_opts(so, opt, optlen);
1795	}
1796	#endif /* MPTCP */
1797	}
1798	}
1799
1800	/*
1801	* Send a timestamp and echo-reply if this is a SYN and our side
1802	* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
1803	* and our peer have sent timestamps in our SYN's.
1804	*/
1805	if ((tp->t_flags & (TF_REQ_TSTMP \| TF_NOOPT)) == TF_REQ_TSTMP &&
1806	(flags & TH_RST) == `0` &&
1807	((flags & TH_ACK) == `0` \|\|
1808	(tp->t_flags & TF_RCVD_TSTMP))) {
1809	u_int32_t lp = (u_int32_t )(void *)(opt + optlen);
1810
1811	/* Form timestamp option as shown in appendix A of RFC 1323. */
1812	*lp++ = htonl(TCPOPT_TSTAMP_HDR);
1813	*lp++ = htonl(tcp_now + tp->t_ts_offset);
1814	*lp = htonl(tp->ts_recent);
1815	optlen += TCPOLEN_TSTAMP_APPA;
1816	}
1817
1818	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == `0`)) {
1819	/*
1820	* Tack on the SACK permitted option last.
1821	* And do padding of options after tacking this on.
1822	* This is because if MSS, TS, WinScale and Signatures are
1823	* all present, we have just 2 bytes left for the SACK
1824	* permitted option, which is just enough.
1825	*/
1826	/*
1827	* If this is the first SYN of connection (not a SYN
1828	* ACK), include SACK permitted option. If this is a
1829	* SYN ACK, include SACK permitted option if peer has
1830	* already done so. This is only for active connect,
1831	* since the syncache takes care of the passive connect.
1832	*/
1833	if ((flags & TH_SYN) &&
1834	(!(flags & TH_ACK) \|\| (tp->t_flags & TF_SACK_PERMIT))) {
1835	u_char *bp;
1836	bp = (u_char *)opt + optlen;
1837
1838	*bp++ = TCPOPT_SACK_PERMITTED;
1839	*bp++ = TCPOLEN_SACK_PERMITTED;
1840	optlen += TCPOLEN_SACK_PERMITTED;
1841	}
1842	}
1843	#if MPTCP
1844	if (so->so_flags & SOF_MP_SUBFLOW) {
1845	/*
1846	* Its important to piggyback acks with data as ack only packets
1847	* may get lost and data packets that don't send Data ACKs
1848	* still advance the subflow level ACK and therefore make it
1849	* hard for the remote end to recover in low cwnd situations.
1850	*/
1851	if (len != `0`) {
1852	tp->t_mpflags \|= (TMPF_SEND_DSN \|
1853	TMPF_MPTCP_ACKNOW);
1854	} else {
1855	tp->t_mpflags \|= TMPF_MPTCP_ACKNOW;
1856	}
1857	optlen = mptcp_setup_opts(tp, off, opt: &opt[`0`], optlen, flags,
1858	len, p_mptcp_acknow: &mptcp_acknow, do_not_compress: &do_not_compress);
1859	tp->t_mpflags &= ~TMPF_SEND_DSN;
1860	}
1861	#endif /* MPTCP */
1862
1863	if (tfo_enabled(tp) && !(tp->t_flags & TF_NOOPT) &&
1864	(flags & (TH_SYN \| TH_ACK)) == TH_SYN) {
1865	optlen += tcp_tfo_write_cookie(tp, optlen, len, opt);
1866	}
1867
1868	if (tfo_enabled(tp) &&
1869	(flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK) &&
1870	(tp->t_tfo_flags & TFO_F_OFFER_COOKIE)) {
1871	optlen += tcp_tfo_write_cookie_rep(tp, optlen, opt);
1872	}
1873
1874	if (SACK_ENABLED(tp) && ((tp->t_flags & TF_NOOPT) == `0`)) {
1875	/*
1876	* Send SACKs if necessary. This should be the last
1877	* option processed. Only as many SACKs are sent as
1878	* are permitted by the maximum options size.
1879	*
1880	* In general, SACK blocks consume 8*n+2 bytes.
1881	* So a full size SACK blocks option is 34 bytes
1882	* (to generate 4 SACK blocks). At a minimum,
1883	* we need 10 bytes (to generate 1 SACK block).
1884	* If TCP Timestamps (12 bytes) and TCP Signatures
1885	* (18 bytes) are both present, we'll just have
1886	* 10 bytes for SACK options 40 - (12 + 18).
1887	*/
1888	if (TCPS_HAVEESTABLISHED(tp->t_state) &&
1889	(tp->t_flags & TF_SACK_PERMIT) &&
1890	(tp->rcv_numsacks > `0` \|\| TCP_SEND_DSACK_OPT(tp)) &&
1891	MAX_TCPOPTLEN - optlen >= TCPOLEN_SACK + `2`) {
1892	unsigned int sackoptlen = `0`;
1893	int nsack, padlen;
1894	u_char bp = (u_char )opt + optlen;
1895	u_int32_t *lp;
1896
1897	nsack = (MAX_TCPOPTLEN - optlen - `2`) / TCPOLEN_SACK;
1898	nsack = min(a: nsack, b: (tp->rcv_numsacks +
1899	(TCP_SEND_DSACK_OPT(tp) ? `1` : `0`)));
1900	sackoptlen = (`2` + nsack * TCPOLEN_SACK);
1901	VERIFY(sackoptlen < UINT8_MAX);
1902
1903	/*
1904	* First we need to pad options so that the
1905	* SACK blocks can start at a 4-byte boundary
1906	* (sack option and length are at a 2 byte offset).
1907	*/
1908	padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % `4`;
1909	optlen += padlen;
1910	while (padlen-- > `0`) {
1911	*bp++ = TCPOPT_NOP;
1912	}
1913
1914	tcpstat.tcps_sack_send_blocks++;
1915	*bp++ = TCPOPT_SACK;
1916	*bp++ = (uint8_t)sackoptlen;
1917	lp = (u_int32_t )(void* *)bp;
1918
1919	/*
1920	* First block of SACK option should represent
1921	* DSACK. Prefer to send SACK information if there
1922	* is space for only one SACK block. This will
1923	* allow for faster recovery.
1924	*/
1925	if (TCP_SEND_DSACK_OPT(tp) && nsack > `0` &&
1926	(tp->rcv_numsacks == `0` \|\| nsack > `1`)) {
1927	*lp++ = htonl(tp->t_dsack_lseq);
1928	*lp++ = htonl(tp->t_dsack_rseq);
1929	tcpstat.tcps_dsack_sent++;
1930	tp->t_dsack_sent++;
1931	nsack--;
1932	}
1933	VERIFY(nsack == `0` \|\| tp->rcv_numsacks >= nsack);
1934	for (i = `0`; i < nsack; i++) {
1935	struct sackblk sack = tp->sackblks[i];
1936	*lp++ = htonl(sack.start);
1937	*lp++ = htonl(sack.end);
1938	}
1939	optlen += sackoptlen;
1940
1941	/* Make sure we didn't write too much */
1942	VERIFY((u_char *)lp - opt <= MAX_TCPOPTLEN);
1943	}
1944	}
1945
1946	/*
1947	* AccECN option - after SACK
1948	* Don't send on <SYN>,
1949	* send only on <SYN,ACK> before ACCECN is negotiated or
1950	* when doing an AccECN session. Don't send AccECN option
1951	* if retransmitting a SYN-ACK or a data segment
1952	*/
1953	if ((TCP_ACC_ECN_ON(tp) \|\|
1954	(TCP_ACC_ECN_ENABLED(tp) && (flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK)))
1955	&& ((tp->ecn_flags & TE_RETRY_WITHOUT_ACO) == `0`)) {
1956	uint32_t lp = (uint32_t )(void *)(opt + optlen);
1957	/* lp will become outdated after options are added */
1958	tcp_add_accecn_option(tp, flags, lp, optlen: (uint8_t *)&optlen);
1959	}
1960	/* Pad TCP options to a 4 byte boundary */
1961	if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) {
1962	int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t));
1963	u_char bp = (u_char )opt + optlen;
1964
1965	optlen += pad;
1966	while (pad) {
1967	*bp++ = TCPOPT_EOL;
1968	pad--;
1969	}
1970	}
1971
1972	/*
1973	* For Accurate ECN, send ACE flag based on r.cep, if
1974	* We have completed handshake and are in ESTABLISHED state, and
1975	* This is not the final ACK of 3WHS.
1976	*/
1977	if (TCP_ACC_ECN_ON(tp) && TCPS_HAVEESTABLISHED(tp->t_state) &&
1978	(tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) == `0`) {
1979	uint8_t ace = tp->t_rcv_ce_packets & TCP_ACE_MASK;
1980	if (ace & `0x01`) {
1981	flags \|= TH_ECE;
1982	} else {
1983	flags &= ~TH_ECE;
1984	}
1985	if (ace & `0x02`) {
1986	flags \|= TH_CWR;
1987	} else {
1988	flags &= ~TH_CWR;
1989	}
1990	if (ace & `0x04`) {
1991	flags \|= TH_AE;
1992	} else {
1993	flags &= ~TH_AE;
1994	}
1995	}
1996
1997	/*
1998	* RFC 3168 states that:
1999	* - If you ever sent an ECN-setup SYN/SYN-ACK you must be prepared
2000	* to handle the TCP ECE flag, even if you also later send a
2001	* non-ECN-setup SYN/SYN-ACK.
2002	* - If you ever send a non-ECN-setup SYN/SYN-ACK, you must not set
2003	* the ip ECT flag.
2004	*
2005	* It is not clear how the ECE flag would ever be set if you never
2006	* set the IP ECT flag on outbound packets. All the same, we use
2007	* the TE_SETUPSENT to indicate that we have committed to handling
2008	* the TCP ECE flag correctly. We use the TE_SENDIPECT to indicate
2009	* whether or not we should set the IP ECT flag on outbound packets.
2010	*
2011	* For a SYN-ACK, send an ECN setup SYN-ACK
2012	*
2013	* Below we send ECN for three different handshake states:
2014	* 1. Server received SYN and is sending a SYN-ACK (state->TCPS_SYN_RECEIVED)
2015	* - both classic and Accurate ECN have special encoding
2016	* 2. Client is sending SYN packet (state->SYN_SENT)
2017	* - both classic and Accurate ECN have special encoding
2018	* 3. Client is sending final ACK of 3WHS (state->ESTABLISHED)
2019	* - Only Accurate ECN has special encoding
2020	*/
2021	if ((flags & (TH_SYN \| TH_ACK)) == (TH_SYN \| TH_ACK) &&
2022	(tp->ecn_flags & TE_ENABLE_ECN)) {
2023	/* Server received either legacy or Accurate ECN setup SYN */
2024	if (tp->ecn_flags & (TE_SETUPRECEIVED \| TE_ACE_SETUPRECEIVED)) {
2025	if (tcp_send_ecn_flags_on_syn(tp)) {
2026	if (TCP_ACC_ECN_ENABLED(tp) && (tp->ecn_flags & TE_ACE_SETUPRECEIVED)) {
2027	/*
2028	* Accurate ECN mode is on. Initialize packet and byte counters
2029	* for the server sending SYN-ACK. Although s_cep will be initialized
2030	* during input processing of ACK of SYN-ACK, initialize here as well
2031	* in case ACK gets lost.
2032	*
2033	* Non-zero initial values are used to
2034	* support a stateless handshake (see
2035	* Section 5.1 of AccECN draft) and to be
2036	* distinct from cases where the fields
2037	* are incorrectly zeroed.
2038	*/
2039	tp->t_rcv_ce_packets = `5`;
2040	tp->t_snd_ce_packets = `5`;
2041
2042	/* Initialize CE byte counter to 0 */
2043	tp->t_rcv_ce_bytes = tp->t_snd_ce_bytes = `0`;
2044
2045	if (tp->ecn_flags & TE_ACE_SETUP_NON_ECT) {
2046	tp->t_prev_ace_flags = TH_CWR;
2047	flags \|= tp->t_prev_ace_flags;
2048	/* Remove the setup flag as it is also used for final ACK */
2049	tp->ecn_flags &= ~TE_ACE_SETUP_NON_ECT;
2050	tcpstat.tcps_ecn_ace_syn_not_ect++;
2051	} else if (tp->ecn_flags & TE_ACE_SETUP_ECT1) {
2052	tp->t_prev_ace_flags = (TH_CWR \| TH_ECE);
2053	flags \|= tp->t_prev_ace_flags;
2054	tp->ecn_flags &= ~TE_ACE_SETUP_ECT1;
2055	tcpstat.tcps_ecn_ace_syn_ect1++;
2056	} else if (tp->ecn_flags & TE_ACE_SETUP_ECT0) {
2057	tp->t_prev_ace_flags = TH_AE;
2058	flags \|= tp->t_prev_ace_flags;
2059	tp->ecn_flags &= ~TE_ACE_SETUP_ECT0;
2060	tcpstat.tcps_ecn_ace_syn_ect0++;
2061	} else if (tp->ecn_flags & TE_ACE_SETUP_CE) {
2062	tp->t_prev_ace_flags = (TH_AE \| TH_CWR);
2063	flags \|= tp->t_prev_ace_flags;
2064	tp->ecn_flags &= ~TE_ACE_SETUP_CE;
2065	/*
2066	* Receive counter is updated on
2067	* all acceptable packets except
2068	* CE on SYN packets (SYN=1, ACK=0)
2069	*/
2070	tcpstat.tcps_ecn_ace_syn_ce++;
2071	} else {
2072	if (tp->t_prev_ace_flags != `0`) {
2073	/* Set the flags for retransmitted SYN-ACK same as the previous one */
2074	flags \|= tp->t_prev_ace_flags;
2075	} else {
2076	/* We shouldn't come here */
2077	panic("ECN flags (0x%x) not set correctly", tp->ecn_flags);
2078	}
2079	}
2080	/*
2081	* We are not yet committing to send IP ECT packets when
2082	* Accurate ECN mode is on
2083	*/
2084	tp->ecn_flags \|= (TE_ACE_SETUPSENT);
2085	} else if (tp->ecn_flags & TE_SETUPRECEIVED) {
2086	/*
2087	* Setting TH_ECE makes this an ECN-setup
2088	* SYN-ACK
2089	*/
2090	flags \|= TH_ECE;
2091	/*
2092	* Record that we sent the ECN-setup and
2093	* default to setting IP ECT.
2094	*/
2095	tp->ecn_flags \|= (TE_SETUPSENT \| TE_SENDIPECT);
2096	}
2097	tcpstat.tcps_ecn_server_setup++;
2098	tcpstat.tcps_ecn_server_success++;
2099	} else {
2100	/*
2101	* For classic ECN, we sent an ECN-setup SYN-ACK but it was
2102	* dropped. Fallback to non-ECN-setup
2103	* SYN-ACK and clear flag to indicate that
2104	* we should not send data with IP ECT set
2105	*
2106	* Pretend we didn't receive an
2107	* ECN-setup SYN.
2108	*
2109	* We already incremented the counter
2110	* assuming that the ECN setup will
2111	* succeed. Decrementing here
2112	* tcps_ecn_server_success to correct it.
2113	*
2114	* For Accurate ECN, we don't yet remove TE_ACE_SETUPRECEIVED
2115	* as the client might have received Accurate ECN SYN-ACK.
2116	* We decide Accurate ECN's state on processing last ACK from the client.
2117	*/
2118	if (tp->ecn_flags & (TE_SETUPSENT \| TE_ACE_SETUPSENT)) {
2119	tcpstat.tcps_ecn_lost_synack++;
2120	tcpstat.tcps_ecn_server_success--;
2121	tp->ecn_flags \|= TE_LOST_SYNACK;
2122	}
2123
2124	tp->ecn_flags &=
2125	~(TE_SETUPRECEIVED \| TE_SENDIPECT \|
2126	TE_SENDCWR);
2127	}
2128	}
2129	} else if ((flags & (TH_SYN \| TH_ACK)) == TH_SYN &&
2130	(tp->ecn_flags & TE_ENABLE_ECN)) {
2131	if (tcp_send_ecn_flags_on_syn(tp)) {
2132	if (TCP_ACC_ECN_ENABLED(tp)) {
2133	/* We are negotiating AccECN in SYN */
2134	flags \|= TH_ACE;
2135	/*
2136	* For AccECN, we only set the ECN-setup sent
2137	* flag as we are not committing to set ECT yet.
2138	*/
2139	tp->ecn_flags \|= (TE_ACE_SETUPSENT);
2140	} else {
2141	/*
2142	* Setting TH_ECE and TH_CWR makes this an
2143	* ECN-setup SYN
2144	*/
2145	flags \|= (TH_ECE \| TH_CWR);
2146	/*
2147	* Record that we sent the ECN-setup and default to
2148	* setting IP ECT.
2149	*/
2150	tp->ecn_flags \|= (TE_SETUPSENT \| TE_SENDIPECT);
2151	}
2152	tcpstat.tcps_ecn_client_setup++;
2153	tp->ecn_flags \|= TE_CLIENT_SETUP;
2154	} else {
2155	/*
2156	* We sent an ECN-setup SYN but it was dropped.
2157	* Fall back to non-ECN and clear flag indicating
2158	* we should send data with IP ECT set.
2159	*/
2160	if (tp->ecn_flags & (TE_SETUPSENT \| TE_ACE_SETUPSENT)) {
2161	tcpstat.tcps_ecn_lost_syn++;
2162	tp->ecn_flags \|= TE_LOST_SYN;
2163	}
2164	tp->ecn_flags &= ~TE_SENDIPECT;
2165	}
2166	} else if (TCP_ACC_ECN_ON(tp) && (tp->ecn_flags & TE_ACE_FINAL_ACK_3WHS) &&
2167	len == `0` && (flags & (TH_FLAGS_ALL)) == TH_ACK) {
2168	/*
2169	* Client has processed SYN-ACK and moved to ESTABLISHED.
2170	* This is the final ACK of 3WHS. If ACC_ECN has been negotiated,
2171	* then send the handshake encoding as per Table 3 of Accurate ECN draft.
2172	* We clear the ACE flags first in case they were set before.
2173	* TODO: if client has to carry data in the 3WHS ACK, then we need to send a pure ACK first
2174	*/
2175	flags &= ~(TH_AE \| TH_CWR \| TH_ECE);
2176	if (tp->ecn_flags & TE_ACE_SETUP_NON_ECT) {
2177	flags \|= TH_CWR;
2178	tp->ecn_flags &= ~TE_ACE_SETUP_NON_ECT;
2179	} else if (tp->ecn_flags & TE_ACE_SETUP_ECT1) {
2180	flags \|= (TH_CWR \| TH_ECE);
2181	tp->ecn_flags &= ~TE_ACE_SETUP_ECT1;
2182	} else if (tp->ecn_flags & TE_ACE_SETUP_ECT0) {
2183	flags \|= TH_AE;
2184	tp->ecn_flags &= ~TE_ACE_SETUP_ECT0;
2185	} else if (tp->ecn_flags & TE_ACE_SETUP_CE) {
2186	flags \|= (TH_AE \| TH_CWR);
2187	tp->ecn_flags &= ~TE_ACE_SETUP_CE;
2188	}
2189	tp->ecn_flags &= ~(TE_ACE_FINAL_ACK_3WHS);
2190	}
2191
2192	/*
2193	* Check if we should set the TCP CWR flag.
2194	* CWR flag is sent when we reduced the congestion window because
2195	* we received a TCP ECE or we performed a fast retransmit. We
2196	* never set the CWR flag on retransmitted packets. We only set
2197	* the CWR flag on data packets. Pure acks don't have this set.
2198	*/
2199	if ((tp->ecn_flags & TE_SENDCWR) != `0` && len != `0` &&
2200	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
2201	flags \|= TH_CWR;
2202	tp->ecn_flags &= ~TE_SENDCWR;
2203	}
2204
2205	/*
2206	* Check if we should set the TCP ECE flag.
2207	*/
2208	if ((tp->ecn_flags & TE_SENDECE) != `0` && len == `0`) {
2209	flags \|= TH_ECE;
2210	tcpstat.tcps_ecn_sent_ece++;
2211	}
2212
2213	hdrlen += optlen;
2214
2215	/* Reset DSACK sequence numbers */
2216	tp->t_dsack_lseq = `0`;
2217	tp->t_dsack_rseq = `0`;
2218
2219	if (isipv6) {
2220	ipoptlen = ip6_optlen(inp);
2221	} else {
2222	if (tp_inp_options) {
2223	ipoptlen = tp_inp_options->m_len -
2224	offsetof(struct ipoption, ipopt_list);
2225	} else {
2226	ipoptlen = `0`;
2227	}
2228	}
2229	#if IPSEC
2230	ipoptlen += ipsec_optlen;
2231	#endif
2232
2233	/*
2234	* Adjust data length if insertion of options will
2235	* bump the packet length beyond the t_maxopd length.
2236	* Clear the FIN bit because we cut off the tail of
2237	* the segment.
2238	*
2239	* When doing TSO limit a burst to TCP_MAXWIN minus the
2240	* IP, TCP and Options length to keep ip->ip_len from
2241	* overflowing. Prevent the last segment from being
2242	* fractional thus making them all equal sized and set
2243	* the flag to continue sending. TSO is disabled when
2244	* IP options or IPSEC are present.
2245	*/
2246	if (len + optlen + ipoptlen > tp->t_maxopd) {
2247	/*
2248	* If there is still more to send,
2249	* don't close the connection.
2250	*/
2251	flags &= ~TH_FIN;
2252	if (tso) {
2253	int32_t tso_maxlen;
2254
2255	tso_maxlen = tp->tso_max_segment_size ?
2256	tp->tso_max_segment_size : TCP_MAXWIN;
2257
2258	/* hdrlen includes optlen */
2259	if (len > tso_maxlen - hdrlen) {
2260	len = tso_maxlen - hdrlen;
2261	sendalot = `1`;
2262	} else if (tp->t_flags & TF_NEEDFIN) {
2263	sendalot = `1`;
2264	}
2265
2266	if (len % (tp->t_maxopd - optlen) != `0`) {
2267	len = len - (len % (tp->t_maxopd - optlen));
2268	sendalot = `1`;
2269	}
2270	} else {
2271	len = tp->t_maxopd - optlen - ipoptlen;
2272	sendalot = `1`;
2273	}
2274	}
2275
2276	if (max_linkhdr + hdrlen > MCLBYTES) {
2277	panic("tcphdr too big");
2278	}
2279
2280	/* Check if there is enough data in the send socket
2281	* buffer to start measuring bandwidth
2282	*/
2283	if ((tp->t_flagsext & TF_MEASURESNDBW) != `0` &&
2284	(tp->t_bwmeas != NULL) &&
2285	(tp->t_flagsext & TF_BWMEAS_INPROGRESS) == `0`) {
2286	tp->t_bwmeas->bw_size = min(a: min(
2287	a: (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)),
2288	b: tp->snd_cwnd), b: tp->snd_wnd);
2289	if (tp->t_bwmeas->bw_minsize > `0` &&
2290	tp->t_bwmeas->bw_size < tp->t_bwmeas->bw_minsize) {
2291	tp->t_bwmeas->bw_size = `0`;
2292	}
2293	if (tp->t_bwmeas->bw_maxsize > `0`) {
2294	tp->t_bwmeas->bw_size = min(a: tp->t_bwmeas->bw_size,
2295	b: tp->t_bwmeas->bw_maxsize);
2296	}
2297	if (tp->t_bwmeas->bw_size > `0`) {
2298	tp->t_flagsext \|= TF_BWMEAS_INPROGRESS;
2299	tp->t_bwmeas->bw_start = tp->snd_max;
2300	tp->t_bwmeas->bw_ts = tcp_now;
2301	}
2302	}
2303
2304	VERIFY(inp->inp_flowhash != `0`);
2305	/*
2306	* Grab a header mbuf, attaching a copy of data to
2307	* be transmitted, and initialize the header from
2308	* the template for sends on this connection.
2309	*/
2310	if (len) {
2311	/* Remember what the last head-of-line packet-size was */
2312	if (tp->t_pmtud_lastseg_size == `0` && tp->snd_nxt == tp->snd_una) {
2313	ASSERT(len + optlen + ipoptlen <= IP_MAXPACKET);
2314	tp->t_pmtud_lastseg_size = (uint16_t)(len + optlen + ipoptlen);
2315	}
2316	if ((tp->t_flagsext & TF_FORCE) && len == `1`) {
2317	tcpstat.tcps_sndprobe++;
2318	} else if (SEQ_LT(tp->snd_nxt, tp->snd_max) \|\| sack_rxmit) {
2319	tcpstat.tcps_sndrexmitpack++;
2320	tcpstat.tcps_sndrexmitbyte += len;
2321	if (nstat_collect) {
2322	nstat_route_tx(rte: inp->inp_route.ro_rt, packets: `1`,
2323	bytes: len, flags: NSTAT_TX_FLAG_RETRANSMIT);
2324	INP_ADD_STAT(inp, cell, wifi, wired,
2325	txpackets, `1`);
2326	INP_ADD_STAT(inp, cell, wifi, wired,
2327	txbytes, len);
2328	tp->t_stat.txretransmitbytes += len;
2329	tp->t_stat.rxmitpkts++;
2330	}
2331	if (tp->ecn_flags & TE_SENDIPECT) {
2332	tp->t_ecn_capable_packets_lost++;
2333	}
2334	} else {
2335	tcpstat.tcps_sndpack++;
2336	tcpstat.tcps_sndbyte += len;
2337
2338	if (nstat_collect) {
2339	INP_ADD_STAT(inp, cell, wifi, wired,
2340	txpackets, `1`);
2341	INP_ADD_STAT(inp, cell, wifi, wired,
2342	txbytes, len);
2343	}
2344	if (tp->ecn_flags & TE_SENDIPECT) {
2345	tp->t_ecn_capable_packets_sent++;
2346	}
2347	inp_decr_sndbytes_unsent(so, len);
2348	}
2349	inp_set_activity_bitmap(inp);
2350	#if MPTCP
2351	if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
2352	tcpstat.tcps_mp_sndpacks++;
2353	tcpstat.tcps_mp_sndbytes += len;
2354	}
2355	#endif /* MPTCP */
2356	/*
2357	* try to use the new interface that allocates all
2358	* the necessary mbuf hdrs under 1 mbuf lock and
2359	* avoids rescanning the socket mbuf list if
2360	* certain conditions are met. This routine can't
2361	* be used in the following cases...
2362	* 1) the protocol headers exceed the capacity of
2363	* a single mbuf header's data area (no cluster attached)
2364	* 2) the length of the data being transmitted plus
2365	* the protocol headers fits into a single mbuf header's
2366	* data area (no cluster attached)
2367	*/
2368	m = NULL;
2369
2370	/* minimum length we are going to allocate */
2371	allocated_len = MHLEN;
2372	if (MHLEN < hdrlen + max_linkhdr) {
2373	MGETHDR(m, M_DONTWAIT, MT_HEADER);
2374	if (m == NULL) {
2375	error = ENOBUFS;
2376	goto out;
2377	}
2378	MCLGET(m, M_DONTWAIT);
2379	if ((m->m_flags & M_EXT) == `0`) {
2380	m_freem(m);
2381	error = ENOBUFS;
2382	goto out;
2383	}
2384	m->m_data += max_linkhdr;
2385	m->m_len = hdrlen;
2386	allocated_len = MCLBYTES;
2387	}
2388	if (len <= allocated_len - hdrlen - max_linkhdr) {
2389	if (m == NULL) {
2390	VERIFY(allocated_len <= MHLEN);
2391	MGETHDR(m, M_DONTWAIT, MT_HEADER);
2392	if (m == NULL) {
2393	error = ENOBUFS;
2394	goto out;
2395	}
2396	m->m_data += max_linkhdr;
2397	m->m_len = hdrlen;
2398	}
2399	/* makes sure we still have data left to be sent at this point */
2400	if (so->so_snd.sb_mb == NULL \|\| off < `0`) {
2401	if (m != NULL) {
2402	m_freem(m);
2403	}
2404	error = `0`; / should we return an error? /
2405	goto out;
2406	}
2407	m_copydata(so->so_snd.sb_mb, off, (int) len,
2408	mtod(m, caddr_t) + hdrlen);
2409	m->m_len += len;
2410	} else {
2411	uint32_t copymode;
2412	/*
2413	* Retain packet header metadata at the socket
2414	* buffer if this is an MPTCP subflow,
2415	* otherwise move it.
2416	*/
2417	copymode = M_COPYM_MOVE_HDR;
2418	#if MPTCP
2419	if (so->so_flags & SOF_MP_SUBFLOW) {
2420	copymode = M_COPYM_NOOP_HDR;
2421	}
2422	#endif /* MPTCP */
2423	if (m != NULL) {
2424	if (so->so_snd.sb_flags & SB_SENDHEAD) {
2425	VERIFY(so->so_snd.sb_flags & SB_SENDHEAD);
2426	VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2427
2428	m->m_next = m_copym_mode(so->so_snd.sb_mb,
2429	off, (int)len, M_DONTWAIT,
2430	&so->so_snd.sb_sendhead,
2431	&so->so_snd.sb_sendoff, copymode);
2432
2433	VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2434	} else {
2435	m->m_next = m_copym_mode(so->so_snd.sb_mb,
2436	off, (int)len, M_DONTWAIT,
2437	NULL, NULL, copymode);
2438	}
2439	if (m->m_next == NULL) {
2440	(void) m_free(m);
2441	error = ENOBUFS;
2442	goto out;
2443	}
2444	} else {
2445	/*
2446	* make sure we still have data left
2447	* to be sent at this point
2448	*/
2449	if (so->so_snd.sb_mb == NULL) {
2450	error = `0`; / should we return an error? /
2451	goto out;
2452	}
2453
2454	/*
2455	* m_copym_with_hdrs will always return the
2456	* last mbuf pointer and the offset into it that
2457	* it acted on to fulfill the current request,
2458	* whether a valid 'hint' was passed in or not.
2459	*/
2460	if (so->so_snd.sb_flags & SB_SENDHEAD) {
2461	VERIFY(so->so_snd.sb_flags & SB_SENDHEAD);
2462	VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2463
2464	m = m_copym_with_hdrs(so->so_snd.sb_mb,
2465	off, len, M_DONTWAIT, &so->so_snd.sb_sendhead,
2466	&so->so_snd.sb_sendoff, copymode);
2467
2468	VERIFY(so->so_snd.sb_sendoff <= so->so_snd.sb_cc);
2469	} else {
2470	m = m_copym_with_hdrs(so->so_snd.sb_mb,
2471	off, len, M_DONTWAIT, NULL,
2472	NULL, copymode);
2473	}
2474	if (m == NULL) {
2475	error = ENOBUFS;
2476	goto out;
2477	}
2478	m->m_data += max_linkhdr;
2479	m->m_len = hdrlen;
2480	}
2481	}
2482	/*
2483	* If we're sending everything we've got, set PUSH.
2484	* (This will keep happy those implementations which only
2485	* give data to the user when a buffer fills or
2486	* a PUSH comes in.)
2487	*
2488	* On SYN-segments we should not add the PUSH-flag.
2489	*/
2490	if (off + len == so->so_snd.sb_cc && !(flags & TH_SYN)) {
2491	flags \|= TH_PUSH;
2492	}
2493	} else {
2494	if (tp->t_flags & TF_ACKNOW) {
2495	tcpstat.tcps_sndacks++;
2496	} else if (flags & (TH_SYN \| TH_FIN \| TH_RST)) {
2497	tcpstat.tcps_sndctrl++;
2498	} else if (SEQ_GT(tp->snd_up, tp->snd_una)) {
2499	tcpstat.tcps_sndurg++;
2500	} else {
2501	tcpstat.tcps_sndwinup++;
2502	}
2503
2504	MGETHDR(m, M_DONTWAIT, MT_HEADER); / MAC-OK /
2505	if (m == NULL) {
2506	error = ENOBUFS;
2507	goto out;
2508	}
2509	if (MHLEN < (hdrlen + max_linkhdr)) {
2510	MCLGET(m, M_DONTWAIT);
2511	if ((m->m_flags & M_EXT) == `0`) {
2512	m_freem(m);
2513	error = ENOBUFS;
2514	goto out;
2515	}
2516	}
2517	m->m_data += max_linkhdr;
2518	m->m_len = hdrlen;
2519	}
2520	m->m_pkthdr.rcvif = `0`;
2521	m_add_crumb(m, PKT_CRUMB_TCP_OUTPUT);
2522
2523	/* Any flag other than pure-ACK: Do not compress! */
2524	if (flags & ~(TH_ACK)) {
2525	do_not_compress = TRUE;
2526	}
2527
2528	if (tp->rcv_scale == `0`) {
2529	do_not_compress = TRUE;
2530	}
2531
2532	if (do_not_compress) {
2533	m->m_pkthdr.comp_gencnt = `0`;
2534	} else {
2535	if (TSTMP_LT(tp->t_comp_lastinc + tcp_ack_compression_rate, tcp_now)) {
2536	tp->t_comp_gencnt++;
2537	/* 0 means no compression, thus jump this */
2538	if (tp->t_comp_gencnt <= TCP_ACK_COMPRESSION_DUMMY) {
2539	tp->t_comp_gencnt = TCP_ACK_COMPRESSION_DUMMY + `1`;
2540	}
2541	tp->t_comp_lastinc = tcp_now;
2542	}
2543	m->m_pkthdr.comp_gencnt = tp->t_comp_gencnt;
2544	}
2545
2546	if (isipv6) {
2547	ip6 = mtod(m, struct ip6_hdr *);
2548	th = (struct tcphdr )(void* *)(ip6 + `1`);
2549	tcp_fillheaders(m, tp, ip6, th);
2550	if ((tp->ecn_flags & TE_SENDIPECT) != `0` && len &&
2551	!SEQ_LT(tp->snd_nxt, tp->snd_max) && !sack_rxmit) {
2552	ip6->ip6_flow \|= htonl(IPTOS_ECN_ECT0 << `20`);
2553	}
2554	svc_flags \|= PKT_SCF_IPV6;
2555	#if PF_ECN
2556	m_pftag(m)->pftag_hdr = (void *)ip6;
2557	m_pftag(m)->pftag_flags \|= PF_TAG_HDR_INET6;
2558	#endif /* PF_ECN */
2559	} else {
2560	ip = mtod(m, struct ip *);
2561	th = (struct tcphdr )(void* *)(ip + `1`);
2562	/* this picks up the pseudo header (w/o the length) */
2563	tcp_fillheaders(m, tp, ip, th);
2564	if ((tp->ecn_flags & TE_SENDIPECT) != `0` && len &&
2565	!SEQ_LT(tp->snd_nxt, tp->snd_max) &&
2566	!sack_rxmit && !(flags & TH_SYN)) {
2567	ip->ip_tos \|= IPTOS_ECN_ECT0;
2568	}
2569	#if PF_ECN
2570	m_pftag(m)->pftag_hdr = (void *)ip;
2571	m_pftag(m)->pftag_flags \|= PF_TAG_HDR_INET;
2572	#endif /* PF_ECN */
2573	}
2574
2575	/*
2576	* Fill in fields, remembering maximum advertised
2577	* window for use in delaying messages about window sizes.
2578	* If resending a FIN, be sure not to use a new sequence number.
2579	*/
2580	if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
2581	tp->snd_nxt == tp->snd_max) {
2582	tp->snd_nxt--;
2583	}
2584	/*
2585	* If we are doing retransmissions, then snd_nxt will
2586	* not reflect the first unsent octet. For ACK only
2587	* packets, we do not want the sequence number of the
2588	* retransmitted packet, we want the sequence number
2589	* of the next unsent octet. So, if there is no data
2590	* (and no SYN or FIN), use snd_max instead of snd_nxt
2591	* when filling in ti_seq. But if we are in persist
2592	* state, snd_max might reflect one byte beyond the
2593	* right edge of the window, so use snd_nxt in that
2594	* case, since we know we aren't doing a retransmission.
2595	* (retransmit and persist are mutually exclusive...)
2596	*
2597	* Note the state of this retransmit segment to detect spurious
2598	* retransmissions.
2599	*/
2600	if (sack_rxmit == `0`) {
2601	if (len \|\| (flags & (TH_SYN \| TH_FIN)) \|\|
2602	tp->t_timer[TCPT_PERSIST]) {
2603	th->th_seq = htonl(tp->snd_nxt);
2604	if (len > `0`) {
2605	m->m_pkthdr.tx_start_seq = tp->snd_nxt;
2606	m->m_pkthdr.pkt_flags \|= PKTF_START_SEQ;
2607	}
2608	if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
2609	if (SACK_ENABLED(tp) && len > `1` &&
2610	!(tp->t_flagsext & TF_SENT_TLPROBE)) {
2611	tcp_rxtseg_insert(tp, tp->snd_nxt,
2612	(tp->snd_nxt + len - `1`));
2613	}
2614	if (len > `0`) {
2615	m->m_pkthdr.pkt_flags \|=
2616	PKTF_TCP_REXMT;
2617	}
2618	}
2619	} else {
2620	th->th_seq = htonl(tp->snd_max);
2621	}
2622	} else {
2623	th->th_seq = htonl(p->rxmit);
2624	if (len > `0`) {
2625	m->m_pkthdr.pkt_flags \|=
2626	(PKTF_TCP_REXMT \| PKTF_START_SEQ);
2627	m->m_pkthdr.tx_start_seq = p->rxmit;
2628	}
2629	tcp_rxtseg_insert(tp, p->rxmit, (p->rxmit + len - `1`));
2630	p->rxmit += len;
2631	tp->sackhint.sack_bytes_rexmit += len;
2632	}
2633	th->th_ack = htonl(tp->rcv_nxt);
2634	tp->last_ack_sent = tp->rcv_nxt;
2635	if (optlen) {
2636	bcopy(src: opt, dst: th + `1`, n: optlen);
2637	th->th_off = (sizeof(struct tcphdr) + optlen) >> `2`;
2638	}
2639	/* Separate AE from flags */
2640	th->th_flags = (flags & (TH_FLAGS_ALL));
2641	th->th_x2 = (flags & (TH_AE)) >> `8`;
2642	th->th_win = htons((u_short) (recwin >> tp->rcv_scale));
2643	tp->t_last_recwin = recwin;
2644	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2645	if (recwin > `0` && SEQ_LT(tp->rcv_adv, tp->rcv_nxt + recwin)) {
2646	tp->rcv_adv = tp->rcv_nxt + recwin;
2647	}
2648	} else {
2649	struct mptcb *mp_tp = tptomptp(tp);
2650	if (recwin > `0`) {
2651	tp->rcv_adv = tp->rcv_nxt + recwin;
2652	}
2653
2654	if (recwin > `0` && MPTCP_SEQ_LT(mp_tp->mpt_rcvadv, mp_tp->mpt_rcvnxt + recwin)) {
2655	mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + recwin;
2656	}
2657	}
2658
2659	/*
2660	* Adjust the RXWIN0SENT flag - indicate that we have advertised
2661	* a 0 window. This may cause the remote transmitter to stall. This
2662	* flag tells soreceive() to disable delayed acknowledgements when
2663	* draining the buffer. This can occur if the receiver is attempting
2664	* to read more data than can be buffered prior to transmitting on
2665	* the connection.
2666	*/
2667	if (th->th_win == `0`) {
2668	tp->t_flags \|= TF_RXWIN0SENT;
2669	} else {
2670	tp->t_flags &= ~TF_RXWIN0SENT;
2671	}
2672
2673	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
2674	th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
2675	th->th_flags \|= TH_URG;
2676	} else {
2677	/*
2678	* If no urgent pointer to send, then we pull
2679	* the urgent pointer to the left edge of the send window
2680	* so that it doesn't drift into the send window on sequence
2681	* number wraparound.
2682	*/
2683	tp->snd_up = tp->snd_una; / drag it along /
2684	}
2685
2686	/*
2687	* Put TCP length in extended header, and then
2688	* checksum extended header and data.
2689	*/
2690	m->m_pkthdr.len = hdrlen + len; / in6_cksum() need this /
2691
2692	/*
2693	* If this is potentially the last packet on the stream, then mark
2694	* it in order to enable some optimizations in the underlying
2695	* layers
2696	*/
2697	if (tp->t_state != TCPS_ESTABLISHED &&
2698	(tp->t_state == TCPS_CLOSING \|\| tp->t_state == TCPS_TIME_WAIT
2699	\|\| tp->t_state == TCPS_LAST_ACK \|\| (th->th_flags & TH_RST))) {
2700	m->m_pkthdr.pkt_flags \|= PKTF_LAST_PKT;
2701	}
2702
2703	if (isipv6) {
2704	/*
2705	* ip6_plen does not need to be filled now; it will be filled
2706	* in ip6_output.
2707	*/
2708	m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
2709	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
2710	if (len + optlen) {
2711	th->th_sum = in_addword(th->th_sum,
2712	htons((u_short)(optlen + len)));
2713	}
2714	} else {
2715	m->m_pkthdr.csum_flags = CSUM_TCP;
2716	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
2717	if (len + optlen) {
2718	th->th_sum = in_addword(th->th_sum,
2719	htons((u_short)(optlen + len)));
2720	}
2721	}
2722
2723	/*
2724	* Enable TSO and specify the size of the segments.
2725	* The TCP pseudo header checksum is always provided.
2726	*/
2727	if (tso) {
2728	if (isipv6) {
2729	m->m_pkthdr.csum_flags \|= CSUM_TSO_IPV6;
2730	} else {
2731	m->m_pkthdr.csum_flags \|= CSUM_TSO_IPV4;
2732	}
2733
2734	m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen;
2735	} else {
2736	m->m_pkthdr.tso_segsz = `0`;
2737	}
2738
2739	/*
2740	* In transmit state, time the transmission and arrange for
2741	* the retransmit. In persist state, just set snd_max.
2742	*/
2743	if (!(tp->t_flagsext & TF_FORCE)
2744	\|\| tp->t_timer[TCPT_PERSIST] == `0`) {
2745	tcp_seq startseq = tp->snd_nxt;
2746
2747	/*
2748	* Advance snd_nxt over sequence space of this segment.
2749	*/
2750	if (flags & (TH_SYN \| TH_FIN)) {
2751	if (flags & TH_SYN) {
2752	tp->snd_nxt++;
2753	}
2754	if ((flags & TH_FIN) &&
2755	!(tp->t_flags & TF_SENTFIN)) {
2756	tp->snd_nxt++;
2757	tp->t_flags \|= TF_SENTFIN;
2758	}
2759	}
2760	if (sack_rxmit) {
2761	goto timer;
2762	}
2763	if (sack_rescue_rxt == TRUE) {
2764	tp->snd_nxt = old_snd_nxt;
2765	sack_rescue_rxt = FALSE;
2766	tcpstat.tcps_pto_in_recovery++;
2767	} else {
2768	tp->snd_nxt += len;
2769	}
2770	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
2771	tp->snd_max = tp->snd_nxt;
2772	tp->t_sndtime = tcp_now;
2773	/*
2774	* Time this transmission if not a retransmission and
2775	* not currently timing anything.
2776	*/
2777	if (tp->t_rtttime == `0`) {
2778	tp->t_rtttime = tcp_now;
2779	tp->t_rtseq = startseq;
2780	tcpstat.tcps_segstimed++;
2781
2782	/* update variables related to pipe ack */
2783	tp->t_pipeack_lastuna = tp->snd_una;
2784	}
2785	}
2786
2787	/*
2788	* Set retransmit timer if not currently set,
2789	* and not doing an ack or a keep-alive probe.
2790	*/
2791	timer:
2792	if (tp->t_timer[TCPT_REXMT] == `0` &&
2793	((sack_rxmit && tp->snd_nxt != tp->snd_max) \|\|
2794	tp->snd_nxt != tp->snd_una \|\| (flags & TH_FIN))) {
2795	if (tp->t_timer[TCPT_PERSIST]) {
2796	tp->t_timer[TCPT_PERSIST] = `0`;
2797	tp->t_persist_stop = `0`;
2798	TCP_RESET_REXMT_STATE(tp);
2799	}
2800	tp->t_timer[TCPT_REXMT] =
2801	OFFSET_FROM_START(tp, tp->t_rxtcur);
2802	}
2803
2804	/*
2805	* Set tail loss probe timeout if new data is being
2806	* transmitted. This will be supported only when
2807	* SACK option is enabled on a connection.
2808	*
2809	* Every time new data is sent PTO will get reset.
2810	*/
2811	if (tcp_enable_tlp && len != `0` && tp->t_state == TCPS_ESTABLISHED &&
2812	SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
2813	tp->snd_nxt == tp->snd_max &&
2814	SEQ_GT(tp->snd_nxt, tp->snd_una) &&
2815	tp->t_rxtshift == `0` &&
2816	(tp->t_flagsext & (TF_SENT_TLPROBE \| TF_PKTS_REORDERED)) == `0`) {
2817	uint32_t pto, srtt;
2818
2819	if (tcp_do_better_lr) {
2820	srtt = tp->t_srtt >> TCP_RTT_SHIFT;
2821	pto = `2` * srtt;
2822	if ((tp->snd_max - tp->snd_una) <= tp->t_maxseg) {
2823	pto += tcp_delack;
2824	} else {
2825	pto += `2`;
2826	}
2827	} else {
2828	/*
2829	* Using SRTT alone to set PTO can cause spurious
2830	* retransmissions on wireless networks where there
2831	* is a lot of variance in RTT. Taking variance
2832	* into account will avoid this.
2833	*/
2834	srtt = tp->t_srtt >> TCP_RTT_SHIFT;
2835	pto = ((TCP_REXMTVAL(tp)) * `3`) >> `1`;
2836	pto = max(a: `2` * srtt, b: pto);
2837	if ((tp->snd_max - tp->snd_una) == tp->t_maxseg) {
2838	pto = max(a: pto,
2839	b: (((`3` * pto) >> `2`) + tcp_delack * `2`));
2840	} else {
2841	pto = max(a: `10`, b: pto);
2842	}
2843	}
2844
2845	/* if RTO is less than PTO, choose RTO instead */
2846	if (tp->t_rxtcur < pto) {
2847	pto = tp->t_rxtcur;
2848	}
2849
2850	tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto);
2851	}
2852	} else {
2853	/*
2854	* Persist case, update snd_max but since we are in
2855	* persist mode (no window) we do not update snd_nxt.
2856	*/
2857	int xlen = len;
2858	if (flags & TH_SYN) {
2859	++xlen;
2860	}
2861	if ((flags & TH_FIN) &&
2862	!(tp->t_flags & TF_SENTFIN)) {
2863	++xlen;
2864	tp->t_flags \|= TF_SENTFIN;
2865	}
2866	if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) {
2867	tp->snd_max = tp->snd_nxt + len;
2868	tp->t_sndtime = tcp_now;
2869	}
2870	}
2871
2872	#if TCPDEBUG
2873	/*
2874	* Trace.
2875	*/
2876	if (so_options & SO_DEBUG) {
2877	tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, `0`);
2878	}
2879	#endif
2880
2881	/*
2882	* Fill in IP length and desired time to live and
2883	* send to IP level. There should be a better way
2884	* to handle ttl and tos; we could keep them in
2885	* the template, but need a way to checksum without them.
2886	*/
2887	/*
2888	* m->m_pkthdr.len should have been set before cksum calculation,
2889	* because in6_cksum() needs it.
2890	*/
2891	if (isipv6) {
2892	/*
2893	* we separately set hoplimit for every segment, since the
2894	* user might want to change the value via setsockopt.
2895	* Also, desired default hop limit might be changed via
2896	* Neighbor Discovery.
2897	*/
2898	ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
2899	inp->in6p_route.ro_rt->rt_ifp : NULL);
2900
2901	/* Don't set ECT bit if requested by an app */
2902
2903	/* Set ECN bits for testing purposes */
2904	if (tp->ecn_flags & TE_FORCE_ECT1) {
2905	ip6->ip6_flow \|= htonl(IPTOS_ECN_ECT1 << `20`);
2906	} else if (tp->ecn_flags & TE_FORCE_ECT0) {
2907	ip6->ip6_flow \|= htonl(IPTOS_ECN_ECT0 << `20`);
2908	}
2909
2910	KERNEL_DEBUG(DBG_LAYER_BEG,
2911	((inp->inp_fport << `16`) \| inp->inp_lport),
2912	(((inp->in6p_laddr.s6_addr16[`0`] & `0xffff`) << `16`) \|
2913	(inp->in6p_faddr.s6_addr16[`0`] & `0xffff`)),
2914	sendalot, `0`, `0`);
2915	} else {
2916	ASSERT(m->m_pkthdr.len <= IP_MAXPACKET);
2917	ip->ip_len = (u_short)m->m_pkthdr.len;
2918	ip->ip_ttl = inp->inp_ip_ttl; / XXX /
2919
2920	/* Don't set ECN bit if requested by an app */
2921	ip->ip_tos \|= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
2922
2923	/* Set ECN bits for testing purposes */
2924	if (tp->ecn_flags & TE_FORCE_ECT1) {
2925	ip->ip_tos \|= IPTOS_ECN_ECT1;
2926	} else if (tp->ecn_flags & TE_FORCE_ECT0) {
2927	ip->ip_tos \|= IPTOS_ECN_ECT0;
2928	}
2929
2930	KERNEL_DEBUG(DBG_LAYER_BEG,
2931	((inp->inp_fport << `16`) \| inp->inp_lport),
2932	(((inp->inp_laddr.s_addr & `0xffff`) << `16`) \|
2933	(inp->inp_faddr.s_addr & `0xffff`)), `0`, `0`, `0`);
2934	}
2935
2936	/*
2937	* See if we should do MTU discovery.
2938	* Look at the flag updated on the following criteria:
2939	* 1) Path MTU discovery is authorized by the sysctl
2940	* 2) The route isn't set yet (unlikely but could happen)
2941	* 3) The route is up
2942	* 4) the MTU is not locked (if it is, then discovery has been
2943	* disabled for that route)
2944	*/
2945	if (!isipv6) {
2946	if (path_mtu_discovery && (tp->t_flags & TF_PMTUD)) {
2947	ip->ip_off \|= IP_DF;
2948	}
2949	}
2950
2951	#if NECP
2952	{
2953	necp_kernel_policy_id policy_id;
2954	necp_kernel_policy_id skip_policy_id;
2955	u_int32_t route_rule_id;
2956	u_int32_t pass_flags;
2957	if (!necp_socket_is_allowed_to_send_recv(inp, NULL, pf_tag: `0`, return_policy_id: &policy_id, return_route_rule_id: &route_rule_id, return_skip_policy_id: &skip_policy_id, return_pass_flags: &pass_flags)) {
2958	TCP_LOG_DROP_NECP(isipv6 ? (void )ip6 : (void* *)ip, th, tp, true);
2959	m_freem(m);
2960	error = EHOSTUNREACH;
2961	goto out;
2962	}
2963	necp_mark_packet_from_socket(packet: m, inp, policy_id, route_rule_id, skip_policy_id, pass_flags);
2964
2965	if (net_qos_policy_restricted != `0`) {
2966	necp_socket_update_qos_marking(inp, route: inp->inp_route.ro_rt, route_rule_id);
2967	}
2968	}
2969	#endif /* NECP */
2970
2971	#if IPSEC
2972	if (inp->inp_sp != NULL) {
2973	ipsec_setsocket(m, so);
2974	}
2975	#endif /* IPSEC */
2976
2977	/*
2978	* The socket is kept locked while sending out packets in ip_output, even if packet chaining is not active.
2979	*/
2980	lost = `0`;
2981
2982	/*
2983	* Embed the flow hash in pkt hdr and mark the packet as
2984	* capable of flow controlling
2985	*/
2986	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
2987	m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
2988	m->m_pkthdr.pkt_flags \|= (PKTF_FLOW_ID \| PKTF_FLOW_LOCALSRC \| PKTF_FLOW_ADV);
2989	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
2990	m->m_pkthdr.tx_tcp_pid = so->last_pid;
2991	if (so->so_flags & SOF_DELEGATED) {
2992	m->m_pkthdr.tx_tcp_e_pid = so->e_pid;
2993	} else {
2994	m->m_pkthdr.tx_tcp_e_pid = `0`;
2995	}
2996
2997	m->m_nextpkt = NULL;
2998
2999	if (inp->inp_last_outifp != NULL &&
3000	!(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3001	/ Hint to prioritize this packet if*
3002	* 1. if the packet has no data
3003	* 2. the interface supports transmit-start model and did
3004	* not disable ACK prioritization.
3005	* 3. Only ACK flag is set.
3006	* 4. there is no outstanding data on this connection.
3007	*/
3008	if (len == `0` && (inp->inp_last_outifp->if_eflags & (IFEF_TXSTART \| IFEF_NOACKPRI)) == IFEF_TXSTART) {
3009	if (th->th_flags == TH_ACK &&
3010	tp->snd_una == tp->snd_max &&
3011	tp->t_timer[TCPT_REXMT] == `0`) {
3012	svc_flags \|= PKT_SCF_TCP_ACK;
3013	}
3014	if (th->th_flags & TH_SYN) {
3015	svc_flags \|= PKT_SCF_TCP_SYN;
3016	}
3017	}
3018	set_packet_service_class(m, so, sotc, svc_flags);
3019	} else {
3020	/*
3021	* Optimization for loopback just set the mbuf
3022	* service class
3023	*/
3024	(void) m_set_service_class(m, so_tc2msc(sotc));
3025	}
3026
3027	if ((th->th_flags & TH_SYN) && tp->t_syn_sent < UINT8_MAX) {
3028	tp->t_syn_sent++;
3029	}
3030	if ((th->th_flags & TH_FIN) && tp->t_fin_sent < UINT8_MAX) {
3031	tp->t_fin_sent++;
3032	}
3033	if ((th->th_flags & TH_RST) && tp->t_rst_sent < UINT8_MAX) {
3034	tp->t_rst_sent++;
3035	}
3036	TCP_LOG_TH_FLAGS(isipv6 ? (void )ip6 : (void* *)ip, th, tp, true,
3037	inp->inp_last_outifp != NULL ? inp->inp_last_outifp :
3038	inp->inp_boundifp);
3039
3040	tp->t_pktlist_sentlen += len;
3041	tp->t_lastchain++;
3042
3043	if (isipv6) {
3044	DTRACE_TCP5(send, struct mbuf , m, struct* inpcb *, inp,
3045	struct ip6 , ip6, struct* tcpcb , tp, struct* tcphdr *,
3046	th);
3047	} else {
3048	DTRACE_TCP5(send, struct mbuf , m, struct* inpcb *, inp,
3049	struct ip , ip, struct* tcpcb , tp, struct* tcphdr *, th);
3050	}
3051
3052	if (tp->t_pktlist_head != NULL) {
3053	tp->t_pktlist_tail->m_nextpkt = m;
3054	tp->t_pktlist_tail = m;
3055	} else {
3056	packchain_newlist++;
3057	tp->t_pktlist_head = tp->t_pktlist_tail = m;
3058	}
3059
3060	if (sendalot == `0` \|\| (tp->t_state != TCPS_ESTABLISHED) \|\|
3061	(tp->t_flags & TF_ACKNOW) \|\|
3062	(tp->t_flagsext & TF_FORCE) \|\|
3063	tp->t_lastchain >= tcp_packet_chaining) {
3064	error = `0`;
3065	while (inp->inp_sndinprog_cnt == `0` &&
3066	tp->t_pktlist_head != NULL) {
3067	packetlist = tp->t_pktlist_head;
3068	packchain_listadd = tp->t_lastchain;
3069	packchain_sent++;
3070	lost = tp->t_pktlist_sentlen;
3071	TCP_PKTLIST_CLEAR(tp);
3072
3073	error = tcp_ip_output(so, tp, packetlist,
3074	packchain_listadd, tp_inp_options,
3075	(so_options & SO_DONTROUTE),
3076	(sack_rxmit \|\| (sack_bytes_rxmt != `0`)), isipv6);
3077	if (error) {
3078	/*
3079	* Take into account the rest of unsent
3080	* packets in the packet list for this tcp
3081	* into "lost", since we're about to free
3082	* the whole list below.
3083	*/
3084	lost += tp->t_pktlist_sentlen;
3085	break;
3086	} else {
3087	lost = `0`;
3088	}
3089	}
3090	/ tcp was closed while we were in ip; resume close /
3091	if (inp->inp_sndinprog_cnt == `0` &&
3092	(tp->t_flags & TF_CLOSING)) {
3093	tp->t_flags &= ~TF_CLOSING;
3094	(void) tcp_close(tp);
3095	return `0`;
3096	}
3097	} else {
3098	error = `0`;
3099	packchain_looped++;
3100	tcpstat.tcps_sndtotal++;
3101
3102	goto again;
3103	}
3104	if (error) {
3105	/*
3106	* Assume that the packets were lost, so back out the
3107	* sequence number advance, if any. Note that the "lost"
3108	* variable represents the amount of user data sent during
3109	* the recent call to ip_output_list() plus the amount of
3110	* user data in the packet list for this tcp at the moment.
3111	*/
3112	if (!(tp->t_flagsext & TF_FORCE)
3113	\|\| tp->t_timer[TCPT_PERSIST] == `0`) {
3114	/*
3115	* No need to check for TH_FIN here because
3116	* the TF_SENTFIN flag handles that case.
3117	*/
3118	if ((flags & TH_SYN) == `0`) {
3119	if (sack_rxmit) {
3120	if (SEQ_GT((p->rxmit - lost),
3121	tp->snd_una)) {
3122	p->rxmit -= lost;
3123
3124	if (SEQ_LT(p->rxmit, p->start)) {
3125	p->rxmit = p->start;
3126	}
3127	} else {
3128	lost = p->rxmit - tp->snd_una;
3129	p->rxmit = tp->snd_una;
3130
3131	if (SEQ_LT(p->rxmit, p->start)) {
3132	p->rxmit = p->start;
3133	}
3134	}
3135	tp->sackhint.sack_bytes_rexmit -= lost;
3136	if (tp->sackhint.sack_bytes_rexmit < `0`) {
3137	tp->sackhint.sack_bytes_rexmit = `0`;
3138	}
3139	} else {
3140	if (SEQ_GT((tp->snd_nxt - lost),
3141	tp->snd_una)) {
3142	tp->snd_nxt -= lost;
3143	} else {
3144	tp->snd_nxt = tp->snd_una;
3145	}
3146	}
3147	}
3148	}
3149	out:
3150	if (tp->t_pktlist_head != NULL) {
3151	m_freem_list(tp->t_pktlist_head);
3152	}
3153	TCP_PKTLIST_CLEAR(tp);
3154
3155	if (error == ENOBUFS) {
3156	/*
3157	* Set retransmit timer if not currently set
3158	* when we failed to send a segment that can be
3159	* retransmitted (i.e. not pure ack or rst)
3160	*/
3161	if (tp->t_timer[TCPT_REXMT] == `0` &&
3162	tp->t_timer[TCPT_PERSIST] == `0` &&
3163	(len != `0` \|\| (flags & (TH_SYN \| TH_FIN)) != `0` \|\|
3164	so->so_snd.sb_cc > `0`)) {
3165	tp->t_timer[TCPT_REXMT] =
3166	OFFSET_FROM_START(tp, tp->t_rxtcur);
3167	}
3168	tp->snd_cwnd = tp->t_maxseg;
3169	tp->t_bytes_acked = `0`;
3170	tcp_check_timer_state(tp);
3171	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
3172
3173	TCP_LOG_OUTPUT(tp, "error ENOBUFS silently handled");
3174
3175	tcp_ccdbg_trace(tp, NULL, event: TCP_CC_OUTPUT_ERROR);
3176	return `0`;
3177	}
3178	if (error == EMSGSIZE) {
3179	/*
3180	* ip_output() will have already fixed the route
3181	* for us. tcp_mtudisc() will, as its last action,
3182	* initiate retransmission, so it is important to
3183	* not do so here.
3184	*
3185	* If TSO was active we either got an interface
3186	* without TSO capabilits or TSO was turned off.
3187	* Disable it for this connection as too and
3188	* immediatly retry with MSS sized segments generated
3189	* by this function.
3190	*/
3191	if (tso) {
3192	tp->t_flags &= ~TF_TSO;
3193	}
3194
3195	tcp_mtudisc(inp, `0`);
3196	tcp_check_timer_state(tp);
3197
3198	TCP_LOG_OUTPUT(tp, "error EMSGSIZE silently handled");
3199
3200	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
3201	return `0`;
3202	}
3203	/*
3204	* Unless this is due to interface restriction policy,
3205	* treat EHOSTUNREACH/ENETDOWN/EADDRNOTAVAIL as a soft error.
3206	*/
3207	if ((error == EHOSTUNREACH \|\| error == ENETDOWN \|\| error == EADDRNOTAVAIL) &&
3208	TCPS_HAVERCVDSYN(tp->t_state) &&
3209	!inp_restricted_send(inp, inp->inp_last_outifp)) {
3210	tp->t_softerror = error;
3211	TCP_LOG_OUTPUT(tp, "soft error %d silently handled", error);
3212	error = `0`;
3213	} else {
3214	TCP_LOG_OUTPUT(tp, "error %d", error);
3215	}
3216	tcp_check_timer_state(tp);
3217	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
3218	return error;
3219	}
3220
3221	tcpstat.tcps_sndtotal++;
3222
3223	KERNEL_DEBUG(DBG_FNC_TCP_OUTPUT \| DBG_FUNC_END, `0`, `0`, `0`, `0`, `0`);
3224	if (sendalot) {
3225	goto again;
3226	}
3227
3228	tcp_check_timer_state(tp);
3229
3230	return `0`;
3231	}
3232
3233	static int
3234	tcp_ip_output(struct socket so, struct* tcpcb tp, struct* mbuf *pkt,
3235	int cnt, struct mbuf opt, int* flags, int sack_in_progress, boolean_t isipv6)
3236	{
3237	int error = `0`;
3238	boolean_t chain;
3239	boolean_t unlocked = FALSE;
3240	boolean_t ifdenied = FALSE;
3241	struct inpcb *inp = tp->t_inpcb;
3242	struct ifnet *outif = NULL;
3243	bool check_qos_marking_again = (so->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE) ? FALSE : TRUE;
3244
3245	union {
3246	struct route _ro;
3247	struct route_in6 _ro6;
3248	} route_u_ = {};
3249	#define ro route_u_._ro
3250	#define ro6 route_u_._ro6
3251
3252	union {
3253	struct ip_out_args _ipoa;
3254	struct ip6_out_args _ip6oa;
3255	} out_args_u_ = {};
3256	#define ipoa out_args_u_._ipoa
3257	#define ip6oa out_args_u_._ip6oa
3258
3259	if (isipv6) {
3260	ip6oa.ip6oa_boundif = IFSCOPE_NONE;
3261	ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF \| IP6OAF_BOUND_SRCADDR;
3262	ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
3263	ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3264	} else {
3265	ipoa.ipoa_boundif = IFSCOPE_NONE;
3266	ipoa.ipoa_flags = IPOAF_SELECT_SRCIF \| IPOAF_BOUND_SRCADDR;
3267	ipoa.ipoa_sotc = SO_TC_UNSPEC;
3268	ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3269	}
3270
3271	struct flowadv *adv =
3272	(isipv6 ? &ip6oa.ip6oa_flowadv : &ipoa.ipoa_flowadv);
3273
3274	/ If socket was bound to an ifindex, tell ip_output about it /
3275	if (inp->inp_flags & INP_BOUND_IF) {
3276	if (isipv6) {
3277	ip6oa.ip6oa_boundif = inp->inp_boundifp->if_index;
3278	ip6oa.ip6oa_flags \|= IP6OAF_BOUND_IF;
3279	} else {
3280	ipoa.ipoa_boundif = inp->inp_boundifp->if_index;
3281	ipoa.ipoa_flags \|= IPOAF_BOUND_IF;
3282	}
3283	} else if (!in6_embedded_scope && isipv6 && (IN6_IS_SCOPE_EMBED(&inp->in6p_faddr))) {
3284	ip6oa.ip6oa_boundif = inp->inp_fifscope;
3285	ip6oa.ip6oa_flags \|= IP6OAF_BOUND_IF;
3286	}
3287
3288	if (INP_NO_CELLULAR(inp)) {
3289	if (isipv6) {
3290	ip6oa.ip6oa_flags \|= IP6OAF_NO_CELLULAR;
3291	} else {
3292	ipoa.ipoa_flags \|= IPOAF_NO_CELLULAR;
3293	}
3294	}
3295	if (INP_NO_EXPENSIVE(inp)) {
3296	if (isipv6) {
3297	ip6oa.ip6oa_flags \|= IP6OAF_NO_EXPENSIVE;
3298	} else {
3299	ipoa.ipoa_flags \|= IPOAF_NO_EXPENSIVE;
3300	}
3301	}
3302	if (INP_NO_CONSTRAINED(inp)) {
3303	if (isipv6) {
3304	ip6oa.ip6oa_flags \|= IP6OAF_NO_CONSTRAINED;
3305	} else {
3306	ipoa.ipoa_flags \|= IPOAF_NO_CONSTRAINED;
3307	}
3308	}
3309	if (INP_AWDL_UNRESTRICTED(inp)) {
3310	if (isipv6) {
3311	ip6oa.ip6oa_flags \|= IP6OAF_AWDL_UNRESTRICTED;
3312	} else {
3313	ipoa.ipoa_flags \|= IPOAF_AWDL_UNRESTRICTED;
3314	}
3315	}
3316	if (INP_INTCOPROC_ALLOWED(inp) && isipv6) {
3317	ip6oa.ip6oa_flags \|= IP6OAF_INTCOPROC_ALLOWED;
3318	}
3319	if (INP_MANAGEMENT_ALLOWED(inp)) {
3320	if (isipv6) {
3321	ip6oa.ip6oa_flags \|= IP6OAF_MANAGEMENT_ALLOWED;
3322	} else {
3323	ipoa.ipoa_flags \|= IPOAF_MANAGEMENT_ALLOWED;
3324	}
3325	}
3326	if (isipv6) {
3327	ip6oa.ip6oa_sotc = so->so_traffic_class;
3328	ip6oa.ip6oa_netsvctype = so->so_netsvctype;
3329	ip6oa.qos_marking_gencount = inp->inp_policyresult.results.qos_marking_gencount;
3330	} else {
3331	ipoa.ipoa_sotc = so->so_traffic_class;
3332	ipoa.ipoa_netsvctype = so->so_netsvctype;
3333	ipoa.qos_marking_gencount = inp->inp_policyresult.results.qos_marking_gencount;
3334	}
3335	if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
3336	if (isipv6) {
3337	ip6oa.ip6oa_flags \|= IP6OAF_QOSMARKING_ALLOWED;
3338	} else {
3339	ipoa.ipoa_flags \|= IPOAF_QOSMARKING_ALLOWED;
3340	}
3341	}
3342	if (check_qos_marking_again) {
3343	if (isipv6) {
3344	ip6oa.ip6oa_flags \|= IP6OAF_REDO_QOSMARKING_POLICY;
3345	} else {
3346	ipoa.ipoa_flags \|= IPOAF_REDO_QOSMARKING_POLICY;
3347	}
3348	}
3349	if (isipv6) {
3350	flags \|= IPV6_OUTARGS;
3351	} else {
3352	flags \|= IP_OUTARGS;
3353	}
3354
3355	/ Copy the cached route and take an extra reference /
3356	if (isipv6) {
3357	in6p_route_copyout(inp, &ro6);
3358	} else {
3359	inp_route_copyout(inp, &ro);
3360	}
3361	#if (DEBUG \|\| DEVELOPMENT)
3362	if ((so->so_flags & SOF_MARK_WAKE_PKT) && pkt != NULL) {
3363	so->so_flags &= ~SOF_MARK_WAKE_PKT;
3364	pkt->m_pkthdr.pkt_flags \|= PKTF_WAKE_PKT;
3365	}
3366	#endif /* (DEBUG \|\| DEVELOPMENT) */
3367
3368	/*
3369	* Make sure ACK/DELACK conditions are cleared before
3370	* we unlock the socket.
3371	*/
3372	tp->last_ack_sent = tp->rcv_nxt;
3373	tp->t_flags &= ~(TF_ACKNOW \| TF_DELACK);
3374	tp->t_timer[TCPT_DELACK] = `0`;
3375	tp->t_unacksegs = `0`;
3376	tp->t_unacksegs_ce = `0`;
3377
3378	/ Increment the count of outstanding send operations /
3379	inp->inp_sndinprog_cnt++;
3380
3381	/*
3382	* If allowed, unlock TCP socket while in IP
3383	* but only if the connection is established and
3384	* in a normal mode where reentrancy on the tcpcb won't be
3385	* an issue:
3386	* - there is no SACK episode
3387	* - we're not in Fast Recovery mode
3388	* - if we're not sending from an upcall.
3389	*/
3390	if (tcp_output_unlocked && !so->so_upcallusecount &&
3391	(tp->t_state == TCPS_ESTABLISHED) && (sack_in_progress == `0`) &&
3392	!IN_FASTRECOVERY(tp) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3393	unlocked = TRUE;
3394	socket_unlock(so, refcount: `0`);
3395	}
3396
3397	/*
3398	* Don't send down a chain of packets when:
3399	* - TCP chaining is disabled
3400	* - there is an IPsec rule set
3401	* - there is a non default rule set for the firewall
3402	*/
3403
3404	chain = tcp_packet_chaining > `1`
3405	#if IPSEC
3406	&& ipsec_bypass
3407	#endif
3408	; // I'm important, not extraneous
3409
3410	while (pkt != NULL) {
3411	struct mbuf *npkt = pkt->m_nextpkt;
3412
3413	if (!chain) {
3414	pkt->m_nextpkt = NULL;
3415	/*
3416	* If we are not chaining, make sure to set the packet
3417	* list count to 0 so that IP takes the right path;
3418	* this is important for cases such as IPsec where a
3419	* single mbuf might result in multiple mbufs as part
3420	* of the encapsulation. If a non-zero count is passed
3421	* down to IP, the head of the chain might change and
3422	* we could end up skipping it (thus generating bogus
3423	* packets). Fixing it in IP would be desirable, but
3424	* for now this would do it.
3425	*/
3426	cnt = `0`;
3427	}
3428	if (isipv6) {
3429	error = ip6_output_list(pkt, cnt,
3430	inp->in6p_outputopts, &ro6, flags, NULL, NULL,
3431	&ip6oa);
3432	ifdenied = (ip6oa.ip6oa_flags & IP6OAF_R_IFDENIED);
3433	} else {
3434	error = ip_output_list(pkt, cnt, opt, &ro, flags, NULL,
3435	&ipoa);
3436	ifdenied = (ipoa.ipoa_flags & IPOAF_R_IFDENIED);
3437	}
3438
3439	if (chain \|\| error) {
3440	/*
3441	* If we sent down a chain then we are done since
3442	* the callee had taken care of everything; else
3443	* we need to free the rest of the chain ourselves.
3444	*/
3445	if (!chain) {
3446	m_freem_list(npkt);
3447	}
3448	break;
3449	}
3450	pkt = npkt;
3451	}
3452
3453	if (unlocked) {
3454	socket_lock(so, refcount: `0`);
3455	}
3456
3457	/*
3458	* Enter flow controlled state if the connection is established
3459	* and is not in recovery. Flow control is allowed only if there
3460	* is outstanding data.
3461	*
3462	* A connection will enter suspended state even if it is in
3463	* recovery.
3464	*/
3465	if (((adv->code == FADV_FLOW_CONTROLLED && !IN_FASTRECOVERY(tp)) \|\|
3466	adv->code == FADV_SUSPENDED) &&
3467	!(tp->t_flags & TF_CLOSING) &&
3468	tp->t_state == TCPS_ESTABLISHED &&
3469	SEQ_GT(tp->snd_max, tp->snd_una)) {
3470	int rc;
3471	rc = inp_set_fc_state(inp, advcode: adv->code);
3472
3473	if (rc == `1`) {
3474	tcp_ccdbg_trace(tp, NULL,
3475	event: ((adv->code == FADV_FLOW_CONTROLLED) ?
3476	TCP_CC_FLOW_CONTROL : TCP_CC_SUSPEND));
3477	if (adv->code == FADV_FLOW_CONTROLLED) {
3478	TCP_LOG_OUTPUT(tp, "flow controlled");
3479	} else {
3480	TCP_LOG_OUTPUT(tp, "flow suspended");
3481	}
3482	}
3483	}
3484
3485	/*
3486	* When an interface queue gets suspended, some of the
3487	* packets are dropped. Return ENOBUFS, to update the
3488	* pcb state.
3489	*/
3490	if (adv->code == FADV_SUSPENDED) {
3491	error = ENOBUFS;
3492	}
3493
3494	VERIFY(inp->inp_sndinprog_cnt > `0`);
3495	if (--inp->inp_sndinprog_cnt == `0`) {
3496	inp->inp_flags &= ~(INP_FC_FEEDBACK);
3497	if (inp->inp_sndingprog_waiters > `0`) {
3498	wakeup(chan: &inp->inp_sndinprog_cnt);
3499	}
3500	}
3501
3502	if (isipv6) {
3503	/*
3504	* When an NECP IP tunnel policy forces the outbound interface,
3505	* ip6_output_list() informs the transport layer what is the actual
3506	* outgoing interface
3507	*/
3508	if (ip6oa.ip6oa_flags & IP6OAF_BOUND_IF) {
3509	outif = ifindex2ifnet[ip6oa.ip6oa_boundif];
3510	} else if (ro6.ro_rt != NULL) {
3511	outif = ro6.ro_rt->rt_ifp;
3512	}
3513	} else {
3514	if (ro.ro_rt != NULL) {
3515	outif = ro.ro_rt->rt_ifp;
3516	}
3517	}
3518	if (check_qos_marking_again) {
3519	uint32_t qos_marking_gencount;
3520	bool allow_qos_marking;
3521	if (isipv6) {
3522	qos_marking_gencount = ip6oa.qos_marking_gencount;
3523	allow_qos_marking = ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED ? TRUE : FALSE;
3524	} else {
3525	qos_marking_gencount = ipoa.qos_marking_gencount;
3526	allow_qos_marking = ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE;
3527	}
3528	inp->inp_policyresult.results.qos_marking_gencount = qos_marking_gencount;
3529	if (allow_qos_marking == TRUE) {
3530	inp->inp_socket->so_flags1 \|= SOF1_QOSMARKING_ALLOWED;
3531	} else {
3532	inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
3533	}
3534	}
3535
3536	if (outif != NULL && outif != inp->inp_last_outifp) {
3537	/ Update the send byte count /
3538	if (so->so_snd.sb_cc > `0` && so->so_snd.sb_flags & SB_SNDBYTE_CNT) {
3539	inp_decr_sndbytes_total(so, so->so_snd.sb_cc);
3540	inp_decr_sndbytes_allunsent(so, tp->snd_una);
3541	so->so_snd.sb_flags &= ~SB_SNDBYTE_CNT;
3542	}
3543	inp->inp_last_outifp = outif;
3544	#if SKYWALK
3545	if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3546	netns_set_ifnet(token: &inp->inp_netns_token, ifp: inp->inp_last_outifp);
3547	}
3548	#endif /* SKYWALK */
3549	}
3550
3551	if (error != `0` && ifdenied &&
3552	(INP_NO_CELLULAR(inp) \|\| INP_NO_EXPENSIVE(inp) \|\| INP_NO_CONSTRAINED(inp))) {
3553	soevent(so,
3554	hint: (SO_FILT_HINT_LOCKED \| SO_FILT_HINT_IFDENIED));
3555	}
3556
3557	/ Synchronize cached PCB route & options /
3558	if (isipv6) {
3559	in6p_route_copyin(inp, &ro6);
3560	} else {
3561	inp_route_copyin(inp, &ro);
3562	}
3563
3564	if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift == `0` &&
3565	tp->t_inpcb->inp_route.ro_rt != NULL) {
3566	/ If we found the route and there is an rtt on it*
3567	* reset the retransmit timer
3568	*/
3569	tcp_getrt_rtt(tp, rt: tp->t_inpcb->in6p_route.ro_rt);
3570	tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
3571	}
3572	return error;
3573	#undef ro
3574	#undef ro6
3575	#undef ipoa
3576	#undef ip6oa
3577	}
3578
3579	int tcptv_persmin_val = TCPTV_PERSMIN;
3580
3581	void
3582	tcp_setpersist(struct tcpcb *tp)
3583	{
3584	int t = ((tp->t_srtt >> `2`) + tp->t_rttvar) >> `1`;
3585
3586	/ If a PERSIST_TIMER option was set we will limit the*
3587	* time the persist timer will be active for that connection
3588	* in order to avoid DOS by using zero window probes.
3589	* see rdar://5805356
3590	*/
3591
3592	if (tp->t_persist_timeout != `0` &&
3593	tp->t_timer[TCPT_PERSIST] == `0` &&
3594	tp->t_persist_stop == `0`) {
3595	tp->t_persist_stop = tcp_now + tp->t_persist_timeout;
3596	}
3597
3598	/*
3599	* Start/restart persistance timer.
3600	*/
3601	TCPT_RANGESET(tp->t_timer[TCPT_PERSIST],
3602	t * tcp_backoff[tp->t_rxtshift],
3603	tcptv_persmin_val, TCPTV_PERSMAX, `0`);
3604	tp->t_timer[TCPT_PERSIST] = OFFSET_FROM_START(tp, tp->t_timer[TCPT_PERSIST]);
3605
3606	if (tp->t_rxtshift < TCP_MAXRXTSHIFT) {
3607	tp->t_rxtshift++;
3608	}
3609	}
3610
3611	static int
3612	tcp_recv_throttle(struct tcpcb *tp)
3613	{
3614	uint32_t base_rtt, newsize;
3615	struct sockbuf *sbrcv = &tp->t_inpcb->inp_socket->so_rcv;
3616
3617	if (tcp_use_rtt_recvbg == `1` &&
3618	TSTMP_SUPPORTED(tp)) {
3619	/*
3620	* Timestamps are supported on this connection. Use
3621	* RTT to look for an increase in latency.
3622	*/
3623
3624	/*
3625	* If the connection is already being throttled, leave it
3626	* in that state until rtt comes closer to base rtt
3627	*/
3628	if (tp->t_flagsext & TF_RECV_THROTTLE) {
3629	return `1`;
3630	}
3631
3632	base_rtt = get_base_rtt(tp);
3633
3634	if (base_rtt != `0` && tp->t_rttcur != `0`) {
3635	/*
3636	* if latency increased on a background flow,
3637	* return 1 to start throttling.
3638	*/
3639	if (tp->t_rttcur > (base_rtt + target_qdelay)) {
3640	tp->t_flagsext \|= TF_RECV_THROTTLE;
3641	if (tp->t_recv_throttle_ts == `0`) {
3642	tp->t_recv_throttle_ts = tcp_now;
3643	}
3644	/*
3645	* Reduce the recv socket buffer size to
3646	* minimize latecy.
3647	*/
3648	if (sbrcv->sb_idealsize >
3649	tcp_recv_throttle_minwin) {
3650	newsize = sbrcv->sb_idealsize >> `1`;
3651	/ Set a minimum of 16 K /
3652	newsize =
3653	max(a: newsize,
3654	b: tcp_recv_throttle_minwin);
3655	sbrcv->sb_idealsize = newsize;
3656	}
3657	return `1`;
3658	} else {
3659	return `0`;
3660	}
3661	}
3662	}
3663
3664	/*
3665	* Timestamps are not supported or there is no good RTT
3666	* measurement. Use IPDV in this case.
3667	*/
3668	if (tp->acc_iaj > tcp_acc_iaj_react_limit) {
3669	return `1`;
3670	}
3671
3672	return `0`;
3673	}
3674

Browse the source code of xnu/bsd/netinet/tcp_output.c