tcp_sack.c source code [xnu/bsd/netinet/tcp_sack.c]

1	/*
2	* Copyright (c) 2004-2016 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30	* The Regents of the University of California. All rights reserved.
31	*
32	* Redistribution and use in source and binary forms, with or without
33	* modification, are permitted provided that the following conditions
34	* are met:
35	* 1. Redistributions of source code must retain the above copyright
36	* notice, this list of conditions and the following disclaimer.
37	* 2. Redistributions in binary form must reproduce the above copyright
38	* notice, this list of conditions and the following disclaimer in the
39	* documentation and/or other materials provided with the distribution.
40	* 3. All advertising materials mentioning features or use of this software
41	* must display the following acknowledgement:
42	* This product includes software developed by the University of
43	* California, Berkeley and its contributors.
44	* 4. Neither the name of the University nor the names of its contributors
45	* may be used to endorse or promote products derived from this software
46	* without specific prior written permission.
47	*
48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58	* SUCH DAMAGE.
59	*
60	*/
61
62	#define _IP_VHL
63
64
65	#include <sys/param.h>
66	#include <sys/systm.h>
67	#include <sys/kernel.h>
68	#include <sys/sysctl.h>
69	#include <sys/mbuf.h>
70	#include <sys/domain.h>
71	#include <sys/protosw.h>
72	#include <sys/socket.h>
73	#include <sys/socketvar.h>
74
75	#include <kern/zalloc.h>
76
77	#include <net/route.h>
78
79	#include <netinet/in.h>
80	#include <netinet/in_systm.h>
81	#include <netinet/ip.h>
82	#include <netinet/in_pcb.h>
83	#include <netinet/ip_var.h>
84	#include <netinet6/in6_pcb.h>
85	#include <netinet/ip6.h>
86	#include <netinet6/ip6_var.h>
87	#include <netinet/tcp.h>
88	#include <netinet/tcp_fsm.h>
89	#include <netinet/tcp_seq.h>
90	#include <netinet/tcp_timer.h>
91	#include <netinet/tcp_var.h>
92	#include <netinet/tcpip.h>
93	#include <netinet/tcp_cache.h>
94	#if TCPDEBUG
95	#include <netinet/tcp_debug.h>
96	#endif
97	#include <sys/kdebug.h>
98
99	#if IPSEC
100	#include <netinet6/ipsec.h>
#endif /* IPSEC */
102
103	#include <libkern/OSAtomic.h>
104
105	SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack, CTLFLAG_RW \| CTLFLAG_LOCKED,
106	int, tcp_do_sack, `1`, "Enable/Disable TCP SACK support");
107	SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_maxholes, CTLFLAG_RW \| CTLFLAG_LOCKED,
108	static int, tcp_sack_maxholes, `128`,
109	"Maximum number of TCP SACK holes allowed per connection");
110
111	SYSCTL_SKMEM_TCP_INT(OID_AUTO, sack_globalmaxholes,
112	CTLFLAG_RW \| CTLFLAG_LOCKED, static int, tcp_sack_globalmaxholes, `65536`,
113	"Global maximum number of TCP SACK holes");
114
115	static SInt32 tcp_sack_globalholes = `0`;
116	SYSCTL_INT(_net_inet_tcp, OID_AUTO, sack_globalholes, CTLFLAG_RD \| CTLFLAG_LOCKED,
117	&tcp_sack_globalholes, `0`,
118	"Global number of TCP SACK holes currently allocated");
119
120	static KALLOC_TYPE_DEFINE(sack_hole_zone, struct sackhole, NET_KT_DEFAULT);
121
/*
 * Evaluates to non-zero when the SACK block _sb_ (a pointer to an object
 * with 'start' and 'end' members) is acceptable for connection _tp_ given
 * the cumulative acknowledgement _ack_:
 *   - the block is non-empty (end is after start),
 *   - start is after both snd_una and _ack_ and before snd_max,
 *   - end is after snd_una and not beyond snd_max.
 * All comparisons use the SEQ_* sequence-number comparison macros.
 * Note that every argument is evaluated more than once.
 */
#define TCP_VALIDATE_SACK_SEQ_NUMBERS(_tp_, _sb_, _ack_) \
	(SEQ_GT((_sb_)->end, (_sb_)->start) && \
	SEQ_GT((_sb_)->start, (_tp_)->snd_una) && \
	SEQ_GT((_sb_)->start, (_ack_)) && \
	SEQ_LT((_sb_)->start, (_tp_)->snd_max) && \
	SEQ_GT((_sb_)->end, (_tp_)->snd_una) && \
	SEQ_LEQ((_sb_)->end, (_tp_)->snd_max))
129
130	/*
131	* This function is called upon receipt of new valid data (while not in header
132	* prediction mode), and it updates the ordered list of sacks.
133	*/
134	void
135	tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
136	{
137	/*
138	* First reported block MUST be the most recent one. Subsequent
139	* blocks SHOULD be in the order in which they arrived at the
140	* receiver. These two conditions make the implementation fully
141	* compliant with RFC 2018.
142	*/
143	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
144	int num_head, num_saved, i;
145
146	/ SACK block for the received segment. /
147	head_blk.start = rcv_start;
148	head_blk.end = rcv_end;
149
150	/*
151	* Merge updated SACK blocks into head_blk, and
152	* save unchanged SACK blocks into saved_blks[].
153	* num_saved will have the number of the saved SACK blocks.
154	*/
155	num_saved = `0`;
156	for (i = `0`; i < tp->rcv_numsacks; i++) {
157	tcp_seq start = tp->sackblks[i].start;
158	tcp_seq end = tp->sackblks[i].end;
159	if (SEQ_GEQ(start, end) \|\| SEQ_LEQ(start, tp->rcv_nxt)) {
160	/*
161	* Discard this SACK block.
162	*/
163	} else if (SEQ_LEQ(head_blk.start, end) &&
164	SEQ_GEQ(head_blk.end, start)) {
165	/*
166	* Merge this SACK block into head_blk.
167	* This SACK block itself will be discarded.
168	*/
169	if (SEQ_GT(head_blk.start, start)) {
170	head_blk.start = start;
171	}
172	if (SEQ_LT(head_blk.end, end)) {
173	head_blk.end = end;
174	}
175	} else {
176	/*
177	* Save this SACK block.
178	*/
179	saved_blks[num_saved].start = start;
180	saved_blks[num_saved].end = end;
181	num_saved++;
182	}
183	}
184
185	/*
186	* Update SACK list in tp->sackblks[].
187	*/
188	num_head = `0`;
189	if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
190	/*
191	* The received data segment is an out-of-order segment.
192	* Put head_blk at the top of SACK list.
193	*/
194	tp->sackblks[`0`] = head_blk;
195	num_head = `1`;
196	/*
197	* If the number of saved SACK blocks exceeds its limit,
198	* discard the last SACK block.
199	*/
200	if (num_saved >= MAX_SACK_BLKS) {
201	num_saved--;
202	}
203	}
204	if (num_saved > `0`) {
205	/*
206	* Copy the saved SACK blocks back.
207	*/
208	bcopy(src: saved_blks, dst: &tp->sackblks[num_head], n: sizeof(struct sackblk) * num_saved);
209	}
210
211	/ Save the number of SACK blocks. /
212	tp->rcv_numsacks = num_head + num_saved;
213
214	/ If we are requesting SACK recovery, reset the stretch-ack state*
215	* so that connection will generate more acks after recovery and
216	* sender's cwnd will open.
217	*/
218	if ((tp->t_flags & TF_STRETCHACK) != `0` && tp->rcv_numsacks > `0`) {
219	tcp_reset_stretch_ack(tp);
220	}
221	if (tp->rcv_numsacks > `0`) {
222	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
223	}
224
225	#if TRAFFIC_MGT
226	if (tp->acc_iaj > `0` && tp->rcv_numsacks > `0`) {
227	reset_acc_iaj(tp);
228	}
229	#endif /* TRAFFIC_MGT */
230	}
231
232	/*
233	* Delete all receiver-side SACK information.
234	*/
235	void
236	tcp_clean_sackreport( struct tcpcb *tp)
237	{
238	tp->rcv_numsacks = `0`;
239	bzero(s: &tp->sackblks[`0`], n: sizeof(struct sackblk) * MAX_SACK_BLKS);
240	}
241
242	/*
243	* Allocate struct sackhole.
244	*/
245	static struct sackhole *
246	tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
247	{
248	struct sackhole *hole;
249
250	if (tp->snd_numholes >= tcp_sack_maxholes \|\|
251	tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
252	tcpstat.tcps_sack_sboverflow++;
253	return NULL;
254	}
255
256	hole = zalloc_flags(sack_hole_zone, Z_WAITOK \| Z_NOFAIL);
257
258	hole->start = start;
259	hole->end = end;
260	hole->rxmit = start;
261
262	tp->snd_numholes++;
263	OSIncrementAtomic(&tcp_sack_globalholes);
264
265	return hole;
266	}
267
268	/*
269	* Free struct sackhole.
270	*/
271	static void
272	tcp_sackhole_free(struct tcpcb tp, struct* sackhole *hole)
273	{
274	zfree(sack_hole_zone, hole);
275
276	tp->snd_numholes--;
277	OSDecrementAtomic(&tcp_sack_globalholes);
278	}
279
280	/*
281	* Insert new SACK hole into scoreboard.
282	*/
283	static struct sackhole *
284	tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
285	struct sackhole *after)
286	{
287	struct sackhole *hole;
288
289	/ Allocate a new SACK hole. /
290	hole = tcp_sackhole_alloc(tp, start, end);
291	if (hole == NULL) {
292	return NULL;
293	}
294	hole->rxmit_start = tcp_now;
295	/ Insert the new SACK hole into scoreboard /
296	if (after != NULL) {
297	TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
298	} else {
299	TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
300	}
301
302	/ Update SACK hint. /
303	if (tp->sackhint.nexthole == NULL) {
304	tp->sackhint.nexthole = hole;
305	}
306
307	return hole;
308	}
309
310	/*
311	* Remove SACK hole from scoreboard.
312	*/
313	static void
314	tcp_sackhole_remove(struct tcpcb tp, struct* sackhole *hole)
315	{
316	/ Update SACK hint. /
317	if (tp->sackhint.nexthole == hole) {
318	tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
319	}
320
321	/ Remove this SACK hole. /
322	TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
323
324	/ Free this SACK hole. /
325	tcp_sackhole_free(tp, hole);
326	}
327	/*
328	* When a new ack with SACK is received, check if it indicates packet
329	* reordering. If there is packet reordering, the socket is marked and
330	* the late time offset by which the packet was reordered with
331	* respect to its closest neighboring packets is computed.
332	*/
333	static void
334	tcp_sack_detect_reordering(struct tcpcb tp, struct* sackhole *s,
335	tcp_seq sacked_seq, tcp_seq snd_fack)
336	{
337	int32_t rext = `0`, reordered = `0`;
338
339	/*
340	* If the SACK hole is past snd_fack, this is from new SACK
341	* information, so we can ignore it.
342	*/
343	if (SEQ_GT(s->end, snd_fack)) {
344	return;
345	}
346	/*
347	* If there has been a retransmit timeout, then the timestamp on
348	* the SACK segment will be newer. This might lead to a
349	* false-positive. Avoid re-ordering detection in this case.
350	*/
351	if (tp->t_rxtshift > `0`) {
352	return;
353	}
354
355	/*
356	* Detect reordering from SACK information by checking
357	* if recently sacked data was never retransmitted from this hole.
358	*
359	* First, we look for the byte in the list of retransmitted segments. This one
360	* will contain even the segments that are retransmitted thanks to RTO/TLP.
361	*
362	* Then, we check the sackhole which indicates whether or not the sackhole
363	* was subject to retransmission.
364	*/
365	if (SEQ_LT(s->rxmit, sacked_seq) &&
366	(!tcp_do_better_lr \|\| tcp_rxtseg_find(tp, sacked_seq - `1`, sacked_seq - `1`) == NULL)) {
367	reordered = `1`;
368	tcpstat.tcps_avoid_rxmt++;
369	}
370
371	if (reordered) {
372	if (!(tp->t_flagsext & TF_PKTS_REORDERED)) {
373	tp->t_flagsext \|= TF_PKTS_REORDERED;
374	tcpstat.tcps_detect_reordering++;
375	}
376
377	tcpstat.tcps_reordered_pkts++;
378	tp->t_reordered_pkts++;
379
380	/*
381	* If reordering is seen on a connection wth ECN enabled,
382	* increment the heuristic
383	*/
384	if (TCP_ECN_ENABLED(tp)) {
385	INP_INC_IFNET_STAT(tp->t_inpcb, ecn_fallback_reorder);
386	tcpstat.tcps_ecn_fallback_reorder++;
387	tcp_heuristic_ecn_aggressive(tp);
388	}
389
390	VERIFY(SEQ_GEQ(snd_fack, s->rxmit));
391
392	if (s->rxmit_start > `0`) {
393	rext = timer_diff(t1: tcp_now, toff1: `0`, t2: s->rxmit_start, toff2: `0`);
394	if (rext < `0`) {
395	return;
396	}
397
398	/*
399	* We take the maximum reorder window to schedule
400	* DELAYFR timer as that will take care of jitter
401	* on the network path.
402	*
403	* Computing average and standard deviation seems
404	* to cause unnecessary retransmissions when there
405	* is high jitter.
406	*
407	* We set a maximum of SRTT/2 and a minimum of
408	* 10 ms on the reorder window.
409	*/
410	tp->t_reorderwin = max(a: tp->t_reorderwin, b: rext);
411	tp->t_reorderwin = min(a: tp->t_reorderwin,
412	b: (tp->t_srtt >> (TCP_RTT_SHIFT - `1`)));
413	tp->t_reorderwin = max(a: tp->t_reorderwin, b: `10`);
414	}
415	}
416	}
417
418	static void
419	tcp_sack_update_byte_counter(struct tcpcb *tp, uint32_t start, uint32_t end,
420	uint32_t newbytes_acked, uint32_t towards_fr_acked)
421	{
422	*newbytes_acked += (end - start);
423	if (SEQ_GEQ(start, tp->send_highest_sack)) {
424	*towards_fr_acked += (end - start);
425	}
426	}
427
428	/*
429	* Process cumulative ACK and the TCP SACK option to update the scoreboard.
430	* tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
431	* the sequence space).
432	*/
433	void
434	tcp_sack_doack(struct tcpcb tp, struct* tcpopt to, struct* tcphdr *th,
435	u_int32_t newbytes_acked, uint32_t after_rexmit_acked)
436	{
437	struct sackhole cur, temp;
438	struct sackblk sack, sack_blocks[TCP_MAX_SACK + `1`], *sblkp;
439	int i, j, num_sack_blks;
440	tcp_seq old_snd_fack = `0`, th_ack = th->th_ack;
441
442	num_sack_blks = `0`;
443	/*
444	* If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
445	* treat [SND.UNA, SEG.ACK) as if it is a SACK block.
446	*/
447	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
448	sack_blocks[num_sack_blks].start = tp->snd_una;
449	sack_blocks[num_sack_blks++].end = th_ack;
450	}
451	/*
452	* Append received valid SACK blocks to sack_blocks[].
453	* Check that the SACK block range is valid.
454	*/
455	for (i = `0`; i < to->to_nsacks; i++) {
456	bcopy(src: (to->to_sacks + i * TCPOLEN_SACK),
457	dst: &sack, n: sizeof(sack));
458	sack.start = ntohl(sack.start);
459	sack.end = ntohl(sack.end);
460	if (TCP_VALIDATE_SACK_SEQ_NUMBERS(tp, &sack, th_ack)) {
461	sack_blocks[num_sack_blks++] = sack;
462	}
463	}
464
465	/*
466	* Return if SND.UNA is not advanced and no valid SACK block
467	* is received.
468	*/
469	if (num_sack_blks == `0`) {
470	return;
471	}
472
473	VERIFY(num_sack_blks <= (TCP_MAX_SACK + `1`));
474	/*
475	* Sort the SACK blocks so we can update the scoreboard
476	* with just one pass. The overhead of sorting upto 4+1 elements
477	* is less than making upto 4+1 passes over the scoreboard.
478	*/
479	for (i = `0`; i < num_sack_blks; i++) {
480	for (j = i + `1`; j < num_sack_blks; j++) {
481	if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
482	sack = sack_blocks[i];
483	sack_blocks[i] = sack_blocks[j];
484	sack_blocks[j] = sack;
485	}
486	}
487	}
488	if (TAILQ_EMPTY(&tp->snd_holes)) {
489	/*
490	* Empty scoreboard. Need to initialize snd_fack (it may be
491	* uninitialized or have a bogus value). Scoreboard holes
492	* (from the sack blocks received) are created later below (in
493	* the logic that adds holes to the tail of the scoreboard).
494	*/
495	tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
496	}
497
498	old_snd_fack = tp->snd_fack;
499	/*
500	* In the while-loop below, incoming SACK blocks (sack_blocks[])
501	* and SACK holes (snd_holes) are traversed from their tails with
502	* just one pass in order to reduce the number of compares especially
503	* when the bandwidth-delay product is large.
504	* Note: Typically, in the first RTT of SACK recovery, the highest
505	* three or four SACK blocks with the same ack number are received.
506	* In the second RTT, if retransmitted data segments are not lost,
507	* the highest three or four SACK blocks with ack number advancing
508	* are received.
509	*/
510	sblkp = &sack_blocks[num_sack_blks - `1`]; / Last SACK block /
511	if (SEQ_LT(tp->snd_fack, sblkp->start)) {
512	/*
513	* The highest SACK block is beyond fack.
514	* Append new SACK hole at the tail.
515	* If the second or later highest SACK blocks are also
516	* beyond the current fack, they will be inserted by
517	* way of hole splitting in the while-loop below.
518	*/
519	temp = tcp_sackhole_insert(tp, start: tp->snd_fack, end: sblkp->start, NULL);
520	if (temp != NULL) {
521	tp->snd_fack = sblkp->end;
522	tcp_sack_update_byte_counter(tp, start: sblkp->start, end: sblkp->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
523
524	/ Go to the previous sack block. /
525	sblkp--;
526	} else {
527	/*
528	* We failed to add a new hole based on the current
529	* sack block. Skip over all the sack blocks that
530	* fall completely to the right of snd_fack and proceed
531	* to trim the scoreboard based on the remaining sack
532	* blocks. This also trims the scoreboard for th_ack
533	* (which is sack_blocks[0]).
534	*/
535	while (sblkp >= sack_blocks &&
536	SEQ_LT(tp->snd_fack, sblkp->start)) {
537	sblkp--;
538	}
539	if (sblkp >= sack_blocks &&
540	SEQ_LT(tp->snd_fack, sblkp->end)) {
541	tcp_sack_update_byte_counter(tp, start: tp->snd_fack, end: sblkp->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
542	tp->snd_fack = sblkp->end;
543	}
544	}
545	} else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
546	/ fack is advanced. /
547	tcp_sack_update_byte_counter(tp, start: tp->snd_fack, end: sblkp->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
548	tp->snd_fack = sblkp->end;
549	}
550	/ We must have at least one SACK hole in scoreboard /
551	cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); / Last SACK hole /
552	/*
553	* Since the incoming sack blocks are sorted, we can process them
554	* making one sweep of the scoreboard.
555	*/
556	while (sblkp >= sack_blocks && cur != NULL) {
557	if (SEQ_GEQ(sblkp->start, cur->end)) {
558	/*
559	* SACKs data beyond the current hole.
560	* Go to the previous sack block.
561	*/
562	sblkp--;
563	continue;
564	}
565	if (SEQ_LEQ(sblkp->end, cur->start)) {
566	/*
567	* SACKs data before the current hole.
568	* Go to the previous hole.
569	*/
570	cur = TAILQ_PREV(cur, sackhole_head, scblink);
571	continue;
572	}
573	tp->sackhint.sack_bytes_rexmit -= (cur->rxmit - cur->start);
574	if (tp->sackhint.sack_bytes_rexmit < `0`) {
575	tp->sackhint.sack_bytes_rexmit = `0`;
576	}
577
578	if (SEQ_LEQ(sblkp->start, cur->start)) {
579	/ Data acks at least the beginning of hole /
580	if (SEQ_GEQ(sblkp->end, cur->end)) {
581	/ Acks entire hole, so delete hole /
582	tcp_sack_update_byte_counter(tp, start: cur->start, end: cur->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
583
584	tcp_sack_detect_reordering(tp, s: cur,
585	sacked_seq: cur->end, snd_fack: old_snd_fack);
586	temp = cur;
587	cur = TAILQ_PREV(cur, sackhole_head, scblink);
588	tcp_sackhole_remove(tp, hole: temp);
589	/*
590	* The sack block may ack all or part of the next
591	* hole too, so continue onto the next hole.
592	*/
593	continue;
594	} else {
595	/ Move start of hole forward /
596	tcp_sack_update_byte_counter(tp, start: cur->start, end: sblkp->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
597	tcp_sack_detect_reordering(tp, s: cur,
598	sacked_seq: sblkp->end, snd_fack: old_snd_fack);
599	cur->start = sblkp->end;
600	cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
601	}
602	} else {
603	/ Data acks at least the end of hole /
604	if (SEQ_GEQ(sblkp->end, cur->end)) {
605	/ Move end of hole backward /
606	tcp_sack_update_byte_counter(tp, start: sblkp->start, end: cur->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
607	tcp_sack_detect_reordering(tp, s: cur,
608	sacked_seq: cur->end, snd_fack: old_snd_fack);
609	cur->end = sblkp->start;
610	cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
611	} else {
612	/*
613	* ACKs some data in the middle of a hole;
614	* need to split current hole
615	*/
616	tcp_sack_detect_reordering(tp, s: cur,
617	sacked_seq: sblkp->end, snd_fack: old_snd_fack);
618	temp = tcp_sackhole_insert(tp, start: sblkp->end,
619	end: cur->end, after: cur);
620	if (temp != NULL) {
621	tcp_sack_update_byte_counter(tp, start: sblkp->start, end: sblkp->end, newbytes_acked, towards_fr_acked: after_rexmit_acked);
622	if (SEQ_GT(cur->rxmit, temp->rxmit)) {
623	temp->rxmit = cur->rxmit;
624	tp->sackhint.sack_bytes_rexmit
625	+= (temp->rxmit
626	- temp->start);
627	}
628	cur->end = sblkp->start;
629	cur->rxmit = SEQ_MIN(cur->rxmit,
630	cur->end);
631	/*
632	* Reset the rxmit_start to that of
633	* the current hole as that will
634	* help to compute the reorder
635	* window correctly
636	*/
637	temp->rxmit_start = cur->rxmit_start;
638	}
639	}
640	}
641	tp->sackhint.sack_bytes_rexmit += (cur->rxmit - cur->start);
642	/*
643	* Testing sblkp->start against cur->start tells us whether
644	* we're done with the sack block or the sack hole.
645	* Accordingly, we advance one or the other.
646	*/
647	if (SEQ_LEQ(sblkp->start, cur->start)) {
648	cur = TAILQ_PREV(cur, sackhole_head, scblink);
649	} else {
650	sblkp--;
651	}
652	}
653	}
654
655	/*
656	* Free all SACK holes to clear the scoreboard.
657	*/
658	void
659	tcp_free_sackholes(struct tcpcb *tp)
660	{
661	struct sackhole *q;
662
663	while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
664	tcp_sackhole_remove(tp, hole: q);
665	}
666	tp->sackhint.sack_bytes_rexmit = `0`;
667	tp->sackhint.sack_bytes_acked = `0`;
668	tp->t_new_dupacks = `0`;
669	tp->sackhint.nexthole = NULL;
670	tp->sack_newdata = `0`;
671	}
672
673	/*
674	* Partial ack handling within a sack recovery episode.
675	* Keeping this very simple for now. When a partial ack
676	* is received, force snd_cwnd to a value that will allow
677	* the sender to transmit no more than 2 segments.
678	* If necessary, a better scheme can be adopted at a
679	* later point, but for now, the goal is to prevent the
680	* sender from bursting a large amount of data in the midst
681	* of sack recovery.
682	*/
683	void
684	tcp_sack_partialack(struct tcpcb tp, struct* tcphdr *th)
685	{
686	int num_segs = `1`;
687
688	tp->t_timer[TCPT_REXMT] = `0`;
689	tp->t_rtttime = `0`;
690	/ send one or 2 segments based on how much new data was acked /
691	if (((BYTES_ACKED(th, tp)) / tp->t_maxseg) > `2`) {
692	num_segs = `2`;
693	}
694	if (tcp_do_better_lr) {
695	tp->snd_cwnd = tcp_flight_size(tp) + num_segs * tp->t_maxseg;
696	} else {
697	tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
698	(tp->snd_nxt - tp->sack_newdata) +
699	num_segs * tp->t_maxseg);
700	}
701	if (tp->snd_cwnd > tp->snd_ssthresh) {
702	tp->snd_cwnd = tp->snd_ssthresh;
703	}
704	if (SEQ_LT(tp->snd_fack, tp->snd_recover) &&
705	tp->snd_fack == th->th_ack && TAILQ_EMPTY(&tp->snd_holes)) {
706	struct sackhole *temp;
707	/*
708	* we received a partial ack but there is no sack_hole
709	* that will cover the remaining seq space. In this case,
710	* create a hole from snd_fack to snd_recover so that
711	* the sack recovery will continue.
712	*/
713	temp = tcp_sackhole_insert(tp, start: tp->snd_fack,
714	end: tp->snd_recover, NULL);
715	if (temp != NULL) {
716	tp->snd_fack = tp->snd_recover;
717	}
718	}
719	(void) tcp_output(tp);
720	}
721
722	/*
723	* Debug version of tcp_sack_output() that walks the scoreboard. Used for
724	* now to sanity check the hint.
725	*/
726	static struct sackhole *
727	tcp_sack_output_debug(struct tcpcb tp, int* *sack_bytes_rexmt)
728	{
729	struct sackhole *p;
730
731	*sack_bytes_rexmt = `0`;
732	TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
733	if (SEQ_LT(p->rxmit, p->end)) {
734	if (SEQ_LT(p->rxmit, tp->snd_una)) {/ old SACK hole /
735	continue;
736	}
737	*sack_bytes_rexmt += (p->rxmit - p->start);
738	break;
739	}
740	*sack_bytes_rexmt += (p->rxmit - p->start);
741	}
742	return p;
743	}
744
745	/*
746	* Returns the next hole to retransmit and the number of retransmitted bytes
747	* from the scoreboard. We store both the next hole and the number of
748	* retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
749	* reception). This avoids scoreboard traversals completely.
750	*
751	* The loop here will traverse at most one link. Here's the argument.
752	* For the loop to traverse more than 1 link before finding the next hole to
753	* retransmit, we would need to have at least 1 node following the current hint
754	* with (rxmit == end). But, for all holes following the current hint,
755	* (start == rxmit), since we have not yet retransmitted from them. Therefore,
756	* in order to traverse more 1 link in the loop below, we need to have at least
757	* one node following the current hint with (start == rxmit == end).
758	* But that can't happen, (start == end) means that all the data in that hole
759	* has been sacked, in which case, the hole would have been removed from the
760	* scoreboard.
761	*/
762	struct sackhole *
763	tcp_sack_output(struct tcpcb tp, int* *sack_bytes_rexmt)
764	{
765	struct sackhole hole = NULL, dbg_hole = NULL;
766	int dbg_bytes_rexmt;
767
768	dbg_hole = tcp_sack_output_debug(tp, sack_bytes_rexmt: &dbg_bytes_rexmt);
769	*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
770	hole = tp->sackhint.nexthole;
771	if (hole == NULL \|\| SEQ_LT(hole->rxmit, hole->end)) {
772	goto out;
773	}
774	while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
775	if (SEQ_LT(hole->rxmit, hole->end)) {
776	tp->sackhint.nexthole = hole;
777	break;
778	}
779	}
780	out:
781	if (dbg_hole != hole) {
782	printf("%s: Computed sack hole not the same as cached value\n", __func__);
783	hole = dbg_hole;
784	}
785	if (*sack_bytes_rexmt != dbg_bytes_rexmt) {
786	printf("%s: Computed sack_bytes_retransmitted (%d) not "
787	"the same as cached value (%d)\n",
788	__func__, dbg_bytes_rexmt, *sack_bytes_rexmt);
789	*sack_bytes_rexmt = dbg_bytes_rexmt;
790	}
791	return hole;
792	}
793
794	void
795	tcp_sack_lost_rexmit(struct tcpcb *tp)
796	{
797	struct sackhole *hole = TAILQ_FIRST(&tp->snd_holes);
798
799	while (hole) {
800	hole->rxmit = hole->start;
801	hole->rxmit_start = tcp_now;
802
803	hole = TAILQ_NEXT(hole, scblink);
804	}
805
806	tp->sackhint.nexthole = TAILQ_FIRST(&tp->snd_holes);
807	tp->sackhint.sack_bytes_rexmit = `0`;
808	tp->sack_newdata = tp->snd_nxt;
809	}
810
811	/*
812	* After a timeout, the SACK list may be rebuilt. This SACK information
813	* should be used to avoid retransmitting SACKed data. This function
814	* traverses the SACK list to see if snd_nxt should be moved forward.
815	*/
816	uint32_t
817	tcp_sack_adjust(struct tcpcb *tp)
818	{
819	struct sackhole p, cur = TAILQ_FIRST(&tp->snd_holes);
820
821	if (cur == NULL) {
822	return `0`; / No holes /
823	}
824	if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) {
825	return `0`; / We're already beyond any SACKed blocks /
826	}
827	/*
828	* Two cases for which we want to advance snd_nxt:
829	* i) snd_nxt lies between end of one hole and beginning of another
830	* ii) snd_nxt lies between end of last hole and snd_fack
831	*/
832	while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
833	if (SEQ_LT(tp->snd_nxt, cur->end)) {
834	return cur->end - tp->snd_nxt;
835	}
836	if (SEQ_GEQ(tp->snd_nxt, p->start)) {
837	cur = p;
838	} else {
839	tp->snd_nxt = p->start;
840	return p->end - tp->snd_nxt;
841	}
842	}
843	if (SEQ_LT(tp->snd_nxt, cur->end)) {
844	return cur->end - tp->snd_nxt;
845	}
846	tp->snd_nxt = tp->snd_fack;
847	return `0`;
848	}
849
850	/*
851	* This function returns TRUE if more than (tcprexmtthresh - 1) * SMSS
852	* bytes with sequence numbers greater than snd_una have been SACKed.
853	*/
854	boolean_t
855	tcp_sack_byte_islost(struct tcpcb *tp)
856	{
857	u_int32_t unacked_bytes, sndhole_bytes = `0`;
858	struct sackhole *sndhole;
859	if (!SACK_ENABLED(tp) \|\| IN_FASTRECOVERY(tp) \|\|
860	TAILQ_EMPTY(&tp->snd_holes) \|\|
861	(tp->t_flagsext & TF_PKTS_REORDERED)) {
862	return FALSE;
863	}
864
865	unacked_bytes = tp->snd_max - tp->snd_una;
866
867	TAILQ_FOREACH(sndhole, &tp->snd_holes, scblink) {
868	sndhole_bytes += (sndhole->end - sndhole->start);
869	}
870
871	VERIFY(unacked_bytes >= sndhole_bytes);
872	return (unacked_bytes - sndhole_bytes) >
873	((tcprexmtthresh - `1`) * tp->t_maxseg);
874	}
875
876	/*
877	* Process any DSACK options that might be present on an input packet
878	*/
879
880	boolean_t
881	tcp_sack_process_dsack(struct tcpcb tp, struct* tcpopt *to,
882	struct tcphdr *th)
883	{
884	struct sackblk first_sack, second_sack;
885
886	bcopy(src: to->to_sacks, dst: &first_sack, n: sizeof(first_sack));
887	first_sack.start = ntohl(first_sack.start);
888	first_sack.end = ntohl(first_sack.end);
889
890	if (to->to_nsacks > `1`) {
891	bcopy(src: (to->to_sacks + TCPOLEN_SACK), dst: &second_sack,
892	n: sizeof(second_sack));
893	second_sack.start = ntohl(second_sack.start);
894	second_sack.end = ntohl(second_sack.end);
895	}
896
897	if (SEQ_LT(first_sack.start, th->th_ack) &&
898	SEQ_LEQ(first_sack.end, th->th_ack)) {
899	/*
900	* There is a dsack option reporting a duplicate segment
901	* also covered by cumulative acknowledgement.
902	*
903	* Validate the sequence numbers before looking at dsack
904	* option. The duplicate notification can come after
905	* snd_una moves forward. In order to set a window of valid
906	* sequence numbers to look for, we set a maximum send
907	* window within which the DSACK option will be processed.
908	*/
909	if (!(TCP_DSACK_SEQ_IN_WINDOW(tp, first_sack.start, th->th_ack) &&
910	TCP_DSACK_SEQ_IN_WINDOW(tp, first_sack.end, th->th_ack))) {
911	to->to_nsacks--;
912	to->to_sacks += TCPOLEN_SACK;
913	tcpstat.tcps_dsack_recvd_old++;
914
915	/*
916	* returning true here so that the ack will not be
917	* treated as duplicate ack.
918	*/
919	return TRUE;
920	}
921	} else if (to->to_nsacks > `1` &&
922	SEQ_LEQ(second_sack.start, first_sack.start) &&
923	SEQ_GEQ(second_sack.end, first_sack.end)) {
924	/*
925	* there is a dsack option in the first block not
926	* covered by the cumulative acknowledgement but covered
927	* by the second sack block.
928	*
929	* verify the sequence numbes on the second sack block
930	* before processing the DSACK option. Returning false
931	* here will treat the ack as a duplicate ack.
932	*/
933	if (!TCP_VALIDATE_SACK_SEQ_NUMBERS(tp, &second_sack,
934	th->th_ack)) {
935	to->to_nsacks--;
936	to->to_sacks += TCPOLEN_SACK;
937	tcpstat.tcps_dsack_recvd_old++;
938	return TRUE;
939	}
940	} else {
941	/ no dsack options, proceed with processing the sack /
942	return FALSE;
943	}
944
945	/ Update the tcpopt pointer to exclude dsack block /
946	to->to_nsacks--;
947	to->to_sacks += TCPOLEN_SACK;
948	tcpstat.tcps_dsack_recvd++;
949	tp->t_dsack_recvd++;
950
951	/ Update the sender's retransmit segment state /
952	if (((tp->t_rxtshift == `1` && first_sack.start == tp->snd_una) \|\|
953	((tp->t_flagsext & TF_SENT_TLPROBE) &&
954	first_sack.end == tp->t_tlphighrxt)) &&
955	TAILQ_EMPTY(&tp->snd_holes) &&
956	SEQ_GT(th->th_ack, tp->snd_una)) {
957	/*
958	* If the dsack is for a retransmitted packet and one of
959	* the two cases is true, it indicates ack loss:
960	* - retransmit timeout and first_sack.start == snd_una
961	* - TLP probe and first_sack.end == tlphighrxt
962	*
963	* Ignore dsack and do not update state when there is
964	* ack loss
965	*/
966	tcpstat.tcps_dsack_ackloss++;
967
968	return TRUE;
969	} else {
970	tcp_rxtseg_set_spurious(tp, start: first_sack.start, end: (first_sack.end - `1`));
971	}
972	return TRUE;
973	}
974

Browse the source code of xnu/bsd/netinet/tcp_sack.c