patch-2.1.92 linux/net/ipv4/tcp_input.c


diff -u --recursive --new-file v2.1.91/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -5,7 +5,7 @@
  *
  *		Implementation of the Transmission Control Protocol(TCP).
  *
- * Version:	$Id: tcp_input.c,v 1.98 1998/03/23 22:54:48 davem Exp $
+ * Version:	$Id: tcp_input.c,v 1.103 1998/03/30 08:41:12 davem Exp $
  *
  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -50,15 +50,6 @@
 #include <net/tcp.h>
 #include <linux/ipsec.h>
 
-typedef void			(*tcp_sys_cong_ctl_t)(struct sock *sk,
-						      u32 seq, u32 ack,
-						      u32 seq_rtt);
-
-static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack,
-				u32 seq_rtt);
-static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
-				 u32 seq_rtt);
-
 #ifdef CONFIG_SYSCTL
 #define SYNC_INIT 0 /* let the user enable it */
 #else
@@ -80,8 +71,6 @@
 int sysctl_tcp_stdurg;
 int sysctl_tcp_rfc1337;
 
-static tcp_sys_cong_ctl_t tcp_sys_cong_ctl_f = &tcp_cong_avoid_vanj;
-
 /* There is something which you must keep in mind when you analyze the
  * behavior of the tp->ato delayed ack timeout interval.  When a
  * connection starts up, we want to ack as quickly as possible.  The
@@ -164,7 +153,7 @@
 static __inline__ void tcp_set_rto(struct tcp_opt *tp)
 {
 	tp->rto = (tp->srtt >> 3) + tp->mdev;
-	tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
+	tp->rto += (tp->rto >> 2) + (tp->rto >> ((tp->snd_cwnd>>TCP_CWND_SHIFT)-1));
 }
  
 
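The hunk above reflects the central change of this patch: tp->snd_cwnd is now kept in fixed point, with TCP_CWND_SHIFT fractional bits, so whole segments are read back as tp->snd_cwnd >> TCP_CWND_SHIFT. A minimal userspace sketch of the representation, assuming TCP_CWND_SHIFT = 4 (the real constant is defined in the 2.1.92 headers, not in this file):

/* Fixed-point congestion window, as used throughout this patch.
 * TCP_CWND_SHIFT = 4 is an assumed placeholder value.
 */
#include <stdio.h>

#define TCP_CWND_SHIFT 4

int main(void)
{
	unsigned int snd_cwnd = 10 << TCP_CWND_SHIFT;	/* 10 segments */

	/* Whole segments are recovered by shifting the fraction away,
	 * exactly as tcp_set_rto() does above.
	 */
	printf("cwnd = %u segments\n", snd_cwnd >> TCP_CWND_SHIFT);

	snd_cwnd += 1 << TCP_CWND_SHIFT;	/* add one whole segment */
	snd_cwnd += 1;				/* add 1/16 of a segment */
	printf("cwnd = %u segments\n", snd_cwnd >> TCP_CWND_SHIFT);
	return 0;
}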
@@ -176,7 +165,7 @@
  * way to avoid the problem. Is it possible to drop the lower
  * bound and still avoid trouble with BSD stacks? Perhaps
  * some modification to the RTO calculation that takes delayed
- * ack bais into account? This needs serious thought. -- erics
+ * ack bias into account? This needs serious thought. -- erics
  */
 static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
 {
@@ -193,19 +182,27 @@
 	 * test is last_ack_sent <= end_seq.
 	 * (RFC1323 stated last_ack_sent < end_seq.)
 	 */
-	if (!before(end_seq,tp->last_ack_sent)) {
-		tp->ts_recent = tp->rcv_tsval;
-		tp->ts_recent_stamp = jiffies;
+	if (!before(end_seq, tp->last_ack_sent)) {
+		/* PAWS bug workaround wrt. ACK frames: the extra PAWS
+		 * discard check below makes sure this can only happen
+		 * for pure ACK frames.  -DaveM
+		 */
+		if((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
+			tp->ts_recent = tp->rcv_tsval;
+			tp->ts_recent_stamp = jiffies;
+		}
 	}
 }
 
 #define PAWS_24DAYS	(HZ * 60 * 60 * 24 * 24)
 
-extern __inline__ int tcp_paws_discard(struct tcp_opt *tp)
+extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, __u16 len)
 {
 	/* ts_recent must be younger than 24 days */
 	return (((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
-		((s32)(tp->rcv_tsval-tp->ts_recent) < 0));
+		(((s32)(tp->rcv_tsval-tp->ts_recent) < 0) &&
+		 /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
+		 (len != (th->doff * 4))));
 }
 
 
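These two hunks form a PAWS workaround: ts_recent only ever moves forward, and pure ACKs (segments whose length equals the header length, len == th->doff*4) are exempted from the timestamp-based discard, since delayed ACKs can legitimately carry older timestamps. A hedged userspace sketch of the resulting test, with a cut-down stand-in for struct tcp_opt and an assumed HZ:

/* Sketch of the patched PAWS discard test.  The struct below is a
 * cut-down stand-in for the kernel's struct tcp_opt; HZ is assumed.
 */
#include <stdio.h>

typedef int s32;
typedef unsigned int u32;

#define HZ		100		/* assumed, for illustration */
#define PAWS_24DAYS	(HZ * 60 * 60 * 24 * 24)

struct tp_sketch { u32 rcv_tsval, ts_recent; unsigned long ts_recent_stamp; };

static int paws_discard(struct tp_sketch *tp, unsigned long jiffies,
			int len, int header_len)
{
	/* Discard if ts_recent is stale (older than 24 days), or the
	 * arriving timestamp went backwards -- except on a pure ACK,
	 * which the workaround always lets through.
	 */
	return ((jiffies - tp->ts_recent_stamp) >= PAWS_24DAYS) ||
	       (((s32)(tp->rcv_tsval - tp->ts_recent) < 0) &&
		(len != header_len));
}

int main(void)
{
	struct tp_sketch tp = { 5, 10, 0 };	/* tsval older than ts_recent */

	printf("data segment discarded: %d\n", paws_discard(&tp, 1, 60, 40));
	printf("pure ACK discarded:     %d\n", paws_discard(&tp, 1, 40, 40));
	return 0;
}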
@@ -266,15 +263,34 @@
 		struct sk_buff *skb = skb_peek(&sk->write_queue);
 		__u32 start_seq = ntohl(sp->start_seq);
 		__u32 end_seq = ntohl(sp->end_seq);
+		int fack_count = 0;
 
 		while((skb != NULL) &&
 		      (skb != tp->send_head) &&
 		      (skb != (struct sk_buff *)&sk->write_queue)) {
+			/* The retransmission queue is always in order, so
+			 * we can short-circuit the walk early.
+			 */
+			if(!before(start_seq, TCP_SKB_CB(skb)->end_seq))
+				break;
+
 			/* We play conservative, we don't allow SACKS to partially
 			 * tag a sequence space.
 			 */
-			if(!after(start_seq, skb->seq) && !before(end_seq, skb->end_seq))
+			fack_count++;
+			if(!after(start_seq, TCP_SKB_CB(skb)->seq) &&
+			   !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
+				/* If this was a retransmitted frame, account for it. */
+				if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+					tp->retrans_out--;
 				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+
+				/* RULE: All new SACKs will either decrease retrans_out
+				 *       or advance fackets_out.
+				 */
+				if(fack_count > tp->fackets_out)
+					tp->fackets_out = fack_count;
+			}
 			skb = skb->next;
 		}
 		sp++; /* Move on to the next SACK block. */
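This walk is where the patch's new FACK ("forward acknowledgment") state is maintained: fack_count measures how deep into the retransmission queue a SACK block reaches, tp->fackets_out records the forward-most extent seen so far, and tp->retrans_out counts retransmitted frames not yet accounted for. A simplified sketch of the bookkeeping over a plain array (sequence wraparound and the kernel's before()/after() helpers are elided):

/* Simplified sketch of the FACK bookkeeping in the SACK walk above,
 * over a plain array standing in for the retransmission queue.
 */
#include <stdio.h>

struct seg { unsigned int seq, end_seq; int retransmitted; };

int main(void)
{
	struct seg q[] = {			/* queue is in sequence order */
		{ 100, 200, 0 },
		{ 200, 300, 1 },		/* this frame was retransmitted */
		{ 300, 400, 0 },
	};
	unsigned int start_seq = 200, end_seq = 400;	/* one SACK block */
	int fackets_out = 0, retrans_out = 1, fack_count = 0;

	for (unsigned int i = 0; i < sizeof(q) / sizeof(q[0]); i++) {
		fack_count++;
		/* Tag only whole segments: the SACK block must cover the
		 * frame's entire [seq, end_seq) range.
		 */
		if (start_seq <= q[i].seq && end_seq >= q[i].end_seq) {
			if (q[i].retransmitted)
				retrans_out--;	/* frame is accounted for */
			/* Every new SACK either decreases retrans_out or
			 * advances fackets_out (the forward-most extent).
			 */
			if (fack_count > fackets_out)
				fackets_out = fack_count;
		}
	}
	printf("fackets_out=%d retrans_out=%d\n", fackets_out, retrans_out);
	return 0;
}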
@@ -388,19 +404,43 @@
 	return 1;
 }
 
+#if 0 /* Not working yet... -DaveM */
+static void tcp_compute_tsack(struct sock *sk, struct tcp_opt *tp)
+{
+	struct sk_buff *skb = skb_peek(&sk->write_queue);
+	__u32 tstamp = tp->rcv_tsecr;
+	int fack_count = 0;
+
+	while((skb != NULL) &&
+	      (skb != tp->send_head) &&
+	      (skb != (struct sk_buff *)&sk->write_queue)) {
+		if(TCP_SKB_CB(skb)->when == tstamp) {
+			__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+			sacked |= TCPCB_SACKED_ACKED;
+			if(sacked & TCPCB_SACKED_RETRANS)
+				tp->retrans_out--;
+			TCP_SKB_CB(skb)->sacked = sacked;
+		}
+		if(!before(TCP_SKB_CB(skb)->when, tstamp))
+			fack_count++;
+		skb = skb->next;
+	}
+	if(fack_count > tp->fackets_out)
+		tp->fackets_out = fack_count;
+}
+#endif
+
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
 #define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
 
-static __inline__ void clear_fast_retransmit(struct sock *sk)
+static __inline__ void clear_fast_retransmit(struct tcp_opt *tp)
 {
-	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+	if (tp->dup_acks > 3)
+		tp->snd_cwnd = (tp->snd_ssthresh << TCP_CWND_SHIFT);
 
-	if (tp->dup_acks > 3) {
-		tp->retrans_head = NULL;
-		tp->snd_cwnd = max(tp->snd_ssthresh, 1);
-	}
 	tp->dup_acks = 0;
 }
 
@@ -409,10 +449,9 @@
  */
 static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
 {
-	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	/*
-	 * Note: If not_dup is set this implies we got a
+	/* Note: If not_dup is set this implies we got a
 	 * data carrying packet or a window update.
 	 * This carries no new information about possible
 	 * lost packets, so we have to ignore it for the purposes
@@ -422,22 +461,31 @@
 	 * the code below much more complex. For now if I see such
 	 * a packet I clear the fast retransmit phase.
 	 */
-
 	if (ack == tp->snd_una && tp->packets_out && (not_dup == 0)) {
 		/* This is the standard reno style fast retransmit branch. */
 
+#if 0	/* Not working yet... -DaveM */
+		/* If not doing SACK, but doing timestamps, compute timestamp
+		 * based pseudo-SACKs when we see duplicate ACKs.
+		 */
+		if(!tp->sack_ok && tp->saw_tstamp)
+			tcp_compute_tsack(sk, tp);
+#endif
                 /* 1. When the third duplicate ack is received, set ssthresh 
                  * to one half the current congestion window, but no less 
                  * than two segments. Retransmit the missing segment.
                  */
 		if (tp->high_seq == 0 || after(ack, tp->high_seq)) {
 			tp->dup_acks++;
-			if (tp->dup_acks == 3) {
+			if ((tp->fackets_out > 3) || (tp->dup_acks == 3)) {
 				tp->dup_acks++;
-                                tp->snd_ssthresh = max(tp->snd_cwnd >> 1, 2);
-                                tp->snd_cwnd = tp->snd_ssthresh + 3;
+                                tp->snd_ssthresh = max(tp->snd_cwnd >> (TCP_CWND_SHIFT + 1), 2);
+                                tp->snd_cwnd = (tp->snd_ssthresh + 3) << TCP_CWND_SHIFT;
 				tp->high_seq = tp->snd_nxt;
-                                tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+				if(!tp->fackets_out)
+					tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+				else
+					tcp_fack_retransmit(sk);
                                 tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 			}
 		}
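Note the FACK twist in the trigger above: retransmission fires either on the classic third duplicate ACK or as soon as tp->fackets_out shows more than three frames SACKed beyond a hole. The window adjustment itself is the usual halving, expressed in fixed point; a small worked example, again assuming TCP_CWND_SHIFT = 4:

/* Worked example of the fast-retransmit window adjustment above,
 * in fixed point (TCP_CWND_SHIFT = 4 is an assumed value).
 */
#include <stdio.h>

#define TCP_CWND_SHIFT 4

static unsigned int max_u(unsigned int a, unsigned int b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned int snd_cwnd = 8 << TCP_CWND_SHIFT;	/* 8 segments */
	unsigned int snd_ssthresh;

	/* Shifting by TCP_CWND_SHIFT + 1 halves the window in whole
	 * segments; never less than two segments.
	 */
	snd_ssthresh = max_u(snd_cwnd >> (TCP_CWND_SHIFT + 1), 2);

	/* Then cwnd = ssthresh + 3 segments, back in fixed point. */
	snd_cwnd = (snd_ssthresh + 3) << TCP_CWND_SHIFT;

	printf("ssthresh = %u segments, cwnd = %u segments\n",
	       snd_ssthresh, snd_cwnd >> TCP_CWND_SHIFT);	/* 4 and 7 */
	return 0;
}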
@@ -446,10 +494,22 @@
                  * cwnd by the segment size. [...] Transmit a packet...
                  *
                  * Packet transmission will be done on normal flow processing
-                 * since we're not in "retransmit mode"
+                 * since we're not in "retransmit mode".  We do not use duplicate
+		 * ACKs to artificially inflate the congestion window when
+		 * doing FACK.
                  */
-                if (tp->dup_acks > 3)
-                        tp->snd_cwnd++;
+                if (tp->dup_acks > 3) {
+			if(!tp->fackets_out) {
+				tp->snd_cwnd += (1 << TCP_CWND_SHIFT);
+			} else {
+				/* Fill any further holes which may have appeared.
+				 * We may want to change this to run every further
+				 * multiple-of-3 dup ack increments, to be more robust
+				 * against out-of-order packet delivery.  -DaveM
+				 */
+				tcp_fack_retransmit(sk);
+			}
+		}
 	} else if (tp->high_seq != 0) {
 		/* In this branch we deal with clearing the Floyd style
 		 * block on duplicate fast retransmits, and if requested
@@ -463,15 +523,17 @@
 			 * Note that we do want to accept a window
 			 * update since this is expected with Hoe's algorithm.
 			 */
-			clear_fast_retransmit(sk);
+			clear_fast_retransmit(tp);
 
 			/* After we have cleared up to high_seq we can
 			 * clear the Floyd style block.
 			 */
-			if (after(ack, tp->high_seq))
+			if (!before(ack, tp->high_seq)) {
 				tp->high_seq = 0;
+				tp->fackets_out = 0;
+			}
 		} else if (tp->dup_acks >= 3) {
-			if (sysctl_tcp_hoe_retransmits) {
+			if (!tp->fackets_out) {
 				/* Hoe Style. We didn't ack the whole
 				 * window. Take this as a cue that
 				 * another packet was lost and retransmit it.
@@ -490,131 +552,34 @@
                                 	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
 				}
 			} else {
-				/* Reno style. We didn't ack the whole
-				 * window, now we have to drop out of
-				 * fast retransmit and wait for a timeout.
+				/* FACK style, fill any remaining holes in
+				 * receiver's queue.
 				 */
-				clear_fast_retransmit(sk);
+				tcp_fack_retransmit(sk);
 			}
 		}
 	}
 }
 
-/*
- *      TCP slow start and congestion avoidance in two flavors:
- *      RFC 1122 and TCP Vegas.
+/* This is Jacobson's slow start and congestion avoidance. 
+ * SIGCOMM '88, p. 328.
  *
- *      This is a /proc/sys configurable option. 
+ * FIXME: What happens when the congestion window gets larger
+ * than the maximum receiver window by some large factor?
+ * Suppose the pipeline never loses packets for a long
+ * period of time, then traffic increases causing packet loss.
+ * The congestion window should be reduced, but what it should
+ * be reduced to is not clear, since 1/2 the old window may
+ * still be larger than the maximum sending rate we ever achieved.
  */
-
-#define SHIFT_FACTOR 16
-
-static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
-				 u32 seq_rtt)
+static void tcp_cong_avoid(struct tcp_opt *tp, u32 seq, u32 ack, u32 seq_rtt)
 {
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	unsigned int actual, expected;
-	unsigned int inv_rtt, inv_basertt, inv_basebd;
-	u32 snt_bytes;
-
-	/*	From:
-	 *      TCP Vegas: New Techniques for Congestion 
-	 *	Detection and Avoidance.
-	 *
-	 *	Warning: This code is a scratch implementation taken
- *	from the paper only. The code they distribute seems
-	 *	to have improved several things over the initial spec.
-	 */
-
-	if (!seq_rtt)
-		seq_rtt = 1;
-
-	if (tp->basertt)
-		tp->basertt = min(seq_rtt, tp->basertt);
-	else
-		tp->basertt = seq_rtt;
-
-	/*	actual	 = throughput for this segment.
-	 *	expected = number_of_bytes in transit / BaseRTT
-	 */
-
-	snt_bytes = ack - seq;
-
-	inv_rtt = (1 << SHIFT_FACTOR) / seq_rtt;
-	inv_basertt = (1 << SHIFT_FACTOR) / tp->basertt;
-
-	actual =  snt_bytes * inv_rtt;
-
-	expected = (tp->snd_nxt - tp->snd_una) * inv_basertt;
-
-	inv_basebd = sk->mss * inv_basertt;
-
-	/* Slow Start */
-	if (tp->snd_cwnd < tp->snd_ssthresh &&
-	    (seq == tp->snd_nxt ||
-	     (expected - actual <= TCP_VEGAS_GAMMA * inv_basebd))) {
-		/* "Vegas allows exponential growth only every other RTT" */
-		if (tp->snd_cwnd_cnt++) {
-			tp->snd_cwnd++;
-			tp->snd_cwnd_cnt = 0;
-		}
-	} else {
-		/* Congestion Avoidance */
-		if (expected - actual <= TCP_VEGAS_ALPHA * inv_basebd) {
-			/* Increase Linearly */
-			if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) {
-				tp->snd_cwnd++;
-				tp->snd_cwnd_cnt = 0;
-			}
-		}
-
-		if (expected - actual >= TCP_VEGAS_BETA * inv_basebd) {
-			/* Decrease Linearly */
-			if (tp->snd_cwnd_cnt++ >= tp->snd_cwnd) {
-				tp->snd_cwnd--;
-				tp->snd_cwnd_cnt = 0;
-			}
-
-			/* Never less than 2 segments. */
-			if (tp->snd_cwnd < 2)
-				tp->snd_cwnd = 2;
-		}
-	}
-}
-
-static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt)
-{
-	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
-	
-        /* This is Jacobson's slow start and congestion avoidance. 
-         * SIGCOMM '88, p. 328.  Because we keep cong_window in 
-         * integral mss's, we can't do cwnd += 1 / cwnd.  
-         * Instead, maintain a counter and increment it once every 
-         * cwnd times.  
-	 * FIXME: Check to be sure the mathematics works out right
-	 * on this trick when we have to reduce the congestion window.
-	 * The snd_cwnd_cnt has to be reset properly when reduction events
-	 * happen.
-	 * FIXME: What happens when the congestion window gets larger
-	 * than the maximum receiver window by some large factor
- *	Suppose the pipeline never loses packets for a long
-	 * period of time, then traffic increases causing packet loss.
-	 * The congestion window should be reduced, but what it should
-	 * be reduced to is not clear, since 1/2 the old window may
-	 * still be larger than the maximum sending rate we ever achieved.
-         */
-        if (tp->snd_cwnd <= tp->snd_ssthresh) {
+        if ((tp->snd_cwnd>>TCP_CWND_SHIFT) <= tp->snd_ssthresh) {
                 /* In "safe" area, increase. */
-                tp->snd_cwnd++;
+                tp->snd_cwnd += (1 << TCP_CWND_SHIFT);
 	} else {
-                /* In dangerous area, increase slowly.  In theory this is
-                 * tp->snd_cwnd += 1 / tp->snd_cwnd
-                 */
-                if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
-                        tp->snd_cwnd++;
-                        tp->snd_cwnd_cnt = 0;
-                } else 
-                        tp->snd_cwnd_cnt++;
+                /* In dangerous area, increase slowly. */
+		tp->snd_cwnd += 1;
         }       
 }
 
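With the Vegas code and its sysctl gone, tcp_cong_avoid() above is the only remaining policy: one whole segment per ACK in slow start, one fixed-point unit per ACK beyond ssthresh. The fractional step only approximates the cwnd += 1/cwnd behaviour of the counter it replaces (it adds cwnd/2^TCP_CWND_SHIFT segments per round trip). A runnable sketch, with TCP_CWND_SHIFT = 4 assumed as before:

/* Sketch of the patched increase policy, in userspace.
 * TCP_CWND_SHIFT = 4 is an assumed placeholder value.
 */
#include <stdio.h>

#define TCP_CWND_SHIFT 4

static void cong_avoid(unsigned int *snd_cwnd, unsigned int ssthresh)
{
	if ((*snd_cwnd >> TCP_CWND_SHIFT) <= ssthresh)
		*snd_cwnd += 1 << TCP_CWND_SHIFT;	/* slow start: +1 segment/ACK */
	else
		*snd_cwnd += 1;		/* avoidance: +1/16 segment/ACK */
}

int main(void)
{
	unsigned int cwnd = 2 << TCP_CWND_SHIFT, ssthresh = 4;

	for (int ack = 0; ack < 40; ack++)
		cong_avoid(&cwnd, ssthresh);
	printf("cwnd after 40 acks: %u segments\n", cwnd >> TCP_CWND_SHIFT);
	return 0;
}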
@@ -632,7 +597,7 @@
 		 * discard it as it's confirmed to have arrived at
 		 * the other end.
 		 */
-		if (after(skb->end_seq, ack))
+		if (after(TCP_SKB_CB(skb)->end_seq, ack))
 			break;
 
 		/* Initial outgoing SYN's get put onto the write_queue
@@ -643,16 +608,30 @@
 		 * quickly.  This is severely frowned upon behavior.
 		 */
 		if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) {
+			__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
 			acked |= FLAG_DATA_ACKED;
-			if(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+			if(sacked & TCPCB_SACKED_RETRANS) {
 				acked |= FLAG_RETRANS_DATA_ACKED;
+
+				/* XXX The race is, fast retrans frame -->
+				 * XXX retrans timeout sends older frame -->
+				 * XXX ACK arrives for fast retrans frame -->
+				 * XXX retrans_out goes negative --> splat.
+				 * XXX Please help me find a better way -DaveM
+				 */
+				if(tp->retrans_out)
+					tp->retrans_out--;
+			}
+			if(tp->fackets_out)
+				tp->fackets_out--;
 		} else {
 			tp->retrans_head = NULL;
 		}		
 		tp->packets_out--;
-		*seq = skb->seq;
-		*seq_rtt = now - skb->when;
-		skb_unlink(skb);
+		*seq = TCP_SKB_CB(skb)->seq;
+		*seq_rtt = now - TCP_SKB_CB(skb)->when;
+		__skb_unlink(skb, skb->list);
 		kfree_skb(skb);
 	}
 
@@ -672,7 +651,7 @@
 	
 	/* should always be non-null */
 	if (tp->send_head != NULL &&
-	    !before (ack + tp->snd_wnd, tp->send_head->end_seq)) {
+	    !before (ack + tp->snd_wnd, TCP_SKB_CB(tp->send_head)->end_seq)) {
 		tp->backoff = 0;
 		tp->pending = 0;
 		tcp_clear_xmit_timer(sk, TIME_PROBE0);
@@ -693,6 +672,8 @@
 	if (tp->retransmits) {
 		if (tp->packets_out == 0) {
 			tp->retransmits = 0;
+			tp->fackets_out = 0;
+			tp->retrans_out = 0;
 			tp->backoff = 0;
 			tcp_set_rto(tp);
 		} else {
@@ -703,7 +684,7 @@
 	} else {
 		tcp_set_rto(tp);
 		if (flag & FLAG_DATA_ACKED)
-			(*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+			tcp_cong_avoid(tp, seq, ack, seq_rtt);
 	}
 	/* NOTE: safe here so long as cong_ctl doesn't use rto */
 	tcp_bound_rto(tp);
@@ -712,7 +693,7 @@
 static void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
 {
 	struct sk_buff *skb = skb_peek(&sk->write_queue);
-	long when = tp->rto - (jiffies - skb->when);
+	long when = tp->rto - (jiffies - TCP_SKB_CB(skb)->when);
 
 	/* Some data was ACK'd, if still retransmitting (due to a
 	 * timeout), resend more of the retransmit queue.  The
@@ -801,8 +782,11 @@
 	} else {
 		/* If we were retransmitting, don't count the rtt estimate. */
 		if (tp->retransmits) {
-			if (tp->packets_out == 0)
+			if (tp->packets_out == 0) {
 				tp->retransmits = 0;
+				tp->fackets_out = 0;
+				tp->retrans_out = 0;
+			}
 		} else {
 			/* We don't have a timestamp. Can only use
 			 * packets that are not retransmitted to determine
@@ -812,13 +796,14 @@
 			 * where the network delay has increased suddenly.
 			 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 			 */
-			if ((flag & FLAG_DATA_ACKED) &&
-			    !(flag & FLAG_RETRANS_DATA_ACKED)) {
-				tp->backoff = 0;
-				tcp_rtt_estimator(tp, seq_rtt);
-				tcp_set_rto(tp);
-				tcp_bound_rto(tp);
-				(*tcp_sys_cong_ctl_f)(sk, seq, ack, seq_rtt);
+			if (flag & FLAG_DATA_ACKED) {
+				if(!(flag & FLAG_RETRANS_DATA_ACKED)) {
+					tp->backoff = 0;
+					tcp_rtt_estimator(tp, seq_rtt);
+					tcp_set_rto(tp);
+					tcp_bound_rto(tp);
+				}
+				tcp_cong_avoid(tp, seq, ack, seq_rtt);
 			}
 		}
 	}
@@ -898,7 +883,7 @@
 	 *	(2)  returns to TIME-WAIT state if the SYN turns out 
 	 *	to be an old duplicate".
 	 */
-	if(th->syn && !th->rst && after(skb->seq, tw->rcv_nxt)) {
+	if(th->syn && !th->rst && after(TCP_SKB_CB(skb)->seq, tw->rcv_nxt)) {
 		struct sock *sk;
 		struct tcp_func *af_specific = tw->af_specific;
 		__u32 isn;
@@ -1051,7 +1036,7 @@
  
 static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
 {
-	sk->tp_pinfo.af_tcp.fin_seq = skb->end_seq;
+	sk->tp_pinfo.af_tcp.fin_seq = TCP_SKB_CB(skb)->end_seq;
 
 	tcp_send_ack(sk);
 
@@ -1174,14 +1159,14 @@
 	 * "in order". ;-)  This also satisfies the requirements
 	 * of RFC2018 about ordering of SACKs.
 	 */
-	if(sp->end_seq == skb->seq) {
-		sp->end_seq = skb->end_seq;
+	if(sp->end_seq == TCP_SKB_CB(skb)->seq) {
+		sp->end_seq = TCP_SKB_CB(skb)->end_seq;
 		tcp_sack_maybe_coalesce(tp, sp);
-	} else if(sp->start_seq == skb->end_seq) {
+	} else if(sp->start_seq == TCP_SKB_CB(skb)->end_seq) {
 		/* Re-ordered arrival, in this case, can be optimized
 		 * as well.
 		 */
-		sp->start_seq = skb->seq;
+		sp->start_seq = TCP_SKB_CB(skb)->seq;
 		tcp_sack_maybe_coalesce(tp, sp);
 	} else {
 		int cur_sacks = tp->num_sacks;
@@ -1195,12 +1180,12 @@
 			int this_sack;
 
 			for(this_sack = 1; this_sack < cur_sacks; this_sack++, swap++) {
-				if((swap->end_seq == skb->seq) ||
-				   (swap->start_seq == skb->end_seq)) {
-					if(swap->end_seq == skb->seq)
-						swap->end_seq = skb->end_seq;
+				if((swap->end_seq == TCP_SKB_CB(skb)->seq) ||
+				   (swap->start_seq == TCP_SKB_CB(skb)->end_seq)) {
+					if(swap->end_seq == TCP_SKB_CB(skb)->seq)
+						swap->end_seq = TCP_SKB_CB(skb)->end_seq;
 					else
-						swap->start_seq = skb->seq;
+						swap->start_seq = TCP_SKB_CB(skb)->seq;
 					tcp_sack_swap(sp, swap);
 					tcp_sack_maybe_coalesce(tp, sp);
 					return;
@@ -1221,8 +1206,8 @@
 		}
 
 		/* Build head SACK, and we're done. */
-		sp->start_seq = skb->seq;
-		sp->end_seq = skb->end_seq;
+		sp->start_seq = TCP_SKB_CB(skb)->seq;
+		sp->end_seq = TCP_SKB_CB(skb)->end_seq;
 		if(tp->num_sacks < max_sacks)
 			tp->num_sacks++;
 	}
@@ -1234,9 +1219,13 @@
 	int num_sacks = tp->num_sacks;
 	int this_sack;
 
-	/* We know this removed SKB will eat from the front of a SACK. */
+	/* This is an in order data segment _or_ an out-of-order SKB being
+	 * moved to the receive queue, so we know this removed SKB will eat
+	 * from the front of a SACK.
+	 */
 	for(this_sack = 0; this_sack < num_sacks; this_sack++, sp++) {
-		if(sp->start_seq == skb->seq)
+		if(!after(sp->start_seq, TCP_SKB_CB(skb)->seq) &&
+		   before(sp->start_seq, TCP_SKB_CB(skb)->end_seq))
 			break;
 	}
 
@@ -1247,7 +1236,7 @@
 	if(this_sack >= num_sacks)
 		return;
 
-	sp->start_seq = skb->end_seq;
+	sp->start_seq = TCP_SKB_CB(skb)->end_seq;
 	if(!before(sp->start_seq, sp->end_seq)) {
 		/* Zap this SACK, by moving forward any other SACKS. */
 		for(this_sack += 1; this_sack < num_sacks; this_sack++, sp++) {
@@ -1266,12 +1255,12 @@
 	int this_sack;
 
 	for(this_sack = 0; this_sack < num_sacks; this_sack++, tp++) {
-		if(sp->end_seq == old_skb->end_seq)
+		if(sp->end_seq == TCP_SKB_CB(old_skb)->end_seq)
 			break;
 	}
 	if(this_sack >= num_sacks)
 		return;
-	sp->end_seq = new_skb->end_seq;
+	sp->end_seq = TCP_SKB_CB(new_skb)->end_seq;
 }
 
 /* This one checks to see if we can put data from the
@@ -1283,23 +1272,24 @@
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
 	while ((skb = skb_peek(&tp->out_of_order_queue))) {
-		if (after(skb->seq, tp->rcv_nxt))
+		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
 			break;
 
-		if (!after(skb->end_seq, tp->rcv_nxt)) {
+		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 			SOCK_DEBUG(sk, "ofo packet was already received \n");
-			skb_unlink(skb);
+			__skb_unlink(skb, skb->list);
 			kfree_skb(skb);
 			continue;
 		}
 		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
-			   tp->rcv_nxt, skb->seq, skb->end_seq);
+			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(skb)->end_seq);
 
 		if(tp->sack_ok)
 			tcp_sack_remove_skb(tp, skb);
-		skb_unlink(skb);
-		skb_queue_tail(&sk->receive_queue, skb);
-		tp->rcv_nxt = skb->end_seq;
+		__skb_unlink(skb, skb->list);
+		__skb_queue_tail(&sk->receive_queue, skb);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin)
 			tcp_fin(skb, sk, skb->h.th);
 	}
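Throughout these hunks, skb_unlink(), skb_queue_tail() and friends become their double-underscore counterparts. The underscored forms skip the synchronization the plain ones wrap around them, which is presumably safe here because the TCP input path is already serialized. A userspace analogue of the convention (the names and the mutex are illustrative, not kernel API; build with -pthread):

/* Userspace analogue of the plain vs. __ queue-op split these hunks
 * adopt: the plain form synchronizes, the __ form assumes the caller
 * already did.
 */
#include <pthread.h>
#include <stdio.h>

struct queue {
	pthread_mutex_t lock;
	int items[16];
	int count;
};

/* __ variant: no locking, caller must already hold queue->lock. */
static void __queue_tail(struct queue *q, int v)
{
	q->items[q->count++] = v;
}

/* Plain variant: pays for the protection itself. */
static void queue_tail(struct queue *q, int v)
{
	pthread_mutex_lock(&q->lock);
	__queue_tail(q, v);
	pthread_mutex_unlock(&q->lock);
}

int main(void)
{
	struct queue q = { PTHREAD_MUTEX_INITIALIZER, {0}, 0 };

	queue_tail(&q, 1);		/* unserialized caller */

	pthread_mutex_lock(&q.lock);	/* already-serialized caller ... */
	__queue_tail(&q, 2);		/* ... may use the raw form */
	pthread_mutex_unlock(&q.lock);

	printf("count=%d\n", q.count);
	return 0;
}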
@@ -1314,12 +1304,12 @@
 	 *  Packets in sequence go to the receive queue.
 	 *  Out of sequence packets to out_of_order_queue.
 	 */
-	if (skb->seq == tp->rcv_nxt) {
+	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
 		/* Ok. In sequence. */
 	queue_and_out:
 		dst_confirm(sk->dst_cache);
-		skb_queue_tail(&sk->receive_queue, skb);
-		tp->rcv_nxt = skb->end_seq;
+		__skb_queue_tail(&sk->receive_queue, skb);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 		if(skb->h.th->fin) {
 			tcp_fin(skb, sk, skb->h.th);
 		} else {
@@ -1341,18 +1331,19 @@
 	}
 	
 	/* An old packet, either a retransmit or some packet got lost. */
-	if (!after(skb->end_seq, tp->rcv_nxt)) {
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
 		/* A retransmit, 2nd most common case.  Force an immediate ack. */
-		SOCK_DEBUG(sk, "retransmit received: seq %X\n", skb->seq);
+		SOCK_DEBUG(sk, "retransmit received: seq %X\n", TCP_SKB_CB(skb)->seq);
 		tcp_enter_quickack_mode(tp);
 		kfree_skb(skb);
 		return;
 	}
 
-	if (before(skb->seq, tp->rcv_nxt)) {
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 		/* Partial packet, seq < rcv_next < end_seq */
 		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
-			   tp->rcv_nxt, skb->seq, skb->end_seq);
+			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(skb)->end_seq);
 
 		goto queue_and_out;
 	}
@@ -1365,25 +1356,25 @@
 	tp->pred_flags = 0;
 
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
-		   tp->rcv_nxt, skb->seq, skb->end_seq);
+		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 
 	if (skb_peek(&tp->out_of_order_queue) == NULL) {
 		/* Initial out of order segment, build 1 SACK. */
 		if(tp->sack_ok) {
 			tp->num_sacks = 1;
-			tp->selective_acks[0].start_seq = skb->seq;
-			tp->selective_acks[0].end_seq = skb->end_seq;
+			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+			tp->selective_acks[0].end_seq = TCP_SKB_CB(skb)->end_seq;
 		}
-		skb_queue_head(&tp->out_of_order_queue,skb);
+		__skb_queue_head(&tp->out_of_order_queue,skb);
 	} else {
 		for(skb1=tp->out_of_order_queue.prev; ; skb1 = skb1->prev) {
 			/* Already there. */
-			if (skb->seq == skb1->seq) {
+			if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb1)->seq) {
 				if (skb->len >= skb1->len) {
 					if(tp->sack_ok)
 						tcp_sack_extend(tp, skb1, skb);
-					skb_append(skb1, skb);
-					skb_unlink(skb1);
+					__skb_append(skb1, skb);
+					__skb_unlink(skb1, skb1->list);
 					kfree_skb(skb1);
 				} else {
 					/* A duplicate, smaller than what is in the
@@ -1394,8 +1385,8 @@
 				break;
 			}
 			
-			if (after(skb->seq, skb1->seq)) {
-				skb_append(skb1,skb);
+			if (after(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) {
+				__skb_append(skb1, skb);
 				if(tp->sack_ok)
 					tcp_sack_new_ofo_skb(sk, skb);
 				break;
@@ -1403,7 +1394,7 @@
 
                         /* See if we've hit the start. If so insert. */
 			if (skb1 == skb_peek(&tp->out_of_order_queue)) {
-				skb_queue_head(&tp->out_of_order_queue,skb);
+				__skb_queue_head(&tp->out_of_order_queue,skb);
 				if(tp->sack_ok)
 					tcp_sack_new_ofo_skb(sk, skb);
 				break;
@@ -1455,8 +1446,8 @@
 	struct sk_buff *skb;
 
 	if ((skb = tp->send_head)) {
-		if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
-		    tp->packets_out < tp->snd_cwnd ) {
+		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) &&
+		    tcp_packets_in_flight(tp) < (tp->snd_cwnd >> TCP_CWND_SHIFT)) {
 			/* Put more data onto the wire. */
 			tcp_write_xmit(sk);
 		} else if (tp->packets_out == 0 && !tp->pending) {
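The send test now compares tcp_packets_in_flight(tp) rather than raw tp->packets_out against the whole-segment window. That helper is introduced outside this file (presumably in include/net/tcp.h); a sketch consistent with the counters maintained here -- packets sent, minus frames FACK says have left the network, plus retransmissions still outstanding -- might look like:

/* Hedged sketch of tcp_packets_in_flight(); the stand-in struct only
 * carries the three counters this file maintains.
 */
#include <stdio.h>

struct tp_counts { unsigned int packets_out, fackets_out, retrans_out; };

static unsigned int packets_in_flight(const struct tp_counts *tp)
{
	return tp->packets_out - tp->fackets_out + tp->retrans_out;
}

int main(void)
{
	struct tp_counts tp = { 10, 4, 1 };	/* 10 sent, 4 FACKed, 1 retrans */

	printf("in flight: %u\n", packets_in_flight(&tp));	/* 7 */
	return 0;
}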
@@ -1601,7 +1592,7 @@
 	/* Start with the end because there are probably the least
 	 * useful packets (crossing fingers).
 	 */
-	while ((skb = skb_dequeue_tail(&tp->out_of_order_queue))) { 
+	while ((skb = __skb_dequeue_tail(&tp->out_of_order_queue))) { 
 		kfree_skb(skb);
 		if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf)
 			return;
@@ -1616,15 +1607,16 @@
 			break;
 
 		/* Never remove packets that have been already acked */
-		if (before(skb->end_seq, tp->last_ack_sent+1)) {
+		if (before(TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent+1)) {
 			printk(KERN_DEBUG "prune_queue: hit acked data c=%x,%x,%x\n",
-				tp->copied_seq, skb->end_seq, tp->last_ack_sent);
+				tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->last_ack_sent);
 			break; 
 		}
-		skb_unlink(skb);
-		tp->rcv_nxt = skb->seq;
+		__skb_unlink(skb, skb->list);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->seq;
 		SOCK_DEBUG(sk, "prune_queue: removing %x-%x (c=%x)\n",
-			   skb->seq, skb->end_seq, tp->copied_seq); 
+			   TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+			   tp->copied_seq); 
 		kfree_skb(skb);
 		if (atomic_read(&sk->rmem_alloc) <= sk->rcvbuf) 
 			break;
@@ -1658,13 +1650,13 @@
 	 */
 	if (tcp_fast_parse_options(sk, th, tp)) {
 		if (tp->saw_tstamp) {
-			if (tcp_paws_discard(tp)) {
+			if (tcp_paws_discard(tp, th, len)) {
 				if (!th->rst) {
 					tcp_send_ack(sk);
 					goto discard;
 				}
 			}
-			tcp_replace_ts_recent(tp,skb->end_seq);
+			tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->end_seq);
 		}
 	}
 
@@ -1678,11 +1670,12 @@
 	 *	 space for instance)
 	 */
 
-	if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt) {
+	if (flg == tp->pred_flags && TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
 		if (len <= th->doff*4) {
 			/* Bulk data transfer: sender */
 			if (len == th->doff*4) {
-				tcp_ack(sk, th, skb->seq, skb->ack_seq, len); 
+				tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
+					TCP_SKB_CB(skb)->ack_seq, len); 
 				kfree_skb(skb); 
 				tcp_data_snd_check(sk);
 				return 0;
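This fast path is Van Jacobson's header prediction: tp->pred_flags precomputes the expected fourth 32-bit word of the TCP header (data offset, an ACK-only flag byte, the advertised window), so a single compare plus the rcv_nxt check identifies an in-order segment of an established bulk transfer. A hedged sketch of the idea (byte order and the kernel's exact masking are elided):

/* Sketch of header prediction: precompute the expected fourth 32-bit
 * word of the TCP header, then accept the fast path on one compare.
 * Constants follow the TCP header layout; treat this as illustration,
 * not the kernel's exact expression.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t build_pred_flags(unsigned int header_len, unsigned int snd_wnd)
{
	/* Data offset in 32-bit words: 4 bits at the top, then 4
	 * reserved bits, then the flag byte (ACK = 0x10), then the
	 * 16-bit advertised window.
	 */
	return ((header_len >> 2) << 28) | (0x10 << 16) | (snd_wnd & 0xffff);
}

int main(void)
{
	uint32_t pred = build_pred_flags(20, 32768);
	uint32_t arriving = build_pred_flags(20, 32768); /* well-behaved segment */

	/* One compare replaces individual checks of the flag bits,
	 * header length, and window change on the fast path.
	 */
	printf("fast path: %s\n", arriving == pred ? "yes" : "no");
	return 0;
}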
@@ -1690,7 +1683,7 @@
 				tcp_statistics.TcpInErrs++;
 				goto discard;
 			}
-		} else if (skb->ack_seq == tp->snd_una) {
+		} else if (TCP_SKB_CB(skb)->ack_seq == tp->snd_una) {
 			/* Bulk data transfer: receiver */
 			if (atomic_read(&sk->rmem_alloc) > sk->rcvbuf) 
 				goto discard;
@@ -1700,8 +1693,8 @@
 			/* DO NOT notify forward progress here.
 			 * It saves dozen of CPU instructions in fast path. --ANK
 			 */
-			skb_queue_tail(&sk->receive_queue, skb);
-			tp->rcv_nxt = skb->end_seq;
+			__skb_queue_tail(&sk->receive_queue, skb);
+			tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 
 			/* FIN bit check is not done since if FIN is set in
 			 * this frame, the pred_flags won't match up. -DaveM
@@ -1719,11 +1712,11 @@
 		}
 	}
 
-	if (!tcp_sequence(tp, skb->seq, skb->end_seq)) {
+	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
 		if (!th->rst) {
-			if (after(skb->seq, tp->rcv_nxt)) {
+			if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
 				SOCK_DEBUG(sk, "seq:%d end:%d wup:%d wnd:%d\n",
-					   skb->seq, skb->end_seq,
+					   TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
 					   tp->rcv_wup, tp->rcv_wnd);
 			}
 			tcp_send_ack(sk);
@@ -1731,7 +1724,7 @@
 		}
 	}
 
-	if(th->syn && skb->seq != tp->syn_seq) {
+	if(th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
 		SOCK_DEBUG(sk, "syn in established state\n");
 		tcp_statistics.TcpInErrs++;
 		tcp_reset(sk, skb);
@@ -1744,7 +1737,7 @@
 	}
 
 	if(th->ack)
-		tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+		tcp_ack(sk, th, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->ack_seq, len);
 	
 	/* Process urgent data. */
 	tcp_urg(sk, th, len);
@@ -1793,7 +1786,7 @@
 		flg &= __constant_htonl(0x00170000);
 		/* Only SYN set? */
 		if (flg == __constant_htonl(0x00020000)) {
-			if (!after(skb->seq, req->rcv_isn)) {
+			if (!after(TCP_SKB_CB(skb)->seq, req->rcv_isn)) {
 				/*	retransmitted SYN.
 				 */
 				req->class->rtx_syn_ack(sk, req); 
@@ -1811,8 +1804,8 @@
 		 * but we do it here to prevent syn flood attackers
 		 * from creating big SYN_RECV sockets.
 		 */ 
-		if (!between(skb->ack_seq, req->snt_isn, req->snt_isn+1) ||
-		    !between(skb->seq, req->rcv_isn, 
+		if (!between(TCP_SKB_CB(skb)->ack_seq, req->snt_isn, req->snt_isn+1) ||
+		    !between(TCP_SKB_CB(skb)->seq, req->rcv_isn, 
 			     req->rcv_isn+1+req->rcv_wnd)) {
 			req->class->send_reset(skb);
 			return NULL;
@@ -1885,10 +1878,11 @@
 		 * not be in line code. [AC]
 		 */
 		if(th->ack) {
-			tp->snd_wl1 = skb->seq;
+			tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
 
 			/* We got an ack, but it's not a good ack. */
-			if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len)) {
+			if(!tcp_ack(sk,th, TCP_SKB_CB(skb)->seq,
+				    TCP_SKB_CB(skb)->ack_seq, len)) {
 				tcp_statistics.TcpAttemptFails++;
 				return 1;
 			}
@@ -1909,13 +1903,13 @@
 			/* Ok.. it's good. Set up sequence numbers and
 			 * move to established.
 			 */
-			tp->rcv_nxt = skb->seq+1;
-			tp->rcv_wup = skb->seq+1;
+			tp->rcv_nxt = TCP_SKB_CB(skb)->seq+1;
+			tp->rcv_wup = TCP_SKB_CB(skb)->seq+1;
 
 			tp->snd_wnd = htons(th->window) << tp->snd_wscale;
-			tp->snd_wl1 = skb->seq;
-			tp->snd_wl2 = skb->ack_seq;
-			tp->fin_seq = skb->seq;
+			tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+			tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
+			tp->fin_seq = TCP_SKB_CB(skb)->seq;
 
 			tcp_set_state(sk, TCP_ESTABLISHED);
 			tcp_parse_options(sk, th, tp, 0);
@@ -1983,11 +1977,11 @@
 					tp->ts_recent_stamp = jiffies;
 				}
 				
-				tp->rcv_nxt = skb->seq + 1;
-				tp->rcv_wup = skb->seq + 1;
+				tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+				tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
 
 				tp->snd_wnd = htons(th->window);
-				tp->snd_wl1 = skb->seq;
+				tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
 				
 				tcp_send_synack(sk);
 				goto discard;
@@ -2008,18 +2002,18 @@
 		 * guarantee this.
 		 */
 		if (tp->saw_tstamp) {
-			if (tcp_paws_discard(tp)) {
+			if (tcp_paws_discard(tp, th, len)) {
 				if (!th->rst) {
 					tcp_send_ack(sk);
 					goto discard;
 				}
 			}
-			tcp_replace_ts_recent(tp,skb->end_seq);
+			tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->end_seq);
 		}
 	}
 
 	/* step 1: check sequence number */
-	if (!tcp_sequence(tp, skb->seq, skb->end_seq)) {
+	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
 		if (!th->rst) {
 			tcp_send_ack(sk);
 			goto discard;
@@ -2050,14 +2044,15 @@
 	 *	original syn. 
 	 */
 
-	if (th->syn && skb->seq!=tp->syn_seq) {
+	if (th->syn && TCP_SKB_CB(skb)->seq != tp->syn_seq) {
 		tcp_reset(sk, skb);
 		return 1;
 	}
 
 	/* step 5: check the ACK field */
 	if (th->ack) {
-		int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len);
+		int acceptable = tcp_ack(sk, th, TCP_SKB_CB(skb)->seq,
+					 TCP_SKB_CB(skb)->ack_seq, len);
 		
 		switch(sk->state) {
 		case TCP_SYN_RECV:
@@ -2069,10 +2064,10 @@
 				if(!sk->dead)
 					sk->state_change(sk);		
 
-				tp->snd_una = skb->ack_seq;
+				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
 				tp->snd_wnd = htons(th->window) << tp->snd_wscale;
-				tp->snd_wl1 = skb->seq;
-				tp->snd_wl2 = skb->ack_seq;
+				tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+				tp->snd_wl2 = TCP_SKB_CB(skb)->ack_seq;
 
 			} else {
 				SOCK_DEBUG(sk, "bad ack\n");
@@ -2117,7 +2112,7 @@
 	switch (sk->state) {
 	case TCP_CLOSE_WAIT:
 	case TCP_CLOSING:
-		if (!before(skb->seq, tp->fin_seq))
+		if (!before(TCP_SKB_CB(skb)->seq, tp->fin_seq))
 			break;
 	
 	case TCP_FIN_WAIT1:
@@ -2127,7 +2122,7 @@
 		 * BSD 4.4 also does reset.
 		 */
 		if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead) {
-			if (after(skb->end_seq - th->fin, tp->rcv_nxt)) {
+			if (after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
 				tcp_reset(sk, skb);
 				return 1;
 			}
@@ -2150,27 +2145,4 @@
 		kfree_skb(skb);
 	}
 	return 0;
-}
-
-int tcp_sysctl_congavoid(ctl_table *ctl, int write, struct file * filp,
-			 void *buffer, size_t *lenp)
-{
-	int val = sysctl_tcp_cong_avoidance;
-	int retv;
-	static tcp_sys_cong_ctl_t tab[] = { 
-		tcp_cong_avoid_vanj, 
-		tcp_cong_avoid_vegas
-	};
-
-	retv = proc_dointvec(ctl, write, filp, buffer, lenp);
-
-	if (write) {
-		if ((unsigned)sysctl_tcp_cong_avoidance > 1) {
-			retv = -EINVAL;
-			sysctl_tcp_cong_avoidance = val;
-		} else {
-			tcp_sys_cong_ctl_f = tab[sysctl_tcp_cong_avoidance];
-		}
-	}
-	return retv;
 }
