patch-2.1.8 linux/net/ipv4/tcp_input.c

Next file: linux/net/ipv4/tcp_ipv4.c
Previous file: linux/net/ipv4/tcp.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.7/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c
@@ -18,81 +18,85 @@
  *		Matthew Dillon, <dillon@apollo.west.oic.com>
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ *	TODO
+ *		- A better sock cache
+ *
+ */
+
+/*
+ * Changes:
+ *		Pedro Roque	:	Fast Retransmit/Recovery.
+ *					Two receive queues.
+ *					Retransmit queue handled by TCP.
+ *					Better retransmit timer handling.
+ *					New congestion avoidance.
+ *					Header prediction.
+ *					Variable renaming.
  *
- * FIXES
- *		Pedro Roque	:	Double ACK bug
+ *		Eric		:	Fast Retransmit.
+ *		Randy Scott	:	MSS option defines.
  *		Eric Schenk	:	Fixes to slow start algorithm.
  *		Eric Schenk	:	Yet another double ACK bug.
  *		Eric Schenk	:	Delayed ACK bug fixes.
  *		Eric Schenk	:	Floyd style fast retrans war avoidance.
- *		Eric Schenk	: 	Skip fast retransmit on small windows.
- *		Eric schenk	:	Fixes to retransmission code to
- *				:	avoid extra retransmission.
- *		Theodore Ts'o	:	Do secure TCP sequence numbers.
  */
 
 #include <linux/config.h>
-#include <linux/types.h>
-#include <linux/random.h>
 #include <net/tcp.h>
 
+
 /*
- *	Policy code extracted so it's now separate
+ *	Policy code extracted so it's now seperate
  */
 
 /*
  *	Called each time to estimate the delayed ack timeout. This is
- *	how it should be done so a fast link isn't impacted by ack delay.
+ *	how it should be done so a fast link isnt impacted by ack delay.
+ *
+ *	I think we need a medium deviation here also...
+ *	The estimated value is changing to fast
  */
  
-extern __inline__ void tcp_delack_estimator(struct sock *sk)
+extern __inline__ void tcp_delack_estimator(struct tcp_opt *tp)
 {
+	int m;
+
 	/*
 	 *	Delayed ACK time estimator.
 	 */
 	
-	if (sk->lrcvtime == 0) 
-	{
-		sk->lrcvtime = jiffies;
-		sk->ato = HZ/3;
-	}
-	else 
+	m = jiffies - tp->lrcvtime;
+
+	tp->lrcvtime = jiffies;
+
+	if (m < 0)
+		return;
+
+	/*
+	 * if the mesured value is bigger than
+	 * twice the round trip time ignore it.
+	 */
+	if ((m << 2) <= tp->srtt) 
 	{
-		int m;
-		
-		m = jiffies - sk->lrcvtime;
+		m -= (tp->iat >> 3);
+		tp->iat += m;
 
-		sk->lrcvtime = jiffies;
+		if (m <0)
+			m = -m;
 
-		if (m <= 0)
-			m = 1;
+		m -= (tp->iat_mdev >> 2);
+		tp->iat_mdev += m;
 
-		/* This used to test against sk->rtt.
-		 * On a purely receiving link, there is no rtt measure.
-		 * The result is that we lose delayed ACKs on one-way links.
-		 * Therefore we test against sk->rto, which will always
-		 * at least have a default value.
-		 */
-		if (m > sk->rto)
-		{
-			sk->ato = sk->rto;
-			/*
-			 * printk(KERN_DEBUG "ato: rtt %lu\n", sk->ato);
-			 */
-		}
-		else 
-		{
-			/*
-		 	 * Very fast acting estimator.
-		 	 * May fluctuate too much. Probably we should be
-			 * doing something like the rtt estimator here.
-			 */
-			sk->ato = (sk->ato >> 1) + m;
-			/*
-			 * printk(KERN_DEBUG "ato: m %lu\n", sk->ato);
-			 */
-		}
+		tp->ato = (tp->iat >> 3) + (tp->iat_mdev >> 2);
+
+		if (tp->ato < HZ/50)
+			tp->ato = HZ/50;
 	}
+	else
+		tp->ato = 0;
 }
 
 /*
@@ -100,8 +104,8 @@
  *	retransmitted [see Karn/Partridge Proceedings SIGCOMM 87]. 
  *	The algorithm is from the SIGCOMM 88 piece by Van Jacobson.
  */
- 
-extern __inline__ void tcp_rtt_estimator(struct sock *sk, struct sk_buff *oskb)
+
+extern __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
 {
 	long m;
 	/*
@@ -111,130 +115,72 @@
 	 *	This is designed to be as fast as possible 
 	 *	m stands for "measurement".
 	 */
-	
-	m = jiffies - oskb->when;  /* RTT */
+	/*
+	 *	On a 1990 paper the rto value is changed to:
+	 *	RTO = rtt + 4 * mdev
+	 */
 
-	if (sk->rtt != 0) {
+	m = mrtt;  /* RTT */
+
+	if (tp->srtt != 0) {
 		if(m<=0)
 			m=1;		/* IS THIS RIGHT FOR <0 ??? */
-		m -= (sk->rtt >> 3);    /* m is now error in rtt est */
-		sk->rtt += m;           /* rtt = 7/8 rtt + 1/8 new */
+		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
+		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
 		if (m < 0)
 			m = -m;		/* m is now abs(error) */
-		m -= (sk->mdev >> 2);   /* similar update on mdev */
-		sk->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
+		m -= (tp->mdev >> 2);   /* similar update on mdev */
+		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
 	} else {
-		/* no previous measure. */
-		sk->rtt = m<<3;		/* take the measured time to be rtt */
-		sk->mdev = m<<1;	/* make sure rto = 3*rtt */
+					/* no previous measure. */
+		tp->srtt = m<<3;	/* take the measured time to be rtt */
+		tp->mdev = m<<2;	/* make sure rto = 3*rtt */
 	}
 
+
 	/*
 	 *	Now update timeout.  Note that this removes any backoff.
 	 */
 			 
-	/* Jacobson's algorithm calls for rto = R + 4V.
-	 * We diverge from Jacobson's algorithm here. See the commentary
-	 * in tcp_ack to understand why.
-	 */
-	sk->rto = (sk->rtt >> 3) + sk->mdev;
-	sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
-	if (sk->rto > 120*HZ)
-		sk->rto = 120*HZ;
-	if (sk->rto < HZ/5)	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
-		sk->rto = HZ/5;
-	sk->backoff = 0;
-}
-
-/*
- *	Cached last hit socket
- */
- 
-static volatile unsigned long 	th_cache_saddr, th_cache_daddr;
-static volatile unsigned short  th_cache_dport, th_cache_sport;
-static volatile struct sock *th_cache_sk;
-
-void tcp_cache_zap(void)
-{
-	th_cache_sk=NULL;
-}
+	tp->rto = (tp->srtt >> 3) + tp->mdev;
 
-/*
- *	Find the socket, using the last hit cache if applicable. The cache is not quite
- *	right...
- */
+	if (tp->rto > 120*HZ)
+		tp->rto = 120*HZ;
 
-static inline struct sock * get_tcp_sock(u32 saddr, u16 sport, u32 daddr, u16 dport, u32 paddr, u16 pport)
-{
-	struct sock * sk;
+	/* Was 1*HZ - keep .2 as minimum cos of the BSD delayed acks */
+	if (tp->rto < HZ/5)
+		tp->rto = HZ/5;
 
-	sk = (struct sock *) th_cache_sk;
-	if (!sk || saddr != th_cache_saddr || daddr != th_cache_daddr ||
-	    sport != th_cache_sport || dport != th_cache_dport) {
-		sk = get_sock(&tcp_prot, dport, saddr, sport, daddr, paddr, pport);
-		if (sk) {
-			th_cache_saddr=saddr;
-			th_cache_daddr=daddr;
-  			th_cache_dport=dport;
-			th_cache_sport=sport;
-			th_cache_sk=sk;
-		}
-	}
-	return sk;
+	tp->backoff = 0;
 }
+ 
 
 /*
- * React to an out-of-window TCP sequence number in an incoming packet
+ *	This functions checks to see if the tcp header is actually acceptable. 
  */
  
-static void bad_tcp_sequence(struct sock *sk, struct tcphdr *th, u32 end_seq,
-	      struct device *dev)
+extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 seg_nxt)
 {
-	if (th->rst)
-		return;
+	u32 end_window = tp->rcv_wup + tp->rcv_wnd;
+	u32 end_seq = seg_nxt;
 
 	/*
-	 *	Send a reset if we get something not ours and we are
-	 *	unsynchronized. Note: We don't do anything to our end. We
-	 *	are just killing the bogus remote connection then we will
-	 *	connect again and it will work (with luck).
+	 *	When the window is open (most common case)
+	 *	we want to accept segments if they have yet unseen data
+	 *	or in the case of a dataless segment if seg.seq == rcv.nxt
+	 *	this means:
+	 *
+	 *	if (seq == end_seq)
+	 *		end_seq >= rcv.nxt
+	 *	else
+	 *		end_seq >  rcv.nxt
 	 */
-  	 
-	if (sk->state==TCP_SYN_SENT || sk->state==TCP_SYN_RECV) 
-	{
-		tcp_send_reset(sk->saddr,sk->daddr,th,sk->prot,NULL,dev, sk->ip_tos,sk->ip_ttl);
-		return;
-	}
 
-	/*
-	 * 	This packet is old news. Usually this is just a resend
-	 * 	from the far end, but sometimes it means the far end lost
-	 *	an ACK we sent, so we better send an ACK.
-	 */
-	/*
-	 *	BEWARE! Unconditional answering by ack to out-of-window ack
-	 *	can result in infinite exchange of empty acks.
-	 *	This check cures bug, found by Michiel Boland, but
-	 *	not another possible cases.
-	 *	If we are in TCP_TIME_WAIT, we have already received
-	 *	FIN, so that our peer need not window update. If our
-	 *	ACK were lost, peer would retransmit his FIN anyway. --ANK
-	 */
-	if (sk->state != TCP_TIME_WAIT || ntohl(th->seq) != end_seq)
-		tcp_send_ack(sk);
-}
+	if (seq == end_seq)
+		end_seq++;
 
-/*
- *	This functions checks to see if the tcp header is actually acceptable. 
- */
- 
-extern __inline__ int tcp_sequence(struct sock *sk, u32 seq, u32 end_seq)
-{
-	u32 end_window = sk->lastwin_seq + sk->window;
-	return	/* if start is at end of window, end must be too (zero window) */
-		(seq == end_window && seq == end_seq) ||
-		/* if start is before end of window, check for interest */
-		(before(seq, end_window) && !before(end_seq, sk->acked_seq));
+	return ((before(seq, end_window) && after(end_seq, tp->rcv_nxt)) ||
+		(seq == end_window && seq == end_seq));
 }
 
 /*
@@ -273,7 +219,7 @@
 #endif	
 	if (!sk->dead) 
 		sk->state_change(sk);
-	kfree_skb(skb, FREE_READ);
+
 	return(0);
 }
 
@@ -289,11 +235,11 @@
  *	as Linux gets deployed on 100Mb/sec networks.
  */
  
-static void tcp_options(struct sock *sk, struct tcphdr *th)
+int tcp_parse_options(struct tcphdr *th)
 {
 	unsigned char *ptr;
 	int length=(th->doff*4)-sizeof(struct tcphdr);
-	int mss_seen = 0;
+	int mss = 0;
     
 	ptr = (unsigned char *)(th + 1);
   
@@ -304,7 +250,7 @@
 	  	switch(opcode)
 	  	{
 	  		case TCPOPT_EOL:
-	  			return;
+	  			return 0;
 	  		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
 	  			length--;
 	  			ptr--;		/* the opsize=*ptr++ above was a mistake */
@@ -312,14 +258,13 @@
 	  		
 	  		default:
 	  			if(opsize<=2)	/* Avoid silly options looping forever */
-	  				return;
+	  				return 0;
 	  			switch(opcode)
 	  			{
 	  				case TCPOPT_MSS:
-	  					if(opsize==4 && th->syn)
+	  					if(opsize==TCPOLEN_MSS && th->syn)
 	  					{
-	  						sk->mtu=min(sk->mtu,ntohs(*(unsigned short *)ptr));
-							mss_seen = 1;
+							mss = ntohs(*(unsigned short *)ptr);
 	  					}
 	  					break;
 		  				/* Add other options here as people feel the urge to implement stuff like large windows */
@@ -328,612 +273,420 @@
 	  			length-=opsize;
 	  	}
 	}
-	if (th->syn) 
-	{
-		if (! mss_seen)
-		      sk->mtu=min(sk->mtu, 536);  /* default MSS if none sent */
-	}
-#ifdef CONFIG_INET_PCTCP
-	sk->mss = min(sk->max_window >> 1, sk->mtu);
-#else    
-	sk->mss = min(sk->max_window, sk->mtu);
-	sk->max_unacked = 2 * sk->mss;
-#endif  
+
+	return mss;
 }
 
 
-/*
- *	This routine handles a connection request.
- *	It should make sure we haven't already responded.
- *	Because of the way BSD works, we have to send a syn/ack now.
- *	This also means it will be harder to close a socket which is
- *	listening.
+/* 
+ *  See draft-stevens-tcpca-spec-01 for documentation.
  */
- 
-static void tcp_conn_request(struct sock *sk, struct sk_buff *skb,
-		 u32 daddr, u32 saddr, struct options *opt, struct device *dev, u32 seq)
-{
-	struct sock *newsk;
-	struct tcphdr *th;
-	struct rtable *rt;
-  
-	th = skb->h.th;
 
-	/* If the socket is dead, don't accept the connection. */
-	if (!sk->dead) 
-	{
-  		sk->data_ready(sk,0);
-	}
-	else 
-	{
-		if(sk->debug)
-			printk("Reset on %p: Connect on dead socket.\n",sk);
-		tcp_send_reset(daddr, saddr, th, sk->prot, opt, dev, sk->ip_tos,sk->ip_ttl);
-		tcp_statistics.TcpAttemptFails++;
-		kfree_skb(skb, FREE_READ);
-		return;
-	}
+static void tcp_fast_retrans(struct sock *sk, u32 ack, int not_dup)
+{
+	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
 
 	/*
-	 *	Make sure we can accept more.  This will prevent a
-	 *	flurry of syns from eating up all our memory.
-	 *
-	 *	BSD does some funnies here and allows 3/2 times the
-	 *	set backlog as a fudge factor. That's just too gross.
+	 * An ACK is a duplicate if:
+	 * (1) it has the same sequence number as the largest number we've 
+	 *     seen,
+	 * (2) it has the same window as the last ACK,
+	 * (3) we have outstanding data that has not been ACKed
+	 * (4) The packet was not carrying any data.
+	 * (5) [From Floyds paper on fast retransmit wars]
+	 *     The packet acked data after high_seq;
 	 */
 
-	if (sk->ack_backlog >= sk->max_ack_backlog) 
+	if (ack == tp->snd_una && sk->packets_out && (not_dup == 0) &&
+	    after(ack, tp->high_seq))
 	{
-		tcp_statistics.TcpAttemptFails++;
-		kfree_skb(skb, FREE_READ);
-		return;
-	}
-
-	/*
-	 * We need to build a new sock struct.
-	 * It is sort of bad to have a socket without an inode attached
-	 * to it, but the wake_up's will just wake up the listening socket,
-	 * and if the listening socket is destroyed before this is taken
-	 * off of the queue, this will take care of it.
-	 */
+		
+		sk->dup_acks++;	
+		
 
-	newsk = (struct sock *) kmalloc(sizeof(struct sock), GFP_ATOMIC);
-	if (newsk == NULL) 
-	{
-		/* just ignore the syn.  It will get retransmitted. */
-		tcp_statistics.TcpAttemptFails++;
-		kfree_skb(skb, FREE_READ);
-		return;
-	}
+		/*
+		 * 1. When the third duplicate ack is received, set ssthresh 
+		 * to one half the current congestion window, but no less 
+		 * than two segments. Retransmit the missing segment.
+		 */
+	
+		if (sk->dup_acks == 3) 
+		{
+			sk->ssthresh = max(sk->cong_window >> 1, 2);
+			sk->cong_window = sk->ssthresh + 3;
+			tcp_do_retransmit(sk, 0);
+		}
 
-	memcpy(newsk, sk, sizeof(*newsk));
-	newsk->opt = NULL;
-	newsk->ip_route_cache  = NULL;
-	if (opt && opt->optlen) 
-	{
-		sk->opt = (struct options*)kmalloc(sizeof(struct options)+opt->optlen, GFP_ATOMIC);
-		if (!sk->opt) 
+		/*
+		 * 2. Each time another duplicate ACK arrives, increment 
+		 * cwnd by the segment size. [...] Transmit a packet...
+		 *
+		 * Packet transmission will be done on normal flow processing
+		 * since we're not in "retransmit mode"
+		 */
+		
+		if (sk->dup_acks > 3) 
 		{
-	        	kfree_s(newsk, sizeof(struct sock));
-			tcp_statistics.TcpAttemptFails++;
-			kfree_skb(skb, FREE_READ);
-			return;
+			sk->cong_window++;
 		}
-		if (ip_options_echo(sk->opt, opt, daddr, saddr, skb)) 
+	}
+	else
+	{
+		/*
+		 * 3. When the next ACK arrives that acknowledges new data,
+		 *    set cwnd to ssthresh
+		 */
+
+		if (sk->dup_acks >= 3)
 		{
-			kfree_s(sk->opt, sizeof(struct options)+opt->optlen);
-	        	kfree_s(newsk, sizeof(struct sock));
-			tcp_statistics.TcpAttemptFails++;
-			kfree_skb(skb, FREE_READ);
-			return;
+			sk->tp_pinfo.af_tcp.retrans_head = NULL;
+			sk->cong_window = sk->ssthresh;
+			sk->retransmits = 0;
 		}
+		sk->dup_acks = 0;
 	}
-	skb_queue_head_init(&newsk->write_queue);
-	skb_queue_head_init(&newsk->receive_queue);
-	newsk->send_head = NULL;
-	newsk->send_tail = NULL;
-	newsk->send_next = NULL;
-	skb_queue_head_init(&newsk->back_log);
-	newsk->rtt = 0;
-	newsk->rto = TCP_TIMEOUT_INIT;
-	newsk->mdev = TCP_TIMEOUT_INIT;
-	newsk->max_window = 0;
-	/*
-	 * See draft-stevens-tcpca-spec-01 for discussion of the
-	 * initialization of these values.
-	 */
-	newsk->cong_window = 1;
-	newsk->cong_count = 0;
-	newsk->ssthresh = 0x7fffffff;
-
-	newsk->lrcvtime = 0;
-	newsk->idletime = 0;
-	newsk->high_seq = 0;
-	newsk->backoff = 0;
-	newsk->blog = 0;
-	newsk->intr = 0;
-	newsk->proc = 0;
-	newsk->done = 0;
-	newsk->partial = NULL;
-	newsk->pair = NULL;
-	newsk->wmem_alloc = 0;
-	newsk->rmem_alloc = 0;
-	newsk->localroute = sk->localroute;
-
-	newsk->max_unacked = MAX_WINDOW - TCP_WINDOW_DIFF;
-
-	newsk->err = 0;
-	newsk->shutdown = 0;
-	newsk->ack_backlog = 0;
-	newsk->acked_seq = skb->seq+1;
-	newsk->lastwin_seq = skb->seq+1;
-	newsk->delay_acks = 1;
-	newsk->copied_seq = skb->seq+1;
-	newsk->fin_seq = skb->seq;
-	newsk->syn_seq = skb->seq;
-	newsk->state = TCP_SYN_RECV;
-	newsk->timeout = 0;
-	newsk->ip_xmit_timeout = 0;
-	newsk->write_seq = seq; 
-	newsk->window_seq = newsk->write_seq;
-	newsk->rcv_ack_seq = newsk->write_seq;
-	newsk->urg_data = 0;
-	newsk->retransmits = 0;
-	newsk->linger=0;
-	newsk->destroy = 0;
-	init_timer(&newsk->timer);
-	newsk->timer.data = (unsigned long)newsk;
-	newsk->timer.function = &net_timer;
-	init_timer(&newsk->delack_timer);
-	newsk->delack_timer.data = (unsigned long)newsk;
-	newsk->delack_timer.function = tcp_delack_timer;
-	init_timer(&newsk->retransmit_timer);
-	newsk->retransmit_timer.data = (unsigned long)newsk;
-	newsk->retransmit_timer.function = tcp_retransmit_timer;
-	newsk->dummy_th.source = skb->h.th->dest;
-	newsk->dummy_th.dest = skb->h.th->source;
 	
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-	/* 
-	 *	Deal with possibly redirected traffic by setting num to
-	 *	the intended destination port of the received packet.
-	 */
-	newsk->num = ntohs(skb->h.th->dest);
-
-#endif
-	/*
-	 *	Swap these two, they are from our point of view. 
-	 */
-	 
-	newsk->daddr = saddr;
-	newsk->saddr = daddr;
-	newsk->rcv_saddr = daddr;
+}
 
-	put_sock(newsk->num,newsk);
-	newsk->acked_seq = skb->seq + 1;
-	newsk->copied_seq = skb->seq + 1;
-	newsk->socket = NULL;
+int sysctl_tcp_vegas_cong_avoidance = 1;
 
-	/*
-	 *	Grab the ttl and tos values and use them 
-	 */
+/*
+ *      TCP slow start and congestion avoidance in two flavors:
+ *      RFC 1122 and TCP Vegas.
+ *
+ *      This is a /proc/sys configurable option. 
+ */
 
-	newsk->ip_ttl=sk->ip_ttl;
-	newsk->ip_tos=skb->ip_hdr->tos;
+#define SHIFT_FACTOR 12
 
+static void tcp_cong_avoid_vegas(struct sock *sk, u32 seq, u32 ack,
+				 u32 seq_rtt)
+{
 	/*
-	 *	Use 512 or whatever user asked for 
+	 *	From:
+	 *      TCP Vegas: New Techniques for Congestion 
+	 *	Detection and Avoidance.
+	 *              
+	 *
+	 *	Warning: This code is a scratch implementation taken
+	 *	from the paper only. The code they distribute seams
+	 *	to have improved several things over the initial spec.
 	 */
 
-	/*
-	 * 	Note use of sk->user_mss, since user has no direct access to newsk 
-	 */
+	u32 Actual, Expected;
+	u32 snt_bytes;
+	struct tcp_opt * tp;
 
-	rt = ip_rt_route(newsk->opt && newsk->opt->srr ? newsk->opt->faddr : saddr, 0);
-	newsk->ip_route_cache = rt;
+	tp = &(sk->tp_pinfo.af_tcp);
+
+	if (!seq_rtt)
+		seq_rtt = 1;
 	
-	if(rt!=NULL && (rt->rt_flags&RTF_WINDOW))
-		newsk->window_clamp = rt->rt_window;
+	if (tp->basertt)
+		tp->basertt = min(seq_rtt, tp->basertt);
 	else
-		newsk->window_clamp = 0;
+		tp->basertt = seq_rtt;
+		
 		
-	if (sk->user_mss)
-		newsk->mtu = sk->user_mss;
-	else if (rt)
-		newsk->mtu = rt->rt_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr);
-	else 
-		newsk->mtu = 576 - sizeof(struct iphdr) - sizeof(struct tcphdr);
-
-	/*
-	 *	But not bigger than device MTU 
-	 */
-
-	newsk->mtu = min(newsk->mtu, dev->mtu - sizeof(struct iphdr) - sizeof(struct tcphdr));
-
-#ifdef CONFIG_SKIP
-	
-	/*
-	 *	SKIP devices set their MTU to 65535. This is so they can take packets
-	 *	unfragmented to security process then fragment. They could lie to the
-	 *	TCP layer about a suitable MTU, but it's easier to let skip sort it out
-	 *	simply because the final package we want unfragmented is going to be
-	 *
-	 *	[IPHDR][IPSP][Security data][Modified TCP data][Security data]
-	 */
-	 
-	if(skip_pick_mtu!=NULL)		/* If SKIP is loaded.. */
-		sk->mtu=skip_pick_mtu(sk->mtu,dev);
-#endif
 	/*
-	 *	This will min with what arrived in the packet 
+	 * 
+	 *	Actual	 = throughput for this segment.
+	 *	Expected = number_of_bytes in transit / BaseRTT
+	 * 
 	 */
 
-	tcp_options(newsk,skb->h.th);
-	
-	tcp_cache_zap();
-	tcp_send_synack(newsk, sk, skb);
-}
-
+	snt_bytes = (ack - seq) << SHIFT_FACTOR;
+		
+	Actual =  snt_bytes / seq_rtt;
+	Expected = ((tp->snd_nxt - tp->snd_una) << SHIFT_FACTOR) / tp->basertt;
 
-/*
- * Handle a TCP window that shrunk on us. It shouldn't happen,
- * but..
- *
- * We may need to move packets from the send queue
- * to the write queue, if the window has been shrunk on us.
- * The RFC says you are not allowed to shrink your window
- * like this, but if the other end does, you must be able
- * to deal with it.
- */
-void tcp_window_shrunk(struct sock * sk, u32 window_seq)
-{
-	struct sk_buff *skb;
-	struct sk_buff *skb2;
-	struct sk_buff *wskb = NULL;
- 	
-	skb2 = sk->send_head;
-	sk->send_head = NULL;
-	sk->send_tail = NULL;
-	sk->send_next = NULL;
-
-	/*
-	 *	This is an artifact of a flawed concept. We want one
-	 *	queue and a smarter send routine when we send all.
-	 */
-	cli();
-	while (skb2 != NULL) 
-	{
-		skb = skb2;
-		skb2 = skb->link3;
-		skb->link3 = NULL;
-		if (after(skb->end_seq, window_seq)) 
-		{
-			if (sk->packets_out > 0) 
-				sk->packets_out--;
-			/* We may need to remove this from the dev send list. */
-			if (skb->next != NULL) 
-			{
-				skb_unlink(skb);				
-			}
-			/* Now add it to the write_queue. */
-			if (wskb == NULL)
-				skb_queue_head(&sk->write_queue,skb);
-			else
-				skb_append(wskb,skb);
-			wskb = skb;
-		} 
-		else 
-		{
-			if (sk->send_head == NULL) 
-			{
-				sk->send_head = skb;
-				sk->send_tail = skb;
-				sk->send_next = skb;
+/*		
+	printk(KERN_DEBUG "A:%x E:%x rtt:%x srtt:%x win: %d\n", 
+	       Actual, Expected, seq_rtt, tp->srtt, sk->cong_window);
+      */
+	/*
+	 *      Slow Start
+	 */
+	
+	if (sk->cong_window < sk->ssthresh &&
+	    (seq == tp->snd_nxt ||
+	      (((Expected - Actual) <=
+		((TCP_VEGAS_GAMMA << SHIFT_FACTOR) * sk->mss / tp->basertt))
+	       )
+	     ))
+	{
+			
+		/*
+		 * "Vegas allows exponential growth only every other
+		 *  RTT"
+		 */
+			
+		if (sk->cong_count || sk->cong_window <= 2)
+		{
+			sk->cong_window++;
+			sk->cong_count = 0;
+		}
+		else
+			sk->cong_count++;
+	}
+	else 
+	{
+		/*
+		 *      Congestion Avoidance
+		 */
+			
+		if (Expected - Actual <=
+		    ((TCP_VEGAS_ALPHA << SHIFT_FACTOR) * sk->mss / tp->basertt))
+		{
+			/* Increase Linearly */
+				
+			if (sk->cong_count >= sk->cong_window)
+			{
+				sk->cong_window++;
+				sk->cong_count = 0;
 			}
 			else
+				sk->cong_count++;
+		}
+			
+		if (Expected - Actual >=
+		    ((TCP_VEGAS_BETA << SHIFT_FACTOR) * sk->mss / tp->basertt))
+		{
+			/* Decrease Linearly */
+				
+			if (sk->cong_count >= sk->cong_window)
 			{
-				sk->send_tail->link3 = skb;
-				sk->send_tail = skb;
+				sk->cong_window--;
+				sk->cong_count = 0;
 			}
-			skb->link3 = NULL;
+			else
+				sk->cong_count++;
+				
+				
+			/* Never less than 2 segments */
+			if (sk->cong_window < 2)
+				sk->cong_window = 2;
 		}
+
+
+	}
+}
+
+static void tcp_cong_avoid_vanj(struct sock *sk, u32 seq, u32 ack, u32 seq_rtt)
+{
+	
+        /* 
+         * This is Jacobson's slow start and congestion avoidance. 
+         * SIGCOMM '88, p. 328.  Because we keep cong_window in 
+         * integral mss's, we can't do cwnd += 1 / cwnd.  
+         * Instead, maintain a counter and increment it once every 
+         * cwnd times.  
+         */
+
+        if (sk->cong_window <= sk->ssthresh)  
+	{
+                /* 
+                 *	In "safe" area, increase
+                 */
+
+                sk->cong_window++;
 	}
-	sti();
+        else 
+	{
+                /*
+                 *	In dangerous area, increase slowly.  
+                 *      In theory this is
+                 *  	sk->cong_window += 1 / sk->cong_window
+                 */
+
+                if (sk->cong_count >= sk->cong_window) {
+			
+                        sk->cong_window++;
+                        sk->cong_count = 0;
+                }
+                else 
+                        sk->cong_count++;
+        }       
 }
 
 
+#define FLAG_DATA		0x01
+#define FLAG_WIN_UPDATE		0x02
+#define FLAG_DATA_ACKED		0x04
+ 
 /*
  *	This routine deals with incoming acks, but not outgoing ones.
- *
- *	This routine is totally _WRONG_. The list structuring is wrong,
- *	the algorithm is wrong, the code is wrong.
  */
 
-static int tcp_ack(struct sock *sk, struct tcphdr *th, u32 ack, int len)
+static int tcp_ack(struct sock *sk, struct tcphdr *th, 
+		   u32 ack_seq, u32 ack, int len)
 {
 	int flag = 0;
-	u32 window_seq;
+	u32 seq = 0;
+	u32 seq_rtt = 0;
+	struct sk_buff *skb;
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 
-	/* 
-	 * 1 - there was data in packet as well as ack or new data is sent or 
-	 *     in shutdown state
-	 * 2 - data from retransmit queue was acked and removed
-	 * 4 - window shrunk or data from retransmit queue was acked and removed
-	 */
 
 	if(sk->zapped)
 		return(1);	/* Dead, can't ack any more so why bother */
 
-	/*
-	 *	We have dropped back to keepalive timeouts. Thus we have
-	 *	no retransmits pending.
-	 */
 	 
-	if (sk->ip_xmit_timeout == TIME_KEEPOPEN)
-	  	sk->retransmits = 0;
+	if (tp->pending == TIME_KEEPOPEN) 
+	{
+	  	tp->probes_out = 0;
+	}
 
+	tp->rcv_tstamp = jiffies;
+		
 	/*
 	 *	If the ack is newer than sent or older than previous acks
 	 *	then we can probably ignore it.
 	 */
 	 
-	if (after(ack, sk->sent_seq) || before(ack, sk->rcv_ack_seq)) 
+	if (after(ack, tp->snd_nxt) || before(ack, tp->snd_una))
 		goto uninteresting_ack;
 
 	/*
-	 *	Have we discovered a larger window
+	 *	If there is data set flag 1
 	 */
-	window_seq = ntohs(th->window);
-	if (window_seq > sk->max_window) 
+	 
+	if (len != th->doff*4) 
 	{
-  		sk->max_window = window_seq;
-#ifdef CONFIG_INET_PCTCP
-		/* Hack because we don't send partial packets to non SWS
-		   handling hosts */
-		sk->mss = min(window_seq>>1, sk->mtu);
-#else
-		sk->mss = min(window_seq, sk->mtu);
-#endif	
+		flag |= FLAG_DATA;
+		tcp_delack_estimator(tp);
 	}
-	window_seq += ack;
 
 	/*
-	 *	See if our window has been shrunk. 
+	 *	Update our send window
 	 */
-	if (after(sk->window_seq, window_seq))
-		tcp_window_shrunk(sk, window_seq);
 
 	/*
-	 *	Pipe has emptied
-	 */	 
-	if (sk->send_tail == NULL || sk->send_head == NULL) 
+	 *	This is the window update code as per RFC 793
+	 *	snd_wl{1,2} are used to prevent unordered
+	 *	segments from shrinking the window 
+	 */
+
+	if ((tp->snd_wl1 == 0) || before(tp->snd_wl1, ack_seq) ||
+	    (tp->snd_wl1 == ack_seq && !after(tp->snd_wl2, ack)))
 	{
-		sk->send_head = NULL;
-		sk->send_tail = NULL;
-		sk->send_next = NULL;
-		sk->packets_out= 0;
+		tp->snd_wnd = ntohs(th->window);
+		tp->snd_wl1 = ack_seq;
+		tp->snd_wl2 = ack;
+
+		flag |= FLAG_WIN_UPDATE;
+
+		if (tp->snd_wnd > sk->max_window)
+		{
+			sk->max_window = tp->snd_wnd;
+		}
 	}
 
+	
 	/*
-	 *	We don't want too many packets out there. 
+	 *	We passed data and got it acked, remove any soft error
+	 *	log. Something worked...
 	 */
 	 
-	if (sk->ip_xmit_timeout == TIME_WRITE && 
-		sk->cong_window < 2048 && after(ack, sk->rcv_ack_seq)) 
+	sk->err_soft = 0;
+
+	/*
+	 *	If this ack opens up a zero window, clear backoff.  It was
+	 *	being used to time the probes, and is probably far higher than
+	 *	it needs to be for normal retransmission.
+	 */
+
+	if (tp->pending == TIME_PROBE0) 
 	{
+		tp->probes_out = 0;	/* Our probe was answered */
 		
-		/* 
-		 * This is Jacobson's slow start and congestion avoidance. 
-		 * SIGCOMM '88, p. 328.  Because we keep cong_window in integral
-		 * mss's, we can't do cwnd += 1 / cwnd.  Instead, maintain a 
-		 * counter and increment it once every cwnd times.  It's possible
-		 * that this should be done only if sk->retransmits == 0.  I'm
-		 * interpreting "new data is acked" as including data that has
-		 * been retransmitted but is just now being acked.
+		/*
+		 *	Was it a usable window open ?
 		 */
-		if (sk->cong_window <= sk->ssthresh)
-			/* 
-			 *	In "safe" area, increase
-			 */
-			sk->cong_window++;
-		else 
+		 
+		/* should always be non-null */
+  		if (tp->send_head != NULL &&
+		    !before (ack + tp->snd_wnd, tp->send_head->end_seq))
 		{
-			/*
-			 *	In dangerous area, increase slowly.  In theory this is
-			 *  	sk->cong_window += 1 / sk->cong_window
-			 */
-			if (sk->cong_count >= sk->cong_window) 
-			{
-				sk->cong_window++;
-				sk->cong_count = 0;
-			}
-			else 
-				sk->cong_count++;
-		}
-	}
+			tp->backoff = 0;
+			tp->pending = 0;
 
-	/*
-	 *	Remember the highest ack received and update the
-	 *	right hand window edge of the host.
-	 *	We do a bit of work here to track number of times we've
-	 *	seen this ack without a change in the right edge of the
-	 *	window and no data in the packet.
-	 *	This will allow us to do fast retransmits.
-	 */
-
-	/* We are looking for duplicate ACKs here.
-	 * An ACK is a duplicate if:
-	 * (1) it has the same sequence number as the largest number we've seen,
-	 * (2) it has the same window as the last ACK,
-	 * (3) we have outstanding data that has not been ACKed
-	 * (4) The packet was not carrying any data.
-	 * (5) [From Floyd's paper on fast retransmit wars]
-	 *     The packet acked data after high_seq;
-	 * I've tried to order these in occurrence of most likely to fail
-	 * to least likely to fail.
-	 * [These are an extension of the rules BSD stacks use to
-	 *  determine if an ACK is a duplicate.]
-	 */
-
-	if (sk->rcv_ack_seq == ack
-		&& sk->window_seq == window_seq
-		&& len != th->doff*4
-		&& before(ack, sk->sent_seq)
-		&& after(ack, sk->high_seq))
-	{
-		/* Prevent counting of duplicate ACKs if the congestion
-		 * window is smaller than 3. Note that since we reduce
-		 * the congestion window when we do a fast retransmit,
-		 * we must be careful to keep counting if we were already
-		 * counting. The idea behind this is to avoid doing
-		 * fast retransmits if the congestion window is so small
-		 * that we cannot get 3 ACKs due to the loss of a packet
-		 * unless we are getting ACKs for retransmitted packets.
-		 */
-		if (sk->cong_window >= 3 || sk->rcv_ack_cnt > MAX_DUP_ACKS+1)
-			sk->rcv_ack_cnt++;
-		/* See draft-stevens-tcpca-spec-01 for explanation
-		 * of what we are doing here.
-		 */
-		if (sk->rcv_ack_cnt == MAX_DUP_ACKS+1) {
-			int tmp;
+                        tcp_clear_xmit_timer(sk, TIME_PROBE0);
 
-			/* We need to be a bit careful to preserve the
-			 * count of packets that are out in the system here.
-			 */
-			sk->ssthresh = max(sk->cong_window >> 1, 2);
-			sk->cong_window = sk->ssthresh+MAX_DUP_ACKS+1;
-			tmp = sk->packets_out;
-			tcp_do_retransmit(sk,0);
-			sk->packets_out = tmp;
-		} else if (sk->rcv_ack_cnt > MAX_DUP_ACKS+1) {
-			sk->cong_window++;
-			/*
-			* At this point we are suppose to transmit a NEW
-			* packet (not retransmit the missing packet,
-			* this would only get us into a retransmit war.)
-			* I think that having just adjusted cong_window
-			* we will transmit the new packet below.
-			*/
-		}
-	}
-	else
-	{
-		if (sk->rcv_ack_cnt > MAX_DUP_ACKS) {
-			sk->cong_window = sk->ssthresh;
 		}
-		sk->window_seq = window_seq;
-		sk->rcv_ack_seq = ack;
-		sk->rcv_ack_cnt = 1;
-	}
-	
-	/*
-	 *	We passed data and got it acked, remove any soft error
-	 *	log. Something worked...
-	 */
-	 
-	sk->err_soft = 0;
-
-	/*
-	 *	If this ack opens up a zero window, clear backoff.  It was
-	 *	being used to time the probes, and is probably far higher than
-	 *	it needs to be for normal retransmission.
-	 */
-
-	if (sk->ip_xmit_timeout == TIME_PROBE0) 
-	{
-		sk->retransmits = 0;	/* Our probe was answered */
-		
-		/*
-		 *	Was it a usable window open ?
-		 */
-		 
-  		if (!skb_queue_empty(&sk->write_queue) &&   /* should always be true */
-		    ! before (sk->window_seq, sk->write_queue.next->end_seq)) 
+                else
 		{
-			sk->backoff = 0;
-			
-			/*
-			 *	Recompute rto from rtt.  this eliminates any backoff.
-			 */
-
-			/*
-			 * Appendix C of Van Jacobson's final version of
-			 * the SIGCOMM 88 paper states that although
-			 * the original paper suggested that
-			 *  RTO = R*2V
-			 * was the correct calculation experience showed
-			 * better results using
-			 *  RTO = R*4V
-			 * In particular this gives better performance over
-			 * slow links, and should not effect fast links.
-			 *
-			 * Note: Jacobson's algorithm is fine on BSD which
-			 * has a 1/2 second granularity clock, but with our
-			 * 1/100 second granularity clock we become too
-	 		 * sensitive to minor changes in the round trip time.
-			 * We add in two compensating factors.
-			 * First we multiply by 5/4. For large congestion
-			 * windows this allows us to tolerate burst traffic
-			 * delaying up to 1/4 of our packets.
-			 * We also add in a rtt / cong_window term.
-			 * For small congestion windows this allows
-			 * a single packet delay, but has negligible effect
-			 * on the compensation for large windows.
-	 		 */
-			sk->rto = (sk->rtt >> 3) + sk->mdev;
-			sk->rto += (sk->rto>>2) + (sk->rto >> (sk->cong_window-1));
-			if (sk->rto > 120*HZ)
-				sk->rto = 120*HZ;
-			if (sk->rto < HZ/5)	/* Was 1*HZ, then 1 - turns out we must allow about
-						   .2 of a second because of BSD delayed acks - on a 100Mb/sec link
-						   .2 of a second is going to need huge windows (SIGH) */
-			sk->rto = HZ/5;
+                        tcp_reset_xmit_timer(sk, TIME_PROBE0, 
+					     min(tp->rto << tp->backoff, 
+						 120*HZ));
 		}
 	}
 
 	/* 
 	 *	See if we can take anything off of the retransmit queue.
 	 */
+   
+	start_bh_atomic();
 
-	for (;;) {
-		struct sk_buff * skb = sk->send_head;
-		if (!skb)
-			break;
-
+	while(((skb=skb_peek(&sk->write_queue)) != NULL) &&
+	      (skb != tp->send_head))
+	{
 		/* Check for a bug. */
-		if (skb->link3 && after(skb->end_seq, skb->link3->end_seq)) 
-			printk("INET: tcp.c: *** bug send_list out of order.\n");
-			
+
+		if (skb->next != (struct sk_buff*) &sk->write_queue &&
+		    after(skb->end_seq, skb->next->seq)) 
+			printk("INET: tcp_input.c: *** "
+			       "bug send_list out of order.\n");
+								
 		/*
 		 *	If our packet is before the ack sequence we can
-		 *	discard it as it's confirmed to have arrived the other end.
+		 *	discard it as it's confirmed to have arrived the 
+		 *	other end.
 		 */
 		 
-		if (after(skb->end_seq, ack))
-			break;
-
-		if (sk->retransmits) 
+		if (!after(skb->end_seq, ack)) 
 		{
-			/*
-			 *	We were retransmitting.  don't count this in RTT est 
-			 */
-			flag |= 2;
-		}
+			if (sk->debug)
+			{
+				printk(KERN_DEBUG "removing seg %x-%x from "
+				       "retransmit queue\n",
+				       skb->seq, skb->end_seq);
+			}
+			
+			tp->retrans_head = NULL;
+						
+			flag |= FLAG_DATA_ACKED;
+			seq = skb->seq;
+			seq_rtt = jiffies - skb->when;
+			
+			skb_unlink(skb);
+			atomic_dec(&sk->packets_out);
+			skb->free = 1;
 
-		if ((sk->send_head = skb->link3) == NULL)
+			kfree_skb(skb, FREE_WRITE);
+			
+			if (!sk->dead)
+				sk->write_space(sk);
+		}
+		else
 		{
-			sk->send_tail = NULL;
-			sk->send_next = NULL;
-			sk->retransmits = 0;
+			break;
 		}
+	}
 
-		/*
-		 * advance the send_next pointer if needed.
-		 */
-		if (sk->send_next == skb)
-			sk->send_next = sk->send_head;
+	end_bh_atomic();
+
+	/* 
+	 * if we where retransmiting don't count rtt estimate
+	 */
 
+	if (sk->retransmits)
+	{
+		if (sk->packets_out == 0)
+			sk->retransmits = 0;
+	}
+	else
+	{
 		/*
 		 * Note that we only reset backoff and rto in the
 		 * rtt recomputation code.  And that doesn't happen
@@ -946,274 +699,89 @@
 		 * suddenly.  I.e. Karn's algorithm. (SIGCOMM '87, p5.)
 		 */
 
-		/*
-		 *	We have one less packet out there. 
-		 */
-			 
-		if (sk->packets_out > 0) 
-			sk->packets_out --;
-
-		/* This is really only supposed to be called when we
-		 * are actually ACKing new data, which should exclude
-		 * the ACK handshake on an initial SYN packet as well.
-		 * Rather than introducing a new test here for this
-		 * special case, we just reset the initial values for
-		 * rtt immediately after we move to the established state.
-		 */
-		if (!(flag&2)) 	/* Not retransmitting */
-			tcp_rtt_estimator(sk,skb);
-		IS_SKB(skb);
-
-		/*
-		 *	We may need to remove this from the dev send list. 
-		 */
-		cli();
-		if (skb->next)
-			skb_unlink(skb);
-		sti();
-		kfree_skb(skb, FREE_WRITE); /* write. */
-		if (!sk->dead)
-			sk->write_space(sk);
-	}
-
-	/*
-	 * Maybe we can take some stuff off of the write queue,
-	 * and put it onto the xmit queue.
-	 * There is bizarre case being tested here, to check if
-	 * the data at the head of the queue ends before the start of
-	 * the sequence we already ACKed. This is not an error,
-	 * it can occur when we send a packet directly off of the write_queue
-	 * in a zero window probe.
-	 */
-
-	if (!skb_queue_empty(&sk->write_queue) &&
-	    	!before(sk->window_seq, sk->write_queue.next->end_seq) &&
-		(sk->retransmits == 0 || 
-		 sk->ip_xmit_timeout != TIME_WRITE ||
-		 !after(sk->write_queue.next->end_seq, sk->rcv_ack_seq)) &&
-		sk->packets_out < sk->cong_window)
-	{
-		/*
-		 *	Add more data to the send queue.
-		 */
-		tcp_write_xmit(sk);
-	}
-
-	/*
-	 * Reset timers to reflect the new state.
-	 *
-	 * from TIME_WAIT we stay in TIME_WAIT as long as we rx packets
-	 * from TCP_CLOSE we don't do anything
-	 *
-	 * from anything else, if there is queued data (or fin) pending,
-	 * we use a TIME_WRITE timeout, if there is data to write but
-	 * no room in the window we use TIME_PROBE0, else if keepalive
-	 * we reset to a KEEPALIVE timeout, else we delete the timer.
-	 *
-	 * We do not set flag for nominal write data, otherwise we may
-	 * force a state where we start to write itsy bitsy tidbits
-	 * of data.
-	 */
-
-	switch(sk->state) {
-	case TCP_TIME_WAIT:
-		/*
-		 * keep us in TIME_WAIT until we stop getting packets,
-		 * reset the timeout.
-		 */
-		tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
-		break;
-	case TCP_CLOSE:
-		/*
-		 * don't touch the timer.
-		 */
-		break;
-	default:
-		/*
-		 * 	Must check send_head and write_queue
-		 * 	to determine which timeout to use.
-		 */
-		if (sk->send_head) {
-			tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
-		} else if (!skb_queue_empty(&sk->write_queue)
-			&& sk->ack_backlog == 0)
+		if (flag & FLAG_DATA_ACKED)
 		{
-			/* 
-			 * if the write queue is not empty when we get here
-			 * then we failed to move any data to the retransmit
-			 * queue above. (If we had send_head would be non-NULL).
-			 * Furthermore, since the send_head is NULL here
-			 * we must not be in retransmit mode at this point.
-			 * This implies we have no packets in flight,
-			 * hence sk->packets_out < sk->cong_window.
-			 * Examining the conditions for the test to move
-			 * data to the retransmission queue we find that
-			 * we must therefore have a zero window.
-			 * Hence, if the ack_backlog is 0 we should initiate
-			 * a zero probe.
-			 * We don't do a zero probe if we have a delayed
-			 * ACK in hand since the other side may have a
-			 * window opening, but they are waiting to hear
-			 * from us before they tell us about it.
-			 * (They are applying Nagle's rule).
-			 * So, we don't set up the zero window probe
-			 * just yet. We do have to clear the timer
-			 * though in this case...
-			 */
-			tcp_reset_xmit_timer(sk, TIME_PROBE0, sk->rto);
-		} else if (sk->keepopen) {
-			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
-		} else {
-			del_timer(&sk->retransmit_timer);
-			sk->ip_xmit_timeout = 0;
+			tcp_rtt_estimator(tp, seq_rtt);
+			if (sysctl_tcp_vegas_cong_avoidance)
+			{
+				tcp_cong_avoid_vegas(sk, seq, ack, seq_rtt);
+			}
+			else
+			{
+				tcp_cong_avoid_vanj(sk, seq, ack, seq_rtt);
+			}
 		}
-		break;
-	}
-
-	/*
-	 *	We have nothing queued but space to send. Send any partial
-	 *	packets immediately (end of Nagle rule application).
-	 */
-	 
-	if (sk->packets_out == 0
-	    && sk->partial != NULL
-	    && skb_queue_empty(&sk->write_queue)
-	    && sk->send_head == NULL) 
-	{
-		tcp_send_partial(sk);
 	}
 
-	/*
-	 * In the LAST_ACK case, the other end FIN'd us.  We then FIN'd them, and
-	 * we are now waiting for an acknowledge to our FIN.  The other end is
-	 * already in TIME_WAIT.
-	 *
-	 * Move to TCP_CLOSE on success.
-	 */
+			
 
-	if (sk->state == TCP_LAST_ACK) 
+	/* Sanity check out packets_out counter */
+	if (skb_queue_len(&sk->write_queue) == 0 || 
+	    ack == tp->snd_nxt ) 
 	{
-		if (!sk->dead)
-			sk->state_change(sk);
-		if(sk->debug)
-			printk("rcv_ack_seq: %X==%X, acked_seq: %X==%X\n",
-				sk->rcv_ack_seq,sk->write_seq,sk->acked_seq,sk->fin_seq);
-		if (sk->rcv_ack_seq == sk->write_seq /*&& sk->acked_seq == sk->fin_seq*/) 
+		if (sk->packets_out) 
 		{
-			sk->shutdown = SHUTDOWN_MASK;
-			tcp_set_state(sk,TCP_CLOSE);
-			return 1;
-		}
+			printk(KERN_DEBUG "tcp_ack: packets_out %d\n",
+			       sk->packets_out);
+                        sk->packets_out = 0;
+                }
 	}
 
-	/*
-	 *	Incoming ACK to a FIN we sent in the case of our initiating the close.
-	 *
-	 *	Move to FIN_WAIT2 to await a FIN from the other end. Set
-	 *	SEND_SHUTDOWN but not RCV_SHUTDOWN as data can still be coming in.
-	 */
 
-	if (sk->state == TCP_FIN_WAIT1) 
+	if (sk->packets_out)
 	{
-
-		if (!sk->dead) 
-			sk->state_change(sk);
-		if (sk->rcv_ack_seq == sk->write_seq) 
+		if (flag & FLAG_DATA_ACKED)
 		{
-			sk->shutdown |= SEND_SHUTDOWN;
-			tcp_set_state(sk, TCP_FIN_WAIT2);
-			/* If the socket is dead, then there is no
-			 * user process hanging around using it.
-			 * We want to set up a FIN_WAIT2 timeout ala BSD.
-			 */
-			if (sk->dead)
-				tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_FIN_TIMEOUT);
+			long when;
+				
+			skb = skb_peek(&sk->write_queue);
+		
+			when = tp->rto - (jiffies - skb->when);
+		
+			if (when <= 0) 
+			{
+				tp->retrans_head = NULL;
+				/* 
+				 * This is tricky. We are retransmitting a 
+				 * segment of a window when congestion occurred.
+				 */
+				tcp_do_retransmit(sk, 0);
+				tcp_reset_xmit_timer(sk, TIME_RETRANS,
+						     tp->rto);
+			}
+			else 
+				tcp_reset_xmit_timer(sk, TIME_RETRANS, when);
 		}
 	}
+	else
+		tcp_clear_xmit_timer(sk, TIME_RETRANS);
+	
 
 	/*
-	 *	Incoming ACK to a FIN we sent in the case of a simultaneous close.
-	 *
-	 *	Move to TIME_WAIT
+	 *	Remember the highest ack received.
 	 */
+	 
+	tp->snd_una = ack;
+
+	tcp_fast_retrans(sk, ack, (flag & (FLAG_DATA|FLAG_WIN_UPDATE)));
 
-	if (sk->state == TCP_CLOSING) 
-	{
 
-		if (!sk->dead) 
-			sk->state_change(sk);
-		if (sk->rcv_ack_seq == sk->write_seq) 
-		{
-			tcp_time_wait(sk);
-		}
-	}
-	
-	/*
-	 *	Final ack of a three way shake 
-	 */
-	 
-	if (sk->state==TCP_SYN_RECV)
-	{
-		tcp_set_state(sk, TCP_ESTABLISHED);
-		tcp_options(sk,th);
-		sk->dummy_th.dest=th->source;
-		sk->copied_seq = sk->acked_seq;
-		if(!sk->dead)
-			sk->state_change(sk);
-		if(sk->max_window==0)
-		{
-			sk->max_window=32;	/* Sanity check */
-			sk->mss=min(sk->max_window,sk->mtu);
-		}
-		/* Reset the RTT estimator to the initial
-		 * state rather than testing to avoid
-		 * updating it on the ACK to the SYN packet.
-		 */
-		sk->rtt = 0;
-		sk->rto = TCP_TIMEOUT_INIT;
-		sk->mdev = TCP_TIMEOUT_INIT;
-	}
-	
 	/*
-	 * The following code has been greatly simplified from the
-	 * old hacked up stuff. The wonders of properly setting the
-	 * retransmission timeouts.
-	 *
-	 * If we are retransmitting, and we acked a packet on the retransmit
-	 * queue, and there is still something in the retransmit queue,
-	 * then we can output some retransmission packets.
+	 * Maybe we can take some stuff off of the write queue,
+	 * and put it onto the xmit queue.
 	 */
 
-	if (sk->send_head != NULL && (flag&2) && sk->retransmits)
-	{
-		tcp_do_retransmit(sk, 1);
-	}
 
 	return 1;
 
 uninteresting_ack:
+
+	tcp_fast_retrans(sk, ack, 0);
+
 	if(sk->debug)
-		printk("Ack ignored %u %u\n",ack,sk->sent_seq);
+		printk("Ack ignored %u %u\n",ack,tp->snd_nxt);
 			
-	/*
-	 *	Keepalive processing.
-	 */
-		 
-	if (after(ack, sk->sent_seq)) 
-	{
-		return 0;
-	}
-		
-	/*
-	 *	Restart the keepalive timer.
-	 */
-		 
-	if (sk->keepopen) 
-	{
-		if(sk->ip_xmit_timeout==TIME_KEEPOPEN)
-			tcp_reset_xmit_timer(sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
-	}
-	return 1;
+	return 0;
 }
 
 
@@ -1237,6 +805,8 @@
 {
 	sk->fin_seq = skb->end_seq;
 
+	tcp_send_ack(sk);
+
 	if (!sk->dead) 
 	{
 		sk->state_change(sk);
@@ -1249,10 +819,11 @@
 		case TCP_SYN_SENT:
 		case TCP_ESTABLISHED:
 			/*
-			 * move to CLOSE_WAIT, tcp_data() already handled
-			 * sending the ack.
+			 * move to CLOSE_WAIT
 			 */
-			tcp_set_state(sk,TCP_CLOSE_WAIT);
+
+			tcp_set_state(sk, TCP_CLOSE_WAIT);
+			
 			if (th->rst)
 				sk->shutdown = SHUTDOWN_MASK;
 			break;
@@ -1280,27 +851,11 @@
 			 * This causes a WRITE timeout, which will either
 			 * move on to TIME_WAIT when we timeout, or resend
 			 * the FIN properly (maybe we get rid of that annoying
-			 * FIN lost hang). The TIME_WRITE code is already correct
-			 * for handling this timeout.
+			 * FIN lost hang). The TIME_WRITE code is already 
+			 * correct for handling this timeout.
 			 */
 
-			if (sk->ip_xmit_timeout != TIME_WRITE) {
-				if (sk->send_head)
-					tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
-				else if (sk->ip_xmit_timeout != TIME_PROBE0
-				|| skb_queue_empty(&sk->write_queue)) {
-					/* BUG check case.
-					 * We have a problem here if there
-					 * is no timer running [leads to
-					 * frozen socket] or no data in the
-					 * write queue [means we sent a fin
-					 * and lost it from the queue before
-					 * changing the ack properly].
-					 */
-					printk(KERN_ERR "Lost timer or fin packet in tcp_fin.\n");
-				}
-			}
-			tcp_set_state(sk,TCP_CLOSING);
+			tcp_set_state(sk, TCP_CLOSING);
 			break;
 		case TCP_FIN_WAIT2:
 			/*
@@ -1326,156 +881,176 @@
 	return(0);
 }
 
-/*
- * Add a sk_buff to the TCP receive queue, calculating
- * the ACK sequence as we go..
- */
-static inline void tcp_insert_skb(struct sk_buff * skb, struct sk_buff_head * list)
-{
-	struct sk_buff * prev, * next;
-	u32 seq;
+
 
 	/*
-	 * Find where the new skb goes.. (This goes backwards,
-	 * on the assumption that we get the packets in order)
+	 * This one checks to see if we can put data from the
+	 * out_of_order queue into the receive_queue
 	 */
-	seq = skb->seq;
-	prev = list->prev;
-	next = (struct sk_buff *) list;
-	for (;;) {
-		if (prev == (struct sk_buff *) list || !after(prev->seq, seq))
+
+static __inline__ void  tcp_ofo_queue(struct sock *sk)
+{
+	struct sk_buff * skb;
+	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+
+	while ((skb = skb_peek(&sk->out_of_order_queue))) {
+		
+		if (after(skb->seq, tp->rcv_nxt))
 			break;
-		next = prev;
-		prev = prev->prev;
+
+		if (!after(skb->end_seq, tp->rcv_nxt)) {
+
+			if (sk->debug)
+				printk("ofo packet was allready received \n");
+
+			skb_unlink(skb);
+			kfree_skb(skb, FREE_READ);
+			
+			continue;
+		}
+
+		if (sk->debug) 
+			printk("ofo requeuing : rcv_next %X seq %X - %X\n", 
+			       tp->rcv_nxt, skb->seq, skb->end_seq);
+		
+		skb_unlink(skb);
+
+		 
+		skb_queue_tail(&sk->receive_queue, skb);
+
+
+		tp->rcv_nxt = skb->end_seq;
 	}
-	__skb_insert(skb, prev, next, list);
 }
 
-/*
- * Called for each packet when we find a new ACK endpoint sequence in it
- */
-static inline u32 tcp_queue_ack(struct sk_buff * skb, struct sock * sk)
+static __inline__ void	tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
+	struct sk_buff * skb1;
+	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
+
 	/*
-	 *	When we ack the fin, we do the FIN 
-	 *	processing.
+	 *  Queue data for delivery to the user
+	 *  Packets in sequence go to the receive queue
+	 *  Out of sequence packets to out_of_order_queue
 	 */
-	skb->acked = 1;
-	if (skb->h.th->fin)
-		tcp_fin(skb,sk,skb->h.th);
-	return skb->end_seq;
-}	
 
-static void tcp_queue(struct sk_buff * skb, struct sock * sk, struct tcphdr *th)
-{
-	u32 ack_seq;
 
-	tcp_insert_skb(skb, &sk->receive_queue);
+	if (skb->seq == tp->rcv_nxt) {
+
+		/*
+		 * Ok. In sequence.
+		 */
+		
+ 
+		skb_queue_tail(&sk->receive_queue, skb);
+
+
+		tp->rcv_nxt = skb->end_seq;
+
+		tcp_ofo_queue(sk);
+		
+		if (skb_queue_len(&sk->out_of_order_queue) == 0)
+			tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
 
+		return;
+	}
+	
 	/*
-	 * Did we get anything new to ack?
+	 *  Not in sequence
+	 *  either a retransmit or some packet got lost
 	 */
-	ack_seq = sk->acked_seq;
 
+	if (!after(skb->end_seq, tp->rcv_nxt)) {
+		
+		/* 
+		 * A retransmit.
+		 * 2nd most common case.
+		 * force an immediate ack
+		 */
 
-	if (!after(skb->seq, ack_seq)) {
-		if (after(skb->end_seq, ack_seq)) {
-			/* the packet straddles our window end */
-			struct sk_buff_head * list = &sk->receive_queue;
-			struct sk_buff * next;
-			ack_seq = tcp_queue_ack(skb, sk);
+		if (sk->debug) 
+			printk("retransmit received: seq %X\n", skb->seq);
 
-			/*
-			 * Do we have any old packets to ack that the above
-			 * made visible? (Go forward from skb)
-			 */
-			next = skb->next;
-			while (next != (struct sk_buff *) list) {
-				if (after(next->seq, ack_seq))
-					break;
-				if (after(next->end_seq, ack_seq))
-					ack_seq = tcp_queue_ack(next, sk);
-				next = next->next;
-			}
+		sk->delayed_acks = MAX_DELAY_ACK;
+		kfree_skb(skb, FREE_READ);
 
-			/*
-			 * Ok, we found new data, update acked_seq as
-			 * necessary (and possibly send the actual
-			 * ACK packet).
-			 */
-			sk->acked_seq = ack_seq;
+		return;
+	}
 
-		} else {
-			if (sk->debug)
-				printk("Ack duplicate packet.\n");
-			tcp_send_ack(sk);
-			return;
-		}
 
+	if (before(skb->seq, tp->rcv_nxt)) {
 
 		/*
-		 * Delay the ack if possible.  Send ack's to
-		 * fin frames immediately as there shouldn't be
-		 * anything more to come.
+		 * Partial packet
+		 * seq < rcv_next < end_seq
 		 */
-		if (!sk->delay_acks || th->fin) {
-			tcp_send_ack(sk);
-		} else {
-			/*
-			 * If psh is set we assume it's an
-			 * interactive session that wants quick
-			 * acks to avoid nagling too much. 
-			 */
-			int delay = HZ/2;
-			if (th->psh)
-				delay = HZ/50;
-			tcp_send_delayed_ack(sk, delay, sk->ato);
-		}
 
-		/*
-		 *	Tell the user we have some more data.
-		 */
+		if (sk->debug) 
+			printk("partial packet: rcv_next %X seq %X - %X\n", 
+			       tp->rcv_nxt, skb->seq, skb->end_seq);
+		
+		skb_queue_tail(&sk->receive_queue, skb);
 
-		if (!sk->dead)
-			sk->data_ready(sk,0);
 
-	}
-	else
-	{
-	    /*
-	     *	If we've missed a packet, send an ack.
-	     *	Also start a timer to send another.
-	     *
-	     *	4.3reno machines look for these kind of acks so
-	     *	they can do fast recovery. Three identical 'old'
-	     *	acks lets it know that one frame has been lost
-	     *      and should be resent. Because this is before the
-	     *	whole window of data has timed out it can take
-	     *	one lost frame per window without stalling.
-	     *	[See Jacobson RFC1323, Stevens TCP/IP illus vol2]
-	     *
-	     *	We also should be spotting triple bad sequences.
-	     *	[We now do this.]
-	     *
-	     */
-	     
-	    if (!skb->acked) 
-	    {
-		    if(sk->debug)
-			    printk("Ack past end of seq packet.\n");
-		    tcp_send_ack(sk);
-		    /*
-		     * We need to be very careful here. We must
-		     * not violate Jacobsons packet conservation condition.
-		     * This means we should only send an ACK when a packet
-		     * leaves the network. We can say a packet left the
-		     * network when we see a packet leave the network, or
-		     * when an rto measure expires.
-		     */
-		    tcp_send_delayed_ack(sk,sk->rto,sk->rto);
-	    }
-	}
-}
+		tp->rcv_nxt = skb->end_seq;
+
+		tcp_ofo_queue(sk);
+
+		if (skb_queue_len(&sk->out_of_order_queue) == 0)
+			tp->pred_flags = htonl((0x5010 << 16) | tp->snd_wnd);
+
+		return;		
+	}
+
+	/* 
+	 * Ok. This is an out_of_order segment 
+	 */
+	
+	/* Force an ack */
+	
+	sk->delayed_acks = MAX_DELAY_ACK;
+
+	/*
+	 *	disable header prediction
+	 */
+
+	tp->pred_flags = 0;
+
+	if (sk->debug) 
+		printk("out of order segment: rcv_next %X seq %X - %X\n", 
+		       tp->rcv_nxt, skb->seq, skb->end_seq);
+
+	if (skb_peek(&sk->out_of_order_queue) == NULL) {
+		skb_queue_head(&sk->out_of_order_queue,skb);
+	}
+	else 
+		for(skb1=sk->out_of_order_queue.prev; ; skb1 = skb1->prev) {
+
+			/* already there */
+			if (skb->seq==skb1->seq && skb->len>=skb1->len)
+			{
+ 				skb_append(skb1,skb);
+ 				skb_unlink(skb1);
+ 				kfree_skb(skb1,FREE_READ);
+				break;
+			}
+			
+			if (after(skb->seq, skb1->seq))
+			{
+				skb_append(skb1,skb);
+				break;
+			}
+			
+                        /*
+			 *	See if we've hit the start. If so insert.
+			 */
+			if (skb1 == skb_peek(&sk->out_of_order_queue)) {
+				skb_queue_head(&sk->out_of_order_queue,skb);
+				break;
+			}
+		}
+			
+}
 
 
 /*
@@ -1484,117 +1059,124 @@
  *	room, then we will just have to discard the packet.
  */
 
-static int tcp_data(struct sk_buff *skb, struct sock *sk, 
-	 unsigned long saddr, unsigned int len)
+static int tcp_data(struct sk_buff *skb, struct sock *sk, unsigned int len)
 {
 	struct tcphdr *th;
-	u32 new_seq, shut_seq;
+	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
 
 	th = skb->h.th;
 	skb_pull(skb,th->doff*4);
 	skb_trim(skb,len-(th->doff*4));
 
+        if (skb->len == 0 && !th->fin)
+        {
+		return(0);
+        }
+
 	/*
-	 *	The bytes in the receive read/assembly queue has increased. Needed for the
-	 *	low memory discard algorithm 
+	 *	FIXME: don't accept data after the received fin
+	 */
+
+	/*
+	 *	The bytes in the receive read/assembly queue has increased. 
+	 *	Needed for the low memory discard algorithm 
 	 */
 	   
 	sk->bytes_rcv += skb->len;
-	
-	if (skb->len == 0 && !th->fin) 
+		
+	/*
+	 *	We no longer have anyone receiving data on this connection.
+	 */
+
+	tcp_data_queue(sk, skb);
+
+	if (before(tp->rcv_nxt, sk->copied_seq)) 
 	{
-		/* 
-		 *	Don't want to keep passing ack's back and forth. 
-		 *	(someone sent us dataless, boring frame)
-		 */
-		if (!th->ack)
-			tcp_send_ack(sk);
-		kfree_skb(skb, FREE_READ);
-		return(0);
+		printk("*** tcp.c:tcp_data bug acked < copied\n");
+		tp->rcv_nxt = sk->copied_seq;
 	}
 
+	sk->delayed_acks++;
+	
 
 	/*
-	 *	We no longer have anyone receiving data on this connection.
+	 *	Now tell the user we may have some data. 
 	 */
+	 
+	if (!sk->dead) 
+	{
+        	if(sk->debug)
+        		printk("Data wakeup.\n");
+		sk->data_ready(sk,0);
+	} 
+	return(1);
+}
 
-#ifndef TCP_DONT_RST_SHUTDOWN		 
+static void tcp_data_snd_check(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct tcp_opt *tp=&(sk->tp_pinfo.af_tcp);
 
-	if(sk->shutdown & RCV_SHUTDOWN)
+	if ((skb = tp->send_head)) 
 	{
-		/*
-		 *	FIXME: BSD has some magic to avoid sending resets to
-		 *	broken 4.2 BSD keepalives. Much to my surprise a few non
-		 *	BSD stacks still have broken keepalives so we want to
-		 *	cope with it.
-		 */
-
-		if(skb->len)	/* We don't care if it's just an ack or
-				   a keepalive/window probe */
+		if (!after(skb->end_seq, tp->snd_una + tp->snd_wnd) &&
+		    sk->packets_out < sk->cong_window )
 		{
-			new_seq = skb->seq + skb->len + th->syn;	/* Right edge of _data_ part of frame */
-			
-			/* Do this the way 4.4BSD treats it. Not what I'd
-			   regard as the meaning of the spec but it's what BSD
-			   does and clearly they know everything 8) */
-
 			/*
-			 *	This is valid because of two things
-			 *
-			 *	a) The way tcp_data behaves at the bottom.
-			 *	b) A fin takes effect when read not when received.
+			 *	Add more data to the send queue.
 			 */
-			 
-			shut_seq = sk->acked_seq+1;	/* Last byte */
-			
-			if(after(new_seq,shut_seq))
-			{
-				if(sk->debug)
-					printk("Data arrived on %p after close [Data right edge %X, Socket shut on %X] %d\n",
-						sk, new_seq, shut_seq, sk->blog);
-				if(sk->dead)
-				{
-					sk->acked_seq = new_seq + th->fin;
-					tcp_send_reset(sk->saddr, sk->daddr, skb->h.th,
-						sk->prot, NULL, skb->dev, sk->ip_tos, sk->ip_ttl);
-					tcp_statistics.TcpEstabResets++;
-					sk->err = EPIPE;
-					sk->error_report(sk);
-					sk->shutdown = SHUTDOWN_MASK;
-					tcp_set_state(sk,TCP_CLOSE);
-					kfree_skb(skb, FREE_READ);
-					return 0;
-				}
-			}
+
+			tcp_write_xmit(sk);
+			wake_up_interruptible(sk->sleep);
 		}
+		else if (sk->packets_out == 0 && !tp->pending)
+ 		{
+ 			/*
+ 			 *	Data to queue but no room.
+ 			 */
+ 			tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
+ 		}		
 	}
+}	
 
-#endif
-
+static __inline__ void tcp_ack_snd_check(struct sock *sk)
+{
 	/*
-  	 * We should only call this if there is data in the frame.
- 	 */
-	tcp_delack_estimator(sk);
+	 *	This also takes care of updating the window.
+	 *	This if statement needs to be simplified.
+	 *
+	 *      rules for delaying an ack:
+	 *      - delay time <= 0.5 HZ
+	 *      - we don't have a window update to send
+	 *      - must send at least every 2 full sized packets
+	 */
 
-	tcp_queue(skb, sk, th);
+	if (sk->delayed_acks == 0)
+		return;
 
-	return(0);
+	if (sk->delayed_acks >= MAX_DELAY_ACK || tcp_raise_window(sk)) 
+	{
+		tcp_send_ack(sk);
+	}
+	else 
+	{	
+		tcp_send_delayed_ack(sk, HZ/2);		
+	}
 }
 
-
 /*
  *	This routine is only called when we have urgent data
  *	signalled. Its the 'slow' part of tcp_urg. It could be
  *	moved inline now as tcp_urg is only called from one
  *	place. We handle URGent data wrong. We have to - as
  *	BSD still doesn't use the correction from RFC961.
- *
  *	For 1003.1g we should support a new option TCP_STDURG to permit
  *	either form.
  */
  
 static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
 {
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	u32 ptr = ntohs(th->urg_ptr);
 
 	if (ptr)
@@ -1628,6 +1210,9 @@
 		sk->copied_seq++;	/* Move the copied sequence on correctly */
 	sk->urg_data = URG_NOTYET;
 	sk->urg_seq = ptr;
+
+	/* disable header prediction */
+	tp->pred_flags = 0;
 }
 
 /*
@@ -1662,429 +1247,430 @@
 	}
 }
 
-/*
- * This should be a bit smarter and remove partially
- * overlapping stuff too, but this should be good
- * enough for any even remotely normal case (and the
- * worst that can happen is that we have a few
- * unnecessary packets in the receive queue).
- *
- * This function is never called with an empty list..
- */
-static inline void tcp_remove_dups(struct sk_buff_head * list)
-{
-	struct sk_buff * next = list->next;
-
-	for (;;) {
-		struct sk_buff * skb = next;
-		next = next->next;
-		if (next == (struct sk_buff *) list)
-			break;
-		if (before(next->end_seq, skb->end_seq)) {
-			__skb_unlink(next, list);
-			kfree_skb(next, FREE_READ);
-			next = skb;
-			continue;
-		}
-		if (next->seq != skb->seq)
-			continue;
-		__skb_unlink(skb, list);
-		kfree_skb(skb, FREE_READ);
-	}
-}
 
-/*
- * Throw out all unnecessary packets: we've gone over the
- * receive queue limit. This shouldn't happen in a normal
- * TCP connection, but we might have gotten duplicates etc.
- */
-static void prune_queue(struct sk_buff_head * list)
+static __inline__ void prune_queue(struct sock *sk)
 {
-	for (;;) {
-		struct sk_buff * skb = list->prev;
+	struct sk_buff * skb;
 
-		/* gone through it all? */
-		if (skb == (struct sk_buff *) list)
-			break;
-		if (!skb->acked) {
-			__skb_unlink(skb, list);
-			kfree_skb(skb, FREE_READ);
-			continue;
-		}
-		tcp_remove_dups(list);
-		break;
+	/*
+	 *	clean the out_of_order queue
+	 */
+
+	while ((skb = skb_dequeue(&sk->out_of_order_queue))) 
+	{
+		kfree_skb(skb, FREE_READ);
 	}
 }
 
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-/*
- *	Check whether a received TCP packet might be for one of our
- *	connections.
- */
 
-int tcp_chkaddr(struct sk_buff *skb)
+void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			 struct tcphdr *th, __u16 len)
 {
-	struct iphdr *iph = skb->h.iph;
-	struct tcphdr *th = (struct tcphdr *)(skb->h.raw + iph->ihl*4);
-	struct sock *sk;
-
-	sk = get_sock(&tcp_prot, th->dest, iph->saddr, th->source, iph->daddr, 0, 0);
-
-	if (!sk) return 0;
-	/* 0 means accept all LOCAL addresses here, not all the world... */
-	if (sk->rcv_saddr == 0) return 0;
-	return 1;
-}
-#endif
-
-/*
- *	A TCP packet has arrived.
- *		skb->h.raw is the TCP header.
- */
- 
-int tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
-	__u32 daddr, unsigned short len,
-	__u32 saddr, int redo, struct inet_protocol * protocol)
-{
-	struct tcphdr *th;
-	struct sock *sk;
-	__u32 seq;
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-	int r;
-#endif
+	struct tcp_opt *tp;
+	int queued = 0;
+	u32 flg;
+	
+	/*
+	 *	Header prediction.
+	 *	The code follows the one in the famous 
+	 *	"30 instruction TCP receive" Van Jacobson mail.
+	 *	
+	 *	Van's trick is to deposit buffers into socket queue 
+	 *	on a device interrupt, to call tcp_recv function
+	 *	on the receive process context and checksum and copy
+	 *	the buffer to user space. smart...
+	 *
+	 *	Our current scheme is not silly either but we take the 
+	 *	extra cost of the net_bh soft interrupt processing...
+	 *	We do checksum and copy also but from device to kernel.
+	 */
 
+	tp = &(sk->tp_pinfo.af_tcp); 
+	flg = *(((u32 *)th) + 3);
+		
 	/*
-	 * "redo" is 1 if we have already seen this skb but couldn't
-	 * use it at that time (the socket was locked).  In that case
-	 * we have already done a lot of the work (looked up the socket
-	 * etc).
+	 *	pred_flags is 0x5?10 << 16 + snd_wnd
+	 *	if header prediction is to be made
+	 *	? will be 0 else it will be !0
+	 *	(when there are holes in the receive 
+	 *	 space for instance)
 	 */
-	th = skb->h.th;
-	sk = skb->sk;
-	if (!redo) {
-		tcp_statistics.TcpInSegs++;
-		if (skb->pkt_type!=PACKET_HOST)
-			goto discard_it;
 
-		/*
-		 *	Pull up the IP header.
-		 */
-	
-		skb_pull(skb, skb->h.raw-skb->data);
+	if (flg == tp->pred_flags && skb->seq == tp->rcv_nxt)
+	{
+		if (len <= sizeof(struct tcphdr))
+		{
+			if (len == sizeof(struct tcphdr))
+			{
+				tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+			}
 
-		/*
-		 *	Try to use the device checksum if provided.
-		 */
-		switch (skb->ip_summed) 
+			tcp_data_snd_check(sk);
+
+			kfree_skb(skb, FREE_READ);
+			return;
+			
+		}
+		else if (skb->ack_seq == tp->snd_una)
 		{
-			case CHECKSUM_NONE:
-				skb->csum = csum_partial((char *)th, len, 0);
-			case CHECKSUM_HW:
-				if (tcp_check(th, len, saddr, daddr, skb->csum))
-					goto discard_it;
-			default:
-				/* CHECKSUM_UNNECESSARY */
-		}
-		sk = get_tcp_sock(saddr, th->source, daddr, th->dest, dev->pa_addr, skb->redirport);
-		if (!sk)
-			goto no_tcp_socket;
-		skb->sk = sk;
-		skb->seq = ntohl(th->seq);
-		skb->end_seq = skb->seq + th->syn + th->fin + len - th->doff*4;
-		skb->ack_seq = ntohl(th->ack_seq);
-
-		skb->acked = 0;
-		skb->used = 0;
-		skb->free = 1;
-		skb->saddr = daddr;
-		skb->daddr = saddr;
+			/* 
+			 * Bulk data transfer: receiver 
+			 */
+			
+			skb_pull(skb,sizeof(struct tcphdr));
+			
+			skb_queue_tail(&sk->receive_queue, skb);
+			tp->rcv_nxt = skb->end_seq;
+			sk->bytes_rcv += len - sizeof(struct tcphdr);
+			
+			sk->data_ready(sk, 0);
+			tcp_delack_estimator(tp);
 
-		/*
-		 * We may need to add it to the backlog here. 
-		 */
-		if (sk->users) 
+			if (sk->delayed_acks++)
+			{
+				tcp_send_delayed_ack(sk, HZ/2);
+			}
+			else
+				tcp_send_ack(sk);
+
+			return;
+		}
+	}
+
+	if (!tcp_sequence(tp, skb->seq, skb->end_seq))
+	{
+		if (!th->rst)
 		{
-			__skb_queue_tail(&sk->back_log, skb);
-			return(0);
+			if (after(skb->seq, tp->rcv_nxt))
+			{
+				printk(KERN_DEBUG "->seq:%d end:%d "
+				       "wup:%d wnd:%d\n",
+				       skb->seq, skb->end_seq, 
+				       tp->rcv_wup, tp->rcv_wnd);
+			}
+			tcp_send_ack(sk);
+			kfree_skb(skb, FREE_READ);
+			return;
 		}
 	}
 
+	if(th->syn && skb->seq != sk->syn_seq)
+	{
+		printk(KERN_DEBUG "syn in established state\n");
+		tcp_reset(sk, skb);
+		kfree_skb(skb, FREE_READ);
+		return;
+	}
+	
+	if(th->rst)
+	{
+		tcp_reset(sk,skb);
+		kfree_skb(skb, FREE_READ);
+		return;
+	}
+	
+	if(th->ack)
+	{
+		tcp_ack(sk, th, skb->seq, skb->ack_seq, len);
+	}
+
+	
 	/*
-	 *	If this socket has got a reset it's to all intents and purposes 
-	 *	really dead. Count closed sockets as dead.
-	 *
-	 *	Note: BSD appears to have a bug here. A 'closed' TCP in BSD
-	 *	simply drops data. This seems incorrect as a 'closed' TCP doesn't
-	 *	exist so should cause resets as if the port was unreachable.
+	 *	Process urgent data
 	 */
 
-	if (sk->zapped || sk->state==TCP_CLOSE)
-		goto no_tcp_socket;
+	tcp_urg(sk, th, len);
 
-	if (!sk->prot) 
+	/*
+	 *	step 7: process the segment text
+	 */
+
+
+	queued = tcp_data(skb, sk, len);
+
+	/*
+	 *	step 8: check the FIN bit
+	 */
+
+	if (th->fin)
 	{
-		printk(KERN_CRIT "IMPOSSIBLE 3\n");
-		return(0);
+		tcp_fin(skb, sk, th);
 	}
 
+	tcp_data_snd_check(sk);
+	tcp_ack_snd_check(sk);
 
 	/*
-	 *	Charge the memory to the socket. 
+	 *	If our receive queue has grown past its limits,
+	 *	try to prune away duplicates etc..
 	 */
-	 
-	skb->sk=sk;
-	atomic_add(skb->truesize, &sk->rmem_alloc);
+	if (sk->rmem_alloc > sk->rcvbuf)
+		prune_queue(sk);
 
 	/*
-	 * Mark the time of the last received packet.
-	 */
-	sk->idletime = jiffies;
+	 *	And done
+	 */	
 	
+	if (queued)
+		return;
+
+	kfree_skb(skb, FREE_READ);
+}
+		
+
+/*
+ *	This function implements the receiving procedure of RFC 793.
+ *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ *	address independent.
+ */
+	
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+			  struct tcphdr *th, void *opt, __u16 len)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int queued = 0;
+	int rcv_mss;
+
 	/*
-	 *	We should now do header prediction.
-	 */
-	 
-	/*
-	 *	This basically follows the flow suggested by RFC793, with the corrections in RFC1122. We
-	 *	don't implement precedence and we process URG incorrectly (deliberately so) for BSD bug
-	 *	compatibility. We also set up variables more thoroughly [Karn notes in the
-	 *	KA9Q code the RFC793 incoming segment rules don't initialise the variables for all paths].
+	 *	state == CLOSED
+	 *	tested in tcp_v{4,6}_rcv
 	 */
 
-	if(sk->state!=TCP_ESTABLISHED)		/* Skip this lot for normal flow */
-	{
-	
-		/*
-		 *	Now deal with unusual cases.
+	switch (sk->state) {
+
+
+	case TCP_LISTEN:
+		
+		if (th->rst)			
+			goto discard;
+
+		/* 
+		 * These use the socket TOS.. 
+		 * might want to be the received TOS 
 		 */
-	 
-		if(sk->state==TCP_LISTEN)
-		{
-			if(th->ack)	/* These use the socket TOS.. might want to be the received TOS */
-				tcp_send_reset(daddr,saddr,th,sk->prot,opt,dev,sk->ip_tos, sk->ip_ttl);
 
+		if(th->ack)
+		{	
 			/*
-			 *	We don't care for RST, and non SYN are absorbed (old segments)
-			 *	Broadcast/multicast SYN isn't allowed. Note - bug if you change the
-			 *	netmask on a running connection it can go broadcast. Even Sun's have
-			 *	this problem so I'm ignoring it 
+			 *  send reset
 			 */
-			   
-#ifdef CONFIG_IP_TRANSPARENT_PROXY
-			/*
-			 * We may get non-local addresses and still want to
-			 * handle them locally, due to transparent proxying.
-			 * Thus, narrow down the test to what is really meant.
-			 */
-			if(th->rst || !th->syn || th->ack || (r = ip_chk_addr(daddr)) == IS_BROADCAST || r == IS_MULTICAST)
-#else
-			if(th->rst || !th->syn || th->ack || ip_chk_addr(daddr)!=IS_MYADDR)
-#endif
-			{
-				kfree_skb(skb, FREE_READ);
-				return 0;
-			}
+
+			return 1;
+		}
 		
-			/*	
-			 *	Guess we need to make a new socket up
-			 */
-			seq = secure_tcp_sequence_number(saddr, daddr,
-							 skb->h.th->dest,
-							 skb->h.th->source);
-			tcp_conn_request(sk, skb, daddr, saddr, opt, dev, seq);
 		
-			/*
-			 *	Now we have several options: In theory there is nothing else
-			 *	in the frame. KA9Q has an option to send data with the syn,
-			 *	BSD accepts data with the syn up to the [to be] advertised window
-			 *	and Solaris 2.1 gives you a protocol error. For now we just ignore
-			 *	it, that fits the spec precisely and avoids incompatibilities. It
-			 *	would be nice in future to drop through and process the data.
+		if(th->syn)
+		{
+			int err;
+			__u32 isn;
+
+			isn = tp->af_specific->init_sequence(sk, skb);
+			err = tp->af_specific->conn_request(sk, skb, opt, isn);
+
+			if (err < 0)
+				return 1;
+
+			/*
+			 *  Now we have several options: In theory there is 
+			 *  nothing else in the frame. KA9Q has an option to 
+			 *  send data with the syn, BSD accepts data with the
+			 *  syn up to the [to be] advertised window and 
+			 *  Solaris 2.1 gives you a protocol error. For now 
+			 *  we just ignore it, that fits the spec precisely 
+			 *  and avoids incompatibilities. It would be nice in
+			 *  future to drop through and process the data.
 			 *
-			 *	Now TTCP is starting to use we ought to queue this data.
+			 *  Now that TTCP is starting to be used we ought to 
+			 *  queue this data.
 			 */
-			 
-			return 0;
-		}
-	
-		/* 
-		 *	Retransmitted SYN for our socket. This is uninteresting. If sk->state==TCP_LISTEN
-		 *	then it's a new connection
-		 */
-		 
-		if (sk->state == TCP_SYN_RECV && th->syn && skb->seq+1 == sk->acked_seq)
-		{
-			kfree_skb(skb, FREE_READ);
+
 			return 0;
 		}
 		
+		goto discard;
+		break;
+
+	case TCP_SYN_SENT:
+		
 		/*
-		 *	SYN sent means we have to look for a suitable ack and either reset
-		 *	for bad matches or go to connected. The SYN_SENT case is unusual and should
+		 *	SYN sent means we have to look for a suitable ack and 
+		 *	either reset for bad matches or go to connected. 
+		 *	The SYN_SENT case is unusual and should
 		 *	not be in line code. [AC]
 		 */
 	   
-		if(sk->state==TCP_SYN_SENT)
+		if(th->ack)
 		{
-			/* Crossed SYN or previous junk segment */
-			if(th->ack)
+			/* We got an ack, but it's not a good ack */
+			if(!tcp_ack(sk,th, skb->seq, skb->ack_seq, len))
 			{
-				/* We got an ack, but it's not a good ack.
-				 * We used to test this with a call to tcp_ack,
-				 * but this loses, because it takes the SYN
-				 * packet out of the send queue, even if
-				 * the ACK doesn't have the SYN bit sent, and
-				 * therefore isn't the one we are waiting for.
-				 */
-				if (after(skb->ack_seq, sk->sent_seq) || before(skb->ack_seq, sk->rcv_ack_seq))
-				{
-					/* Reset the ack - it's an ack from a 
-					   different connection  [ th->rst is checked in tcp_send_reset()] */
-					tcp_statistics.TcpAttemptFails++;
-					tcp_send_reset(daddr, saddr, th,
-						sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
-					kfree_skb(skb, FREE_READ);
-					return(0);
-				}
-				if(th->rst)
-					return tcp_reset(sk,skb);
-				if(!th->syn)
-				{
-					/* A valid ack from a different connection
-					   start. Shouldn't happen but cover it */
-	         			tcp_statistics.TcpAttemptFails++;
-	                                tcp_send_reset(daddr, saddr, th,
-	                                        sk->prot, opt,dev,sk->ip_tos,sk->ip_ttl);
-					kfree_skb(skb, FREE_READ);
-					return 0;
-				}
-
-				/* process the ACK, get the SYN packet out
-				 * of the send queue, do other initial
-				 * processing stuff. [We know it's good, and
-				 * we know it's the SYN,ACK we want.]
-				 */
-				tcp_ack(sk,th,skb->ack_seq,len);
-
+				tcp_statistics.TcpAttemptFails++;
+				return 1;
+			}
 
-				/*
-				 *	Ok.. it's good. Set up sequence numbers and
-				 *	move to established.
-				 */
-				sk->acked_seq = skb->seq+1;
-				sk->lastwin_seq = skb->seq+1;
-				sk->fin_seq = skb->seq;
-				tcp_send_ack(sk);
-				tcp_set_state(sk, TCP_ESTABLISHED);
-				tcp_options(sk,th);
-				sk->dummy_th.dest=th->source;
-				sk->copied_seq = sk->acked_seq;
-				if(!sk->dead)
-				{
-					sk->state_change(sk);
-					sock_wake_async(sk->socket, 0);
-				}
-				if(sk->max_window==0)
-				{
-					sk->max_window = 32;
-					sk->mss = min(sk->max_window, sk->mtu);
-				}
-				/* Reset the RTT estimator to the initial
-				 * state rather than testing to avoid
-				 * updating it on the ACK to the SYN packet.
-				 */
-				sk->rtt = 0;
-				sk->rto = TCP_TIMEOUT_INIT;
-				sk->mdev = TCP_TIMEOUT_INIT;
+			if(th->rst)
+			{
+				tcp_reset(sk,skb);
+				goto discard;
 			}
-			else
+
+			if(!th->syn)
 			{
-				/* See if SYN's cross. Drop if boring */
-				if(th->syn && !th->rst)
-				{
-					/* Crossed SYN's are fine - but talking to
-					   yourself is right out... */
-					if(sk->saddr==saddr && sk->daddr==daddr &&
-						sk->dummy_th.source==th->source &&
-						sk->dummy_th.dest==th->dest)
-					{
-						tcp_statistics.TcpAttemptFails++;
-						return tcp_reset(sk,skb);
-					}
-					tcp_set_state(sk,TCP_SYN_RECV);
-					
-					/*
-					 *	FIXME:
-					 *	Must send SYN|ACK here
-					 */
-				}		
-				/* Discard junk segment */
-				kfree_skb(skb, FREE_READ);
-				return 0;
+				/* 
+				 *  A valid ack from a different connection
+				 *  start. Shouldn't happen but cover it 
+				 */
+				tcp_statistics.TcpAttemptFails++;
+				return 1;
 			}
+
 			/*
-			 *	SYN_RECV with data maybe.. drop through
+			 *	Ok.. it's good. Set up sequence 
+			 *	numbers and
+			 *	move to established.
 			 */
-			goto rfc_step6;
-		}
 
-	/*
-	 *	BSD has a funny hack with TIME_WAIT and fast reuse of a port. There is
-	 *	a more complex suggestion for fixing these reuse issues in RFC1644
-	 *	but not yet ready for general use. Also see RFC1379.
-	 *
-	 *	Note the funny way we go back to the top of this function for
-	 *	this case ("goto try_next_socket").  That also takes care of
-	 *	checking "sk->users" for the new socket as well as doing all
-	 *	the normal tests on the packet.
-	 */
-	
-#define BSD_TIME_WAIT
-#ifdef BSD_TIME_WAIT
-		if (sk->state == TCP_TIME_WAIT && th->syn && sk->dead && 
-			after(skb->seq, sk->acked_seq) && !th->rst)
-		{
-			u32 seq = sk->write_seq;
-			if(sk->debug)
-				printk("Doing a BSD time wait\n");
-			tcp_statistics.TcpEstabResets++;	   
-			atomic_sub(skb->truesize, &sk->rmem_alloc);
-			skb->sk = NULL;
-			sk->err=ECONNRESET;
-			tcp_set_state(sk, TCP_CLOSE);
-			sk->shutdown = SHUTDOWN_MASK;
-			sk=get_sock(&tcp_prot, th->dest, saddr, th->source, daddr, dev->pa_addr, skb->redirport);
-			/* this is not really correct: we should check sk->users */
-			if (sk && sk->state==TCP_LISTEN)
-			{
-				skb->sk = sk;
-				atomic_add(skb->truesize, &sk->rmem_alloc);
-				tcp_conn_request(sk, skb, daddr, saddr,opt, dev,seq+128000);
-				return 0;
+			tp->rcv_nxt = skb->seq+1;
+			tp->rcv_wnd = 0;
+			tp->rcv_wup = skb->seq+1;
+
+			tp->snd_wnd = htons(th->window);
+			tp->snd_wl1 = skb->seq;
+			tp->snd_wl2 = skb->ack_seq;
+
+			sk->fin_seq = skb->seq;
+			tcp_send_ack(sk);
+
+			tcp_set_state(sk, TCP_ESTABLISHED);
+			rcv_mss = tcp_parse_options(th);
+			
+			if (rcv_mss == 0)
+			{
+				rcv_mss = 536;
 			}
-			kfree_skb(skb, FREE_READ);
+
+			sk->mss = min(sk->mss, rcv_mss);
+			
+			sk->dummy_th.dest = th->source;
+			sk->copied_seq = tp->rcv_nxt;
+
+			if(!sk->dead)
+			{
+				sk->state_change(sk);
+				sock_wake_async(sk->socket, 0);
+			}
+
+			/* Drop through step 6 */
+			goto step6;
+		}
+		else
+		{
+			if(th->syn && !th->rst)
+			{
+				/* 
+				 * the previous version of the code
+				 * checked for "connecting to self"
+				 * here. that check is done now in
+				 * tcp_connect
+				 */
+
+				tcp_set_state(sk, TCP_SYN_RECV);
+				
+				tp->rcv_nxt = skb->seq + 1;
+				tp->rcv_wup = skb->seq + 1;
+
+				tp->snd_wnd = htons(th->window);
+				tp->snd_wl1 = skb->seq;
+				
+				tcp_send_synack(sk);
+				goto discard;
+			}		
+
+		}
+		break;
+
+	case TCP_TIME_WAIT:
+	        /*
+		 *	RFC 1122:
+		 *	"When a connection is [...] on TIME-WAIT state [...]
+		 *	[a TCP] MAY accept a new SYN from the remote TCP to
+		 *	reopen the connection directly, if it:
+		 *	
+		 *	(1)  assigns its initial sequence number for the new
+		 *	connection to be larger than the largest sequence
+		 *	number it used on the previous connection incarnation,
+		 *	and
+		 *
+		 *	(2)  returns to TIME-WAIT state if the SYN turns out 
+		 *	to be an old duplicate".
+		 */
+
+		if (th->syn && !th->rst && after(skb->seq, tp->rcv_nxt))
+		{
+			__u32 isn;
+			int err;
+
+                        atomic_sub(skb->truesize, &sk->rmem_alloc);
+                        skb->sk = NULL;
+                        sk->err = ECONNRESET;
+                        tcp_set_state(sk, TCP_CLOSE);
+                        sk->shutdown = SHUTDOWN_MASK;
+
+			isn = tp->rcv_nxt + 128000;
+
+			sk = tp->af_specific->get_sock(skb, th);
+
+			if (sk == NULL)
+				goto discard;
+
+			skb->sk = sk;
+			tp = &sk->tp_pinfo.af_tcp;
+			atomic_add(skb->truesize, &sk->rmem_alloc);
+			
+			err = tp->af_specific->conn_request(sk, skb, opt, isn);
+
+			if (err < 0)
+				return 1;
+
 			return 0;
 		}
-#endif	
+
+		break;
+
 	}
 
 	/*
-	 *	We are now in normal data flow (see the step list in the RFC)
-	 *	Note most of these are inline now. I'll inline the lot when
-	 *	I have time to test it hard and look at what gcc outputs 
+	 *	step 1: check sequence number
 	 */
 
-	if (!tcp_sequence(sk, skb->seq, skb->end_seq-th->syn))
+	if (!tcp_sequence(tp, skb->seq, skb->end_seq))
 	{
-		bad_tcp_sequence(sk, th, skb->end_seq-th->syn, dev);
-		kfree_skb(skb, FREE_READ);
-		return 0;
+		if (!th->rst)
+		{
+			tcp_send_ack(sk);
+			goto discard;
+		}
 	}
 
+
+	/*
+	 *	step 2: check RST bit
+	 */
+
 	if(th->rst)
-		return tcp_reset(sk,skb);
-	
+	{
+		tcp_reset(sk,skb);
+		goto discard;
+	}
+
 	/*
+	 *	step 3: check security and precedence 
+	 *	[ignored]
+	 */
+
+	/*
+	 *	step 4:
+	 *
 	 *	Check for a SYN, and ensure it matches the SYN we were
 	 *	first sent. We have to handle the rather unusual (but valid)
 	 *	sequence that KA9Q derived products may generate of
@@ -2098,77 +1684,152 @@
 	 *	We keep syn_seq as the sequence space occupied by the 
 	 *	original syn. 
 	 */
-	 
-	if(th->syn && skb->seq!=sk->syn_seq)
+
+	if (th->syn && skb->seq!=sk->syn_seq)
 	{
-		tcp_send_reset(daddr,saddr,th, &tcp_prot, opt, dev, skb->ip_hdr->tos, 255);
-		return tcp_reset(sk,skb);	
+		tcp_reset(sk, skb);
+		return 1;
 	}
 
 	/*
-	 *	Process the ACK
+	 *	step 5: check the ACK field
 	 */
-	 
 
-	if(th->ack && !tcp_ack(sk,th,skb->ack_seq,len))
+	if (th->ack) 
 	{
-		/*
-		 *	Our three way handshake failed.
-		 */
-		 
-		if(sk->state==TCP_SYN_RECV)
-		{
-			tcp_send_reset(daddr, saddr, th,sk->prot, opt, dev,sk->ip_tos,sk->ip_ttl);
+		int acceptable = tcp_ack(sk,th,skb->seq, skb->ack_seq,len);
+		
+		switch(sk->state) {
+		case TCP_SYN_RECV:
+			if (acceptable)
+			{
+				tcp_set_state(sk, TCP_ESTABLISHED);
+				sk->dummy_th.dest=th->source;
+				sk->copied_seq = tp->rcv_nxt;
+
+				if(!sk->dead)
+					sk->state_change(sk);		
+
+				tp->snd_una = skb->ack_seq;
+				tp->snd_wnd = htons(th->window);
+				tp->snd_wl1 = skb->seq;
+				tp->snd_wl2 = skb->ack_seq;
+
+			}
+			else
+				return 1;
+			break;
+
+		case TCP_FIN_WAIT1:
+			
+			if (tp->snd_una == sk->write_seq) 
+			{
+				sk->shutdown |= SEND_SHUTDOWN;
+				tcp_set_state(sk, TCP_FIN_WAIT2);
+				if (!sk->dead) 
+					sk->state_change(sk);
+			}
+			break;
+
+		case TCP_CLOSING:			
+
+			if (tp->snd_una == sk->write_seq) 
+			{
+				tcp_time_wait(sk);
+				if (!sk->dead) 
+					sk->state_change(sk);
+			}
+			break;
+
+		case TCP_LAST_ACK:
+
+			if (tp->snd_una == sk->write_seq) 
+			{
+				sk->shutdown = SHUTDOWN_MASK;
+				tcp_set_state(sk,TCP_CLOSE);
+				if (!sk->dead)
+					sk->state_change(sk);
+				goto discard;
+			}
+			break;
+
+		case TCP_TIME_WAIT:
+			/*
+			 * keep us in TIME_WAIT until we stop getting 
+			 * packets, reset the timeout.
+			 */
+			tcp_reset_msl_timer(sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+			break;
+
 		}
-		kfree_skb(skb, FREE_READ);
-		return 0;
 	}
-	
-rfc_step6:		/* I'll clean this up later */
+	else
+		goto discard;
 
-	/*
-	 *	If the accepted buffer put us over our queue size we
-	 *	now drop it (we must process the ack first to avoid
-	 *	deadlock cases).
-	 */
+  step6:
 
 	/*
-	 *	Process urgent data
+	 *	step 6: check the URG bit
 	 */
-	 	
+
 	tcp_urg(sk, th, len);
-	
-	/*
-	 *	Process the encapsulated data
-	 */
-	
-	if(tcp_data(skb,sk, saddr, len))
-		kfree_skb(skb, FREE_READ);
 
 	/*
-	 *	If our receive queue has grown past its limits,
-	 *	try to prune away duplicates etc..
+	 *	step 7: process the segment text
 	 */
-	if (sk->rmem_alloc > sk->rcvbuf)
-		prune_queue(&sk->receive_queue);
 
-	/*
-	 *	And done
-	 */	
+	switch (sk->state) {
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+		if (!before(skb->seq, sk->fin_seq))
+			break;
 	
-	return 0;
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
 
-no_tcp_socket:
-	/*
-	 *	No such TCB. If th->rst is 0 send a reset (checked in tcp_send_reset)
-	 */
-	tcp_send_reset(daddr, saddr, th, &tcp_prot, opt,dev,skb->ip_hdr->tos,255);
+		/*
+		 *	RFC 793 says to queue data in this states,
+		 *	RFC 1122 says we MUST send a reset. 
+		 *	BSD 4.4 also does reset.
+		 */
+
+		if ((sk->shutdown & RCV_SHUTDOWN) && sk->dead)
+		{
+			if (after(skb->end_seq - th->fin, tp->rcv_nxt))
+			{
+				tcp_reset(sk, skb);
+				return 1;
+			}
+		}
+		
+	case TCP_ESTABLISHED:
+		queued = tcp_data(skb, sk, len);
+		break;		
+	}
 
-discard_it:
 	/*
-	 *	Discard frame
+	 *	step 8: check the FIN bit
 	 */
-	skb->sk = NULL;
+
+	if (th->fin)
+	{
+		tcp_fin(skb, sk, th);
+	}
+
+	tcp_data_snd_check(sk);
+	tcp_ack_snd_check(sk);
+
+	if (queued)
+		return 0;
+  discard:
+
 	kfree_skb(skb, FREE_READ);
 	return 0;
 }
+
+/*
+ * Local variables:
+ *  compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -fno-strength-reduce -pipe -m486 -DCPU=486 -c -o tcp_input.o tcp_input.c"
+ * c-file-style: "Linux"
+ * End:
+ */

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov