patch-2.1.8 linux/net/ipv4/tcp_timer.c

Next file: linux/net/ipv4/timer.c
Previous file: linux/net/ipv4/tcp_output.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.1.7/linux/net/ipv4/tcp_timer.c linux/net/ipv4/tcp_timer.c
@@ -18,132 +18,142 @@
  *		Matthew Dillon, <dillon@apollo.west.oic.com>
  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  *		Jorge Cwik, <jorge@laser.satlink.net>
- *
- * Fixes:
- *
- *		Eric Schenk	: Fix retransmission timeout counting.
  */
 
 #include <net/tcp.h>
 
-void tcp_delack_timer(unsigned long data)
+static void tcp_sltimer_handler(unsigned long);
+static void tcp_syn_recv_timer(unsigned long);
+static void tcp_keepalive(unsigned long data);
+
+struct timer_list	tcp_slow_timer = {
+	NULL, NULL,
+	0, 0,
+	tcp_sltimer_handler,
+};
+
+
+struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
+	{0, TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},		/* SYNACK	*/
+	{0, TCP_KEEPALIVE_PERIOD, 0, tcp_keepalive}		/* KEEPALIVE	*/
+};
+
+/*
+ * Using different timers for retransmit, delayed acks and probes.
+ * We may wish to use just one timer maintaining a list of expiry jiffies
+ * to optimize.
+ */
+
+void tcp_init_xmit_timers(struct sock *sk)
 {
-	tcp_send_ack((struct sock *) data);
+	init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
+	sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
+	sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
+	
+	init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
+	sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
+	sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
+
+	init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
+	sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
+	sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
 }
 
 /*
  *	Reset the retransmission timer
  */
  
-void tcp_reset_xmit_timer(struct sock *sk, int why, unsigned long when)
+void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long when)
 {
-	del_timer(&sk->retransmit_timer);
-	sk->ip_xmit_timeout = why;
-	if (why == TIME_WRITE) {
-		/* In this case we want to timeout on the first packet
-		 * in the resend queue. If the resend queue is empty,
-		 * then the packet we are sending hasn't made it there yet,
-		 * so we timeout from the current time.
-		 */
-		if (sk->send_head) {
-			sk->retransmit_timer.expires =
-				sk->send_head->when + when;
-		} else {
-			/* This should never happen!
-		 	 */
-			printk(KERN_ERR "Error: send_head NULL in xmit_timer\n");
-			sk->ip_xmit_timeout = 0;
-			return;
-		}
-	} else {
-		sk->retransmit_timer.expires = jiffies+when;
-	}
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	if (sk->retransmit_timer.expires < jiffies) {
-		/* We can get here if we reset the timer on an event
-		 * that could not fire because the interrupts were disabled.
-		 * make sure it happens soon.
-		 */
-		sk->retransmit_timer.expires = jiffies+2;
+	if((long)when <= 0)
+	{		
+		printk("xmit_timer <= 0 - timer:%d when:%lx\n", what, when);
+		when=HZ/50;
 	}
-	add_timer(&sk->retransmit_timer);
-}
 
-/*
- *	POLICY:
- *
- * 	This is the normal code called for timeouts.  It does the retransmission
- * 	and then does backoff.  tcp_do_retransmit is separated out because
- * 	tcp_ack needs to send stuff from the retransmit queue without
- * 	initiating a backoff.
- */
+	switch (what) {
+	case TIME_RETRANS:
+		/*
+		 * When setting the retransmit timer the probe timer
+		 * should not be set.
+		 * The delayed ack timer can be set if we are changing the
+		 * retransmit timer when removing acked frames.
+		 */
+		del_timer(&tp->probe_timer);
+		del_timer(&tp->retransmit_timer);
+		tp->retransmit_timer.expires=jiffies+when;
+		add_timer(&tp->retransmit_timer);
+		break;
 
+	case TIME_DACK:
+		del_timer(&tp->delack_timer);
+		tp->delack_timer.expires=jiffies+when;
+		add_timer(&tp->delack_timer);
+		break;
 
-static void tcp_retransmit_time(struct sock *sk, int all)
-{
-	/*
-	 * record how many times we've timed out.
-	 * This determines when we should quite trying.
-	 * This needs to be counted here, because we should not be
-	 * counting one per packet we send, but rather one per round
-	 * trip timeout.
-	 */
-	sk->retransmits++;
+	case TIME_PROBE0:
+		del_timer(&tp->probe_timer);
+		tp->probe_timer.expires=jiffies+when;
+		add_timer(&tp->probe_timer);
+		break;	
 
-	tcp_do_retransmit(sk, all);
+	case TIME_WRITE:
+		printk("bug: tcp_reset_xmit_timer TIME_WRITE\n");
+		break;
 
-	/*
-	 * Increase the timeout each time we retransmit.  Note that
-	 * we do not increase the rtt estimate.  rto is initialized
-	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
-	 * that doubling rto each time is the least we can get away with.
-	 * In KA9Q, Karn uses this for the first few times, and then
-	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
-	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
-	 * defined in the protocol as the maximum possible RTT.  I guess
-	 * we'll have to use something other than TCP to talk to the
-	 * University of Mars.
-	 *
-	 * PAWS allows us longer timeouts and large windows, so once
-	 * implemented ftp to mars will work nicely. We will have to fix
-	 * the 120 second clamps though!
-	 */
+	default:
+		printk("bug: unknown timer value\n");
+	}
+}
 
-	sk->backoff++;
-	sk->rto = min(sk->rto << 1, 120*HZ);
+void tcp_clear_xmit_timer(struct sock *sk, int what)
+{
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	/* be paranoid about the data structure... */
-	if (sk->send_head)
-		tcp_reset_xmit_timer(sk, TIME_WRITE, sk->rto);
-	else
-		printk(KERN_ERR "send_head NULL in tcp_retransmit_time\n");
+	switch (what) {
+	case TIME_RETRANS:
+		del_timer(&tp->retransmit_timer);
+		break;
+	case TIME_DACK:
+		del_timer(&tp->delack_timer);
+		break;
+	case TIME_PROBE0:
+		del_timer(&tp->probe_timer);
+		break;	
+	default:
+		printk("bug: unknown timer value\n");
+	}
 }
 
-/*
- *	POLICY:
- *		Congestion control.
- *
- *	A timer event has trigger a tcp retransmit timeout. The
- *	socket xmit queue is ready and set up to send. Because
- *	the ack receive code keeps the queue straight we do
- *	nothing clever here.
- */
-
-void tcp_retransmit(struct sock *sk, int all)
+int tcp_timer_is_set(struct sock *sk, int what)
 {
-	if (all) 
-	{
-		tcp_retransmit_time(sk, all);
-		return;
-	}
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
-	sk->ssthresh = sk->cong_window >> 1; /* remember window where we lost */
-	/* sk->ssthresh in theory can be zero.  I guess that's OK */
-	sk->cong_count = 0;
-	sk->cong_window = 1;
+	switch (what) {
+	case TIME_RETRANS:
+		return tp->retransmit_timer.next != NULL;
+		break;
+	case TIME_DACK:
+		return tp->delack_timer.next != NULL;
+		break;
+	case TIME_PROBE0:
+		return tp->probe_timer.next != NULL;
+		break;	
+	default:
+		printk("bug: unknown timer value\n");
+	}
+	return 0;
+}
 
-	/* Do the actual retransmit. */
-	tcp_retransmit_time(sk, all);
+void tcp_clear_xmit_timers(struct sock *sk)
+{	
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+	del_timer(&tp->retransmit_timer);
+	del_timer(&tp->delack_timer);
+	del_timer(&tp->probe_timer);
 }
 
 /*
@@ -175,8 +185,11 @@
 			sk->err=sk->err_soft;
 		else
 			sk->err=ETIMEDOUT;
+
+		printk(KERN_DEBUG "syn timeout\n");
+
 		sk->error_report(sk);
-		del_timer(&sk->retransmit_timer);
+		tcp_clear_xmit_timers(sk);
 		tcp_statistics.TcpAttemptFails++;	/* Is this right ??? - FIXME - */
 		tcp_set_state(sk,TCP_CLOSE);
 		/* Don't FIN, we got nothing back */
@@ -192,7 +205,9 @@
 		else
 			sk->err = ETIMEDOUT;
 		sk->error_report(sk);
-		del_timer(&sk->retransmit_timer);
+
+		tcp_clear_xmit_timers(sk);
+
 		/*
 		 *	Time wait the socket 
 		 */
@@ -213,19 +228,147 @@
 	return 1;
 }
 
-/*
- *	It could be we got here because we needed to send an ack,
- *	so we need to check for that and not just normal retransmit.
- */
-static void tcp_time_write_timeout(struct sock * sk)
-{
+
+void tcp_delack_timer(unsigned long data) {
+
+	struct sock *sk = (struct sock*)data;
+
+	if(sk->zapped)
+	{
+		return;
+	}
+	
+	if (sk->delayed_acks)
+	{
+		tcp_read_wakeup(sk); 		
+	}
+}
+
+void tcp_probe_timer(unsigned long data) {
+
+	struct sock *sk = (struct sock*)data;
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+	if(sk->zapped) 
+	{		
+		return;
+	}
+	
+	if (sk->users) 
+	{
+		/* 
+		 * Try again in a second
+		 */
+
+		tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ);
+		return;
+	}
+
 	/*
-	 *	Retransmission
+	 *	*WARNING* RFC 1122 forbids this
+	 *	FIXME: We ought not to do it; Solaris 2.5 actually lists fixing
+	 *	this behaviour as a bug fix. [AC]
 	 */
-	sk->prot->retransmit (sk, 0);
-	tcp_write_timeout(sk);
+	if (tp->probes_out > TCP_RETR2) 
+	{
+		if(sk->err_soft)
+			sk->err = sk->err_soft;
+		else
+			sk->err = ETIMEDOUT;
+		sk->error_report(sk);
+
+		/*
+		 *	Time wait the socket 
+		 */
+		if (sk->state == TCP_FIN_WAIT1 || sk->state == TCP_FIN_WAIT2 
+		    || sk->state == TCP_CLOSING ) 
+		{
+			tcp_set_state(sk, TCP_TIME_WAIT);
+			tcp_reset_msl_timer (sk, TIME_CLOSE, TCP_TIMEWAIT_LEN);
+		}
+		else
+		{
+			/*
+			 *	Clean up time.
+			 */
+			tcp_set_state(sk, TCP_CLOSE);
+		}
+	}
+	
+	tcp_send_probe0(sk);
 }
 
+static __inline__ int tcp_keepopen_proc(struct sock *sk)
+{
+	int res = 0;
+
+	if (sk->state == TCP_ESTABLISHED || sk->state == TCP_CLOSE_WAIT)
+	{
+		struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+		__u32 elapsed = jiffies - tp->rcv_tstamp;
+
+		if (elapsed >= TCP_KEEPALIVE_TIME)
+		{
+			if (tp->probes_out > TCP_KEEPALIVE_PROBES)
+			{
+				if(sk->err_soft)
+					sk->err = sk->err_soft;
+				else
+					sk->err = ETIMEDOUT;
+
+				tcp_set_state(sk, TCP_CLOSE);
+			}
+			else
+			{
+				tp->probes_out++;
+				tp->pending = TIME_KEEPOPEN;
+				tcp_write_wakeup(sk);
+				res = 1;
+			}
+		}
+	}
+	return res;
+}
+
+/*
+ *	Check all sockets for keepalive timer
+ *	Called every 75 seconds
+ *	This timer is started by af_inet init routine and is constantly
+ *	running.
+ *
+ *	It might be better to maintain a count of sockets that need it using
+ *	setsockopt/tcp_destroy_sk and only set the timer when needed.
+ */
+
+/*
+ *	don't send over 5 keepopens at a time to avoid burstiness 
+ *	on big servers [AC]
+ */
+#define MAX_KA_PROBES	5
+
+static void tcp_keepalive(unsigned long data)
+{
+	struct sock *sk;
+	int count = 0;
+	int i;
+	
+	for(i=0; i < SOCK_ARRAY_SIZE; i++)
+	{
+		sk = tcp_prot.sock_array[i];
+		while (sk)
+		{
+			if (sk->keepopen)
+			{
+				count += tcp_keepopen_proc(sk);
+			}
+
+			if (count == MAX_KA_PROBES)
+				return;
+			
+			sk = sk->next;	    
+		}
+	}
+}
 
 /*
  *	The TCP retransmit timer. This lacks a few small details.
@@ -235,67 +378,174 @@
  *	2.	On a 'major timeout' as defined by RFC1122 we shouldn't report
  *		ETIMEDOUT if we know an additional 'soft' error caused this.
  *		tcp_err should save a 'soft error' for us.
+ *	[Unless someone has broken it then it does, except for one 2.0 
+ *	broken case of a send when the route/device is directly unreachable,
+ *	and we error but should retry! - FIXME] [AC]
  */
 
 void tcp_retransmit_timer(unsigned long data)
 {
 	struct sock *sk = (struct sock*)data;
-	int why = sk->ip_xmit_timeout;
+	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
 
 	/*
 	 *	We are reset. We will send no more retransmits.
 	 */
-	 
+
 	if(sk->zapped)
+	{
+		tcp_clear_xmit_timer(sk, TIME_RETRANS);
 		return;
-		
-	/* 
-	 *	Only process if socket is not in use
+	}
+
+	/*
+	 * Clear the delayed ack timer
 	 */
 
-	if (sk->users) 
+	tcp_clear_xmit_timer(sk, TIME_DACK);
+
+	/*
+	 *	Retransmission
+	 */
+
+	tp->retrans_head = NULL;
+	
+
+	if (sk->retransmits == 0)
 	{
-		/* Try again in 1 second */
-		sk->retransmit_timer.expires = jiffies+HZ;
-		add_timer(&sk->retransmit_timer);
-		return;
+		/* 
+		 * remember window where we lost 
+		 * "one half of the current window but at least 2 segments"
+		 */
+		
+		sk->ssthresh = max(sk->cong_window >> 1, 2); 
+		sk->cong_count = 0;
+		sk->cong_window = 1;
 	}
 
-	if (sk->ack_backlog && !sk->dead) 
-		sk->data_ready(sk,0);
+	atomic_inc(&sk->retransmits);
+
+	tcp_do_retransmit(sk, 0);
 
-	/* Now we need to figure out why the socket was on the timer. */
+	/*
+	 * Increase the timeout each time we retransmit.  Note that
+	 * we do not increase the rtt estimate.  rto is initialized
+	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
+	 * that doubling rto each time is the least we can get away with.
+	 * In KA9Q, Karn uses this for the first few times, and then
+	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
+	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
+	 * defined in the protocol as the maximum possible RTT.  I guess
+	 * we'll have to use something other than TCP to talk to the
+	 * University of Mars.
+	 *
+	 * PAWS allows us longer timeouts and large windows, so once
+	 * implemented ftp to mars will work nicely. We will have to fix
+	 * the 120 second clamps though!
+	 */
+
+	tp->backoff++;
+	tp->rto = min(tp->rto << 1, 120*HZ);
+	tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
+
+	tcp_write_timeout(sk);
+}
 
-	switch (why) 
+/*
+ *	Slow timer for SYN-RECV sockets
+ */
+
+static void tcp_syn_recv_timer(unsigned long data)
+{
+	struct sock *sk;
+	unsigned long now = jiffies;
+	int i;
+
+	for(i=0; i < SOCK_ARRAY_SIZE; i++)
 	{
-	/* Window probing */
-	case TIME_PROBE0:
-		tcp_send_probe0(sk);
-		tcp_write_timeout(sk);
-		break;
+		sk = tcp_prot.sock_array[i];
+		while (sk)
+		{
+			struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+			
+			if (sk->state == TCP_LISTEN && !sk->users &&
+			    tp->syn_wait_queue)
+			{
+				struct open_request *req;
+				
+				req = tp->syn_wait_queue;
+
+				while (tp->syn_wait_queue &&
+				       (((long)(req->expires - now)) <= 0))
+				{
+					struct open_request *conn;
+
+					conn = req;
+					req = req->dl_next;
+
+					if (conn->sk && conn->sk->state > TCP_SYN_RECV)
+						continue;
+					
+					tcp_synq_unlink(tp, conn);
+					
+					if (conn->retrans >= TCP_RETR1)
+					{
+						printk(KERN_DEBUG "syn_recv: "
+						       "too many retransmits\n");
+						(*conn->class->destructor)(conn);
+						tcp_dec_slow_timer(TCP_SLT_SYNACK);
+						kfree(conn);
+					}
+					else
+					{
+						__u32 timeo;
+						
+						(*conn->class->rtx_syn_ack)(sk, conn);
+
+						conn->retrans++;
+						printk(KERN_DEBUG "syn_ack rtx %d\n", conn->retrans);
+						timeo = min((TCP_TIMEOUT_INIT 
+							     << conn->retrans),
+							    120*HZ);
+						conn->expires = now + timeo;
+						tcp_synq_queue(tp, conn);
+					}
+				}
+			}
+			
+			sk = sk->next;
+		}
+	}
+}
 
-	/* Retransmitting */
-	case TIME_WRITE:
-		tcp_time_write_timeout(sk);
-		break;
+void tcp_sltimer_handler(unsigned long data)
+{
+	struct tcp_sl_timer *slt = tcp_slt_array;
+	unsigned long next = ~0UL;
+	unsigned long now = jiffies;
+	int i;
 
-	/* Sending Keepalives */
-	case TIME_KEEPOPEN:
-		/* 
-		 * this reset_timer() call is a hack, this is not
-		 * how KEEPOPEN is supposed to work.
-		 */
-		tcp_reset_xmit_timer (sk, TIME_KEEPOPEN, TCP_TIMEOUT_LEN);
-		/* Send something to keep the connection open. */
-		if (sk->prot->write_wakeup)
-			  sk->prot->write_wakeup (sk);
-		sk->retransmits++;
-		sk->prot->retransmits++;
-		tcp_write_timeout(sk);
-		break;
+	for (i=0; i < TCP_SLT_MAX; i++, slt++)
+	{
+		if (slt->count)
+		{
+			long trigger;
 
-	default:
-		printk (KERN_ERR "rexmit_timer: timer expired - reason unknown\n");
-		break;
+			trigger = slt->period - ((long)(now - slt->last));
+
+			if (trigger <= 0)
+			{
+				(*slt->handler)((unsigned long) slt);
+				slt->last = now;
+				trigger = slt->period;
+			}
+			next = min(next, trigger);
+		}
+	}
+
+	if (next != ~0UL)
+	{
+		tcp_slow_timer.expires = now + next;
+		add_timer(&tcp_slow_timer);
 	}
 }

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov