patch-2.3.41 linux/net/ipv4/tcp_timer.c
- Lines: 930
- Date: Sat Jan 22 11:54:57 2000
- Orig file: v2.3.40/linux/net/ipv4/tcp_timer.c
- Orig date: Fri Sep 10 23:57:38 1999
diff -u --recursive --new-file v2.3.40/linux/net/ipv4/tcp_timer.c linux/net/ipv4/tcp_timer.c
@@ -5,7 +5,7 @@
*
* Implementation of the Transmission Control Protocol(TCP).
*
- * Version: $Id: tcp_timer.c,v 1.68 1999/09/07 02:31:43 davem Exp $
+ * Version: $Id: tcp_timer.c,v 1.71 2000/01/18 08:24:19 davem Exp $
*
* Authors: Ross Biro, <bir7@leland.Stanford.Edu>
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -23,29 +23,20 @@
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
+int sysctl_tcp_orphan_retries = TCP_ORPHAN_RETRIES;
-
-static void tcp_sltimer_handler(unsigned long);
-static void tcp_syn_recv_timer(unsigned long);
+static void tcp_retransmit_timer(unsigned long);
+static void tcp_delack_timer(unsigned long);
+static void tcp_probe_timer(unsigned long);
+static void tcp_keepalive_timer (unsigned long data);
static void tcp_twkill(unsigned long);
-struct timer_list tcp_slow_timer = {
- NULL, NULL,
- 0, 0,
- tcp_sltimer_handler,
-};
-
-
-struct tcp_sl_timer tcp_slt_array[TCP_SLT_MAX] = {
- {ATOMIC_INIT(0), TCP_SYNACK_PERIOD, 0, tcp_syn_recv_timer},/* SYNACK */
- {ATOMIC_INIT(0), TCP_TWKILL_PERIOD, 0, tcp_twkill} /* TWKILL */
-};
-
const char timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
/*
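The hunk above retires the global tcp_slow_timer / tcp_slt_array machinery in favour of per-socket timers and adds two sysctls: tcp_synack_retries (consumed by the new listen-socket SYN-ACK timer further down) and tcp_orphan_retries (the retry ceiling for sockets already closed by the user). A minimal userspace sketch of how the retry ceiling ends up being chosen, mirroring the retry_until selection that tcp_write_timeout() performs later in this patch; the helper name and the default values are illustrative, not the kernel's:

#include <stdio.h>

/* Illustrative defaults; the kernel takes these from the TCP_*
 * macros via the sysctl_tcp_* variables added in this hunk. */
static int sysctl_tcp_syn_retries    = 5;   /* SYN_SENT / SYN_RECV          */
static int sysctl_tcp_retries2       = 15;  /* established connections      */
static int sysctl_tcp_orphan_retries = 7;   /* orphaned (sk->dead) sockets  */

enum state { SYN_SENT, SYN_RECV, ESTABLISHED };

/* Hypothetical helper mirroring retry_until in tcp_write_timeout(). */
static int retry_limit(enum state st, int orphaned, int user_syn_retries)
{
    if (st == SYN_SENT || st == SYN_RECV)
        /* per-socket override (tp->syn_retries) wins over the sysctl */
        return user_syn_retries ? user_syn_retries : sysctl_tcp_syn_retries;
    return orphaned ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2;
}

int main(void)
{
    printf("SYN_SENT              -> %d\n", retry_limit(SYN_SENT, 0, 0));
    printf("SYN_SENT, syn_retries -> %d\n", retry_limit(SYN_SENT, 0, 2));
    printf("ESTABLISHED           -> %d\n", retry_limit(ESTABLISHED, 0, 0));
    printf("ESTABLISHED, orphaned -> %d\n", retry_limit(ESTABLISHED, 1, 0));
    return 0;
}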
@@ -56,17 +47,25 @@
void tcp_init_xmit_timers(struct sock *sk)
{
- init_timer(&sk->tp_pinfo.af_tcp.retransmit_timer);
- sk->tp_pinfo.af_tcp.retransmit_timer.function=&tcp_retransmit_timer;
- sk->tp_pinfo.af_tcp.retransmit_timer.data = (unsigned long) sk;
-
- init_timer(&sk->tp_pinfo.af_tcp.delack_timer);
- sk->tp_pinfo.af_tcp.delack_timer.function=&tcp_delack_timer;
- sk->tp_pinfo.af_tcp.delack_timer.data = (unsigned long) sk;
-
- init_timer(&sk->tp_pinfo.af_tcp.probe_timer);
- sk->tp_pinfo.af_tcp.probe_timer.function=&tcp_probe_timer;
- sk->tp_pinfo.af_tcp.probe_timer.data = (unsigned long) sk;
+ struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
+
+ spin_lock_init(&sk->timer_lock);
+
+ init_timer(&tp->retransmit_timer);
+ tp->retransmit_timer.function=&tcp_retransmit_timer;
+ tp->retransmit_timer.data = (unsigned long) sk;
+
+ init_timer(&tp->delack_timer);
+ tp->delack_timer.function=&tcp_delack_timer;
+ tp->delack_timer.data = (unsigned long) sk;
+
+ init_timer(&tp->probe_timer);
+ tp->probe_timer.function=&tcp_probe_timer;
+ tp->probe_timer.data = (unsigned long) sk;
+
+ init_timer(&sk->timer);
+ sk->timer.function=&tcp_keepalive_timer;
+ sk->timer.data = (unsigned long) sk;
}
/*
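tcp_init_xmit_timers() now attaches four timers straight to the socket: the retransmit, delayed-ACK and zero-window-probe timers inside tcp_opt, plus sk->timer, which the patch reuses for keepalive and, on listening sockets, for the SYN-ACK queue. The 2.3 timer_list hands its handler an opaque unsigned long that is cast back to the owning socket; the following userspace sketch models just that callback contract (struct and function names are invented for illustration):

#include <stdio.h>

/* Userspace stand-in for the 2.3 struct timer_list callback contract:
 * the handler receives an opaque unsigned long and casts it back to
 * the object that owns the timer. */
struct fake_timer {
    void (*function)(unsigned long);
    unsigned long data;
};

struct fake_sock {
    const char *name;
    struct fake_timer retransmit_timer;
};

static void retransmit_handler(unsigned long data)
{
    struct fake_sock *sk = (struct fake_sock *)data;
    printf("retransmit timer fired for socket %s\n", sk->name);
}

int main(void)
{
    struct fake_sock sk = { .name = "example" };

    /* Mirrors: init_timer(); tp->retransmit_timer.function = ...;
     *          tp->retransmit_timer.data = (unsigned long)sk;      */
    sk.retransmit_timer.function = retransmit_handler;
    sk.retransmit_timer.data = (unsigned long)&sk;

    /* Pretend the timer expired. */
    sk.retransmit_timer.function(sk.retransmit_timer.data);
    return 0;
}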
@@ -79,7 +78,7 @@
spin_lock_bh(&sk->timer_lock);
switch (what) {
- case TIME_RETRANS:
+ case TCP_TIME_RETRANS:
/* When seting the transmit timer the probe timer
* should not be set.
* The delayed ack timer can be set if we are changing the
@@ -89,29 +88,25 @@
__sock_put(sk);
if (!tp->retransmit_timer.prev || !del_timer(&tp->retransmit_timer))
sock_hold(sk);
- if (when > 120*HZ) {
+ if (when > TCP_RTO_MAX) {
printk(KERN_DEBUG "reset_xmit_timer sk=%p when=0x%lx, caller=%p\n", sk, when, NET_CALLER(sk));
- when = 120*HZ;
+ when = TCP_RTO_MAX;
}
mod_timer(&tp->retransmit_timer, jiffies+when);
break;
- case TIME_DACK:
+ case TCP_TIME_DACK:
if (!tp->delack_timer.prev || !del_timer(&tp->delack_timer))
sock_hold(sk);
mod_timer(&tp->delack_timer, jiffies+when);
break;
- case TIME_PROBE0:
+ case TCP_TIME_PROBE0:
if (!tp->probe_timer.prev || !del_timer(&tp->probe_timer))
sock_hold(sk);
mod_timer(&tp->probe_timer, jiffies+when);
break;
- case TIME_WRITE:
- printk(KERN_DEBUG "bug: tcp_reset_xmit_timer TIME_WRITE\n");
- break;
-
default:
printk(KERN_DEBUG "bug: unknown timer value\n");
};
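Two details in the hunk above are worth spelling out: the timer now takes a socket reference only when it was not already pending (sock_hold() paired with a later __sock_put()), and the requested delay is clamped to TCP_RTO_MAX rather than a literal 120*HZ. The clamp is what keeps the exponential retransmit backoff bounded; a runnable sketch of that growth, with HZ and the initial RTO as illustrative values:

#include <stdio.h>

#define HZ          100          /* illustrative tick rate            */
#define TCP_RTO_MAX (120 * HZ)   /* upper clamp applied in this patch */

int main(void)
{
    unsigned long rto = 3 * HZ;  /* a typical initial RTO */
    int backoff;

    for (backoff = 0; backoff < 10; backoff++) {
        printf("backoff %2d: rto = %3lu s\n", backoff, rto / HZ);
        /* mirrors: tp->rto = min(tp->rto << 1, TCP_RTO_MAX); */
        rto = rto << 1;
        if (rto > TCP_RTO_MAX)
            rto = TCP_RTO_MAX;
    }
    return 0;
}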
@@ -127,6 +122,7 @@
__sock_put(sk);
if(tp->delack_timer.prev && del_timer(&tp->delack_timer))
__sock_put(sk);
+ tp->ack.blocked = 0;
if(tp->probe_timer.prev && del_timer(&tp->probe_timer))
__sock_put(sk);
if(sk->timer.prev && del_timer(&sk->timer))
@@ -134,39 +130,33 @@
spin_unlock_bh(&sk->timer_lock);
}
-static void tcp_write_err(struct sock *sk, int force)
+static void tcp_write_err(struct sock *sk)
{
- sk->err = sk->err_soft ? sk->err_soft : ETIMEDOUT;
+ sk->err = sk->err_soft ? : ETIMEDOUT;
sk->error_report(sk);
- tcp_clear_xmit_timers(sk);
-
- /* Do not time wait the socket. It is timed out and, hence,
- * idle for 120*HZ. "force" argument is ignored, delete
- * it eventually.
- */
-
- /* Clean up time. */
- tcp_set_state(sk, TCP_CLOSE);
tcp_done(sk);
}
/* A write timeout has occurred. Process the after effects. */
-static void tcp_write_timeout(struct sock *sk)
+static int tcp_write_timeout(struct sock *sk)
{
struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int retry_until;
- /* Look for a 'soft' timeout. */
- if ((sk->state == TCP_ESTABLISHED &&
- tp->retransmits && (tp->retransmits % TCP_QUICK_TRIES) == 0) ||
- (sk->state != TCP_ESTABLISHED && tp->retransmits > sysctl_tcp_retries1)) {
- /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
- hole detection. :-(
-
- It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
- case. Let me to cite the same draft, which requires for
- us to implement this:
+ if ((1<<sk->state)&(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
+ if (tp->retransmits)
+ dst_negative_advice(&sk->dst_cache);
+ retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+ } else {
+ if (tp->retransmits >= sysctl_tcp_retries1) {
+ /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
+ hole detection. :-(
+
+ It is place to make it. It is not made. I do not want
+ to make it. It is disguisting. It does not work in any
+ case. Let me to cite the same draft, which requires for
+ us to implement this:
"The one security concern raised by this memo is that ICMP black holes
are often caused by over-zealous security administrators who block
@@ -177,57 +167,70 @@
be far nicer to have all of the black holes fixed rather than fixing
all of the TCP implementations."
- Golden words :-).
- */
+ Golden words :-).
+ */
- dst_negative_advice(&sk->dst_cache);
+ dst_negative_advice(&sk->dst_cache);
+ }
+ retry_until = sysctl_tcp_retries2;
+ if (sk->dead)
+ retry_until = sysctl_tcp_orphan_retries;
}
-
- /* Have we tried to SYN too many times (repent repent 8)) */
- if (sk->state == TCP_SYN_SENT &&
- ((!tp->syn_retries && tp->retransmits > sysctl_tcp_syn_retries) ||
- (tp->syn_retries && tp->retransmits > tp->syn_retries))) {
- tcp_write_err(sk, 1);
- /* Don't FIN, we got nothing back */
- } else if (tp->retransmits > sysctl_tcp_retries2) {
+
+ if (tp->retransmits >= retry_until) {
/* Has it gone just too far? */
- tcp_write_err(sk, 0);
+ tcp_write_err(sk);
+ return 1;
}
+ return 0;
}
-void tcp_delack_timer(unsigned long data)
+static void tcp_delack_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later. */
- tcp_reset_xmit_timer(sk, TIME_DACK, HZ/5);
+ tp->ack.blocked = 1;
+ NET_INC_STATS_BH(DelayedACKLocked);
+ tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MIN);
goto out_unlock;
}
- if(!sk->zapped &&
- sk->tp_pinfo.af_tcp.delayed_acks &&
- sk->state != TCP_CLOSE)
+ if (tp->ack.pending) {
+ /* Delayed ACK missed: inflate ATO, leave pingpong mode */
+ tp->ack.ato = min(tp->ack.ato<<1, TCP_ATO_MAX);
+ tp->ack.pingpong = 0;
tcp_send_ack(sk);
+ NET_INC_STATS_BH(DelayedACKs);
+ }
+ TCP_CHECK_TIMER(sk);
out_unlock:
bh_unlock_sock(sk);
sock_put(sk);
}
-void tcp_probe_timer(unsigned long data)
+static void tcp_probe_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
- if(sk->zapped)
- goto out;
+ int max_probes;
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later. */
- tcp_reset_xmit_timer(sk, TIME_PROBE0, HZ/5);
+ tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, HZ/5);
+ goto out_unlock;
+ }
+
+ if (sk->state == TCP_CLOSE)
+ goto out_unlock;
+
+ if (tp->packets_out || !tp->send_head) {
+ tp->probes_out = 0;
goto out_unlock;
}
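The rewritten delayed-ACK handler has two distinct exits: if the socket is locked by a user it only sets tp->ack.blocked and re-arms after TCP_DELACK_MIN, and if an ACK was genuinely pending it sends it, leaves pingpong (interactive) mode and inflates the ACK timeout with ato = min(ato << 1, TCP_ATO_MAX). A runnable model of that inflation; the two constants are illustrative stand-ins for the real HZ-derived macros:

#include <stdio.h>

#define HZ             100
#define TCP_DELACK_MIN (HZ / 25)   /* illustrative: ~40 ms  */
#define TCP_ATO_MAX    (HZ / 2)    /* illustrative: ~500 ms */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

int main(void)
{
    unsigned long ato = TCP_DELACK_MIN;
    int missed;

    /* Each missed delayed ACK inflates the timeout:
     * mirrors tp->ack.ato = min(tp->ack.ato << 1, TCP_ATO_MAX); */
    for (missed = 0; missed < 6; missed++) {
        printf("missed %d: ato = %3lu ms\n", missed, ato * 1000 / HZ);
        ato = min_ul(ato << 1, TCP_ATO_MAX);
    }
    return 0;
}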
@@ -246,151 +249,251 @@
* with RFCs, only probe timer combines both retransmission timeout
* and probe timeout in one bottle. --ANK
*/
- if (tp->probes_out > sysctl_tcp_retries2) {
- tcp_write_err(sk, 0);
+ max_probes = sk->dead ? sysctl_tcp_orphan_retries : sysctl_tcp_retries2;
+
+ if (tp->probes_out > max_probes) {
+ tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
tcp_send_probe0(sk);
+ TCP_CHECK_TIMER(sk);
}
out_unlock:
bh_unlock_sock(sk);
-out:
sock_put(sk);
}
/* Kill off TIME_WAIT sockets once their lifetime has expired. */
-int tcp_tw_death_row_slot = 0;
-static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS] =
- { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL };
-static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
+static int tcp_tw_death_row_slot = 0;
+int tcp_tw_count = 0;
+static struct tcp_tw_bucket *tcp_tw_death_row[TCP_TWKILL_SLOTS];
+static spinlock_t tw_death_lock = SPIN_LOCK_UNLOCKED;
+static struct timer_list tcp_tw_timer = { function: tcp_twkill };
static void tcp_twkill(unsigned long data)
{
struct tcp_tw_bucket *tw;
int killed = 0;
- /* The death-row tw chains are only ever touched
- * in BH context so no BH disabling (for now) is needed.
+ /* NOTE: compare this to previous version where lock
+ * was released after detaching chain. It was racy,
+ * because tw buckets are scheduled in not serialized context
+ * in 2.3 (with netfilter), and with softnet it is common, because
+ * soft irqs are not sequenced.
*/
spin_lock(&tw_death_lock);
- tw = tcp_tw_death_row[tcp_tw_death_row_slot];
- tcp_tw_death_row[tcp_tw_death_row_slot] = NULL;
- tcp_tw_death_row_slot =
- ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
- spin_unlock(&tw_death_lock);
- while(tw != NULL) {
- struct tcp_tw_bucket *next = tw->next_death;
+ if (tcp_tw_count == 0)
+ goto out;
+
+ while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) {
+ tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death;
+ tw->pprev_death = NULL;
+ spin_unlock(&tw_death_lock);
tcp_timewait_kill(tw);
tcp_tw_put(tw);
+
killed++;
- tw = next;
- }
- if(killed != 0) {
- struct tcp_sl_timer *slt = (struct tcp_sl_timer *)data;
- atomic_sub(killed, &slt->count);
+
+ spin_lock(&tw_death_lock);
}
+ tcp_tw_death_row_slot =
+ ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+
+ if ((tcp_tw_count -= killed) != 0)
+ mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
+ net_statistics[smp_processor_id()*2].TimeWaited += killed;
+out:
+ spin_unlock(&tw_death_lock);
}
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
*/
-void tcp_tw_schedule(struct tcp_tw_bucket *tw)
-{
- struct tcp_tw_bucket **tpp;
- int slot;
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+{
spin_lock(&tw_death_lock);
- slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
- tpp = &tcp_tw_death_row[slot];
- if((tw->next_death = *tpp) != NULL)
- (*tpp)->pprev_death = &tw->next_death;
- *tpp = tw;
- tw->pprev_death = tpp;
-
- tw->death_slot = slot;
- atomic_inc(&tw->refcnt);
+ if (tw->pprev_death) {
+ if(tw->next_death)
+ tw->next_death->pprev_death = tw->pprev_death;
+ *tw->pprev_death = tw->next_death;
+ tw->pprev_death = NULL;
+ tcp_tw_put(tw);
+ if (--tcp_tw_count == 0)
+ del_timer(&tcp_tw_timer);
+ }
spin_unlock(&tw_death_lock);
-
- tcp_inc_slow_timer(TCP_SLT_TWKILL);
}
-/* Happens rarely if at all, no care about scalability here. */
-void tcp_tw_reschedule(struct tcp_tw_bucket *tw)
+/* Short-time timewait calendar */
+
+static int tcp_twcal_hand = -1;
+static int tcp_twcal_jiffie;
+static void tcp_twcal_tick(unsigned long);
+static struct timer_list tcp_twcal_timer = {NULL, NULL, 0, 0, tcp_twcal_tick,};
+static struct tcp_tw_bucket *tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
+
+void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
{
struct tcp_tw_bucket **tpp;
int slot;
+ /* timeout := RTO * 3.5
+ *
+ * 3.5 = 1+2+0.5 to wait for two retransmits.
+ *
+ * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+ * our ACK acking that FIN can be lost. If N subsequent retransmitted
+ * FINs (or previous seqments) are lost (probability of such event
+ * is p^(N+1), where p is probability to lose single packet and
+ * time to detect the loss is about RTO*(2^N - 1) with exponential
+ * backoff). Normal timewait length is calculated so, that we
+ * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+ * [ BTW Linux. following BSD, violates this requirement waiting
+ * only for 60sec, we should wait at least for 240 secs.
+ * Well, 240 consumes too much of resources 8)
+ * ]
+ * This interval is not reduced to catch old duplicate and
+ * responces to our wandering segments living for two MSLs.
+ * However, if we use PAWS to detect
+ * old duplicates, we can reduce the interval to bounds required
+ * by RTO, rather than MSL. So, if peer understands PAWS, we
+ * kill tw bucket after 3.5*RTO (it is important that this number
+ * is greater than TS tick!) and detect old duplicates with help
+ * of PAWS.
+ */
+ slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
+
spin_lock(&tw_death_lock);
+
+ /* Unlink it, if it was scheduled */
if (tw->pprev_death) {
if(tw->next_death)
tw->next_death->pprev_death = tw->pprev_death;
*tw->pprev_death = tw->next_death;
tw->pprev_death = NULL;
+ tcp_tw_count--;
} else
atomic_inc(&tw->refcnt);
- slot = (tcp_tw_death_row_slot - 1) & (TCP_TWKILL_SLOTS - 1);
- tpp = &tcp_tw_death_row[slot];
+ if (slot >= TCP_TW_RECYCLE_SLOTS) {
+ /* Schedule to slow timer */
+ if (timeo >= TCP_TIMEWAIT_LEN) {
+ slot = TCP_TWKILL_SLOTS-1;
+ } else {
+ slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
+ if (slot >= TCP_TWKILL_SLOTS)
+ slot = TCP_TWKILL_SLOTS-1;
+ }
+ tw->ttd = jiffies + timeo;
+ slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
+ tpp = &tcp_tw_death_row[slot];
+ } else {
+ tw->ttd = jiffies + (slot<<TCP_TW_RECYCLE_TICK);
+
+ if (tcp_twcal_hand < 0) {
+ tcp_twcal_hand = 0;
+ tcp_twcal_jiffie = jiffies;
+ tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
+ add_timer(&tcp_twcal_timer);
+ } else {
+ if ((long)(tcp_twcal_timer.expires - jiffies) > (slot<<TCP_TW_RECYCLE_TICK))
+ mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
+ slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
+ }
+ tpp = &tcp_twcal_row[slot];
+ }
+
if((tw->next_death = *tpp) != NULL)
(*tpp)->pprev_death = &tw->next_death;
*tpp = tw;
tw->pprev_death = tpp;
- tw->death_slot = slot;
+ if (tcp_tw_count++ == 0)
+ mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
spin_unlock(&tw_death_lock);
-
- /* Timer was incremented when we first entered the table. */
}
-/* This is for handling early-kills of TIME_WAIT sockets. */
-void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+void tcp_twcal_tick(unsigned long dummy)
{
+ int n, slot;
+ unsigned long j;
+ unsigned long now = jiffies;
+ int killed = 0;
+ int adv = 0;
+
spin_lock(&tw_death_lock);
- if (tw->pprev_death) {
- if(tw->next_death)
- tw->next_death->pprev_death = tw->pprev_death;
- *tw->pprev_death = tw->next_death;
- tw->pprev_death = NULL;
- tcp_tw_put(tw);
+ if (tcp_twcal_hand < 0)
+ goto out;
+
+ slot = tcp_twcal_hand;
+ j = tcp_twcal_jiffie;
+
+ for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
+ if ((long)(j - now) <= 0) {
+ struct tcp_tw_bucket *tw;
+
+ while((tw = tcp_twcal_row[slot]) != NULL) {
+ tcp_twcal_row[slot] = tw->next_death;
+ tw->pprev_death = NULL;
+
+ tcp_timewait_kill(tw);
+ tcp_tw_put(tw);
+ killed++;
+ }
+ } else {
+ if (!adv) {
+ adv = 1;
+ tcp_twcal_jiffie = j;
+ tcp_twcal_hand = slot;
+ }
+
+ if (tcp_twcal_row[slot] != NULL) {
+ mod_timer(&tcp_twcal_timer, j);
+ goto out;
+ }
+ }
+ j += (1<<TCP_TW_RECYCLE_TICK);
+ slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
}
- spin_unlock(&tw_death_lock);
+ tcp_twcal_hand = -1;
- tcp_dec_slow_timer(TCP_SLT_TWKILL);
+out:
+ if ((tcp_tw_count -= killed) == 0)
+ del_timer(&tcp_tw_timer);
+ net_statistics[smp_processor_id()*2].TimeWaitKilled += killed;
+ spin_unlock(&tw_death_lock);
}
/*
* The TCP retransmit timer.
- *
- * 1. An initial rtt timeout on the probe0 should cause what we can
- * of the first write queue buffer to be split and sent.
- * 2. On a 'major timeout' as defined by RFC1122 we do not report
- * ETIMEDOUT if we know an additional 'soft' error caused this.
- * tcp_err saves a 'soft error' for us.
*/
-void tcp_retransmit_timer(unsigned long data)
+static void tcp_retransmit_timer(unsigned long data)
{
struct sock *sk = (struct sock*)data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
- /* We are reset. We will send no more retransmits. */
- if(sk->zapped)
- goto out;
-
bh_lock_sock(sk);
if (sk->lock.users) {
/* Try again later */
- tcp_reset_xmit_timer(sk, TIME_RETRANS, HZ/20);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, HZ/20);
goto out_unlock;
}
- /* Clear delay ack timer. */
- tcp_clear_xmit_timer(sk, TIME_DACK);
+ if (sk->state == TCP_CLOSE || tp->packets_out == 0)
+ goto out_unlock;
+
+ BUG_TRAP(!skb_queue_empty(&sk->write_queue));
+
+ if (tcp_write_timeout(sk))
+ goto out_unlock;
/* RFC 2018, clear all 'sacked' flags in retransmission queue,
* the sender may have dropped out of order frames and we must
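The bulk of the hunk above reworks TIME-WAIT handling: buckets either go on the coarse wheel of TCP_TWKILL_SLOTS slots swept every TCP_TWKILL_PERIOD by tcp_twkill(), or, when PAWS allows the lifetime to shrink to roughly 3.5*RTO, on the new fine-grained recycle calendar driven by tcp_twcal_tick(). tcp_tw_schedule() rounds the requested timeout up to recycle ticks and falls back to the slow wheel when it does not fit. A runnable sketch of that slot selection, where every constant is an illustrative stand-in for the TCP_TW_* macros:

#include <stdio.h>

#define HZ                   100
#define TCP_TIMEWAIT_LEN     (60 * HZ)
#define TCP_TWKILL_SLOTS     8
#define TCP_TWKILL_PERIOD    (TCP_TIMEWAIT_LEN / TCP_TWKILL_SLOTS)
#define TCP_TW_RECYCLE_SLOTS 32
#define TCP_TW_RECYCLE_TICK  5            /* illustrative: 32-jiffy ticks */

static void schedule_tw(unsigned long timeo)
{
    /* Round the timeout up to recycle-calendar ticks,
     * mirroring tcp_tw_schedule(). */
    int slot = (timeo + (1 << TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;

    if (slot >= TCP_TW_RECYCLE_SLOTS) {
        /* Too far out for the calendar: use the slow wheel instead. */
        if (timeo >= TCP_TIMEWAIT_LEN)
            slot = TCP_TWKILL_SLOTS - 1;
        else
            slot = (timeo + TCP_TWKILL_PERIOD - 1) / TCP_TWKILL_PERIOD;
        if (slot >= TCP_TWKILL_SLOTS)
            slot = TCP_TWKILL_SLOTS - 1;
        printf("timeo %5lu ticks -> slow wheel, %d slots ahead\n",
               timeo, slot);
    } else {
        printf("timeo %5lu ticks -> recycle calendar, offset %d\n",
               timeo, slot);
    }
}

int main(void)
{
    schedule_tw(3 * HZ / 2);        /* ~3.5*RTO with a small RTO */
    schedule_tw(10 * HZ);
    schedule_tw(TCP_TIMEWAIT_LEN);  /* full 60 s TIME-WAIT        */
    return 0;
}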
@@ -426,11 +529,19 @@
tp->snd_cwnd = 1;
}
- tp->retransmits++;
-
tp->dup_acks = 0;
tp->high_seq = tp->snd_nxt;
- tcp_retransmit_skb(sk, skb_peek(&sk->write_queue));
+ if (tcp_retransmit_skb(sk, skb_peek(&sk->write_queue)) > 0) {
+ /* Retransmission failed because of local congestion,
+ * do not backoff.
+ */
+ if (!tp->retransmits)
+ tp->retransmits=1;
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
+ min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+ TCP_CHECK_TIMER(sk);
+ goto out_unlock;
+ }
/* Increase the timeout each time we retransmit. Note that
* we do not increase the rtt estimate. rto is initialized
@@ -448,132 +559,105 @@
* the 120 second clamps though!
*/
tp->backoff++;
- tp->rto = min(tp->rto << 1, 120*HZ);
- tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
-
- tcp_write_timeout(sk);
+ tp->retransmits++;
+ tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+ TCP_CHECK_TIMER(sk);
out_unlock:
bh_unlock_sock(sk);
-out:
sock_put(sk);
}
/*
- * Slow timer for SYN-RECV sockets
+ * Timer for listening sockets
*/
-static void tcp_do_syn_queue(struct sock *sk, struct tcp_opt *tp, unsigned long now)
+static void tcp_synack_timer(struct sock *sk)
{
- struct open_request *prev, *req;
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ struct tcp_listen_opt *lopt = tp->listen_opt;
+ int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
+ int thresh = max_retries;
+ unsigned long now = jiffies;
+ struct open_request **reqp, *req;
+ int i, budget;
- prev = (struct open_request *) &tp->syn_wait_queue;
- for(req = tp->syn_wait_queue; req; ) {
- struct open_request *next = req->dl_next;
-
- if (!req->sk && (long)(now - req->expires) >= 0) {
- tcp_synq_unlink(tp, req, prev);
- if(req->retrans >= sysctl_tcp_retries1) {
- (*req->class->destructor)(req);
- tcp_dec_slow_timer(TCP_SLT_SYNACK);
- tp->syn_backlog--;
- tcp_openreq_free(req);
- if (! tp->syn_wait_queue)
- break;
- } else {
- unsigned long timeo;
- struct open_request *rp;
-
- (*req->class->rtx_syn_ack)(sk, req);
- req->retrans++;
- timeo = min((TCP_TIMEOUT_INIT << req->retrans),
- (120 * HZ));
- req->expires = now + timeo;
- rp = prev->dl_next;
- tcp_synq_queue(tp, req);
- if(rp != prev->dl_next)
- prev = prev->dl_next;
- }
- } else
- prev = req;
- req = next;
- }
-}
+ if (lopt == NULL || lopt->qlen == 0)
+ return;
-/* This now scales very nicely. -DaveM */
-static void tcp_syn_recv_timer(unsigned long data)
-{
- struct sock *sk;
- unsigned long now = jiffies;
- int i;
+ /* Normally all the openreqs are young and become mature
+ * (i.e. converted to established socket) for first timeout.
+ * If synack was not acknowledged for 3 seconds, it means
+ * one of the following things: synack was lost, ack was lost,
+ * rtt is high or nobody planned to ack (i.e. synflood).
+ * When server is a bit loaded, queue is populated with old
+ * open requests, reducing effective size of queue.
+ * When server is well loaded, queue size reduces to zero
+ * after several minutes of work. It is not synflood,
+ * it is normal operation. The solution is pruning
+ * too old entries overriding normal timeout, when
+ * situation becomes dangerous.
+ *
+ * Essentially, we reserve half of room for young
+ * embrions; and abort old ones without pity, if old
+ * ones are about to clog our table.
+ */
+ if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+ int young = (lopt->qlen_young<<1);
- read_lock(&tcp_lhash_lock);
- for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
- sk = tcp_listening_hash[i];
- while(sk) {
- struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
-
- /* TCP_LISTEN is implied. */
- bh_lock_sock(sk);
- if (!sk->lock.users && tp->syn_wait_queue)
- tcp_do_syn_queue(sk, tp, now);
- bh_unlock_sock(sk);
- sk = sk->next;
+ while (thresh > 2) {
+ if (lopt->qlen < young)
+ break;
+ thresh--;
+ young <<= 1;
}
}
- read_unlock(&tcp_lhash_lock);
-}
-void tcp_sltimer_handler(unsigned long data)
-{
- struct tcp_sl_timer *slt = tcp_slt_array;
- unsigned long next = ~0UL;
- unsigned long now = jiffies;
- int i;
+ if (tp->defer_accept)
+ max_retries = tp->defer_accept;
- for (i=0; i < TCP_SLT_MAX; i++, slt++) {
- if (atomic_read(&slt->count)) {
- long trigger;
-
- trigger = slt->period - ((long)(now - slt->last));
-
- if (trigger <= 0) {
- (*slt->handler)((unsigned long) slt);
- slt->last = now;
- trigger = slt->period;
- }
+ budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
+ i = lopt->clock_hand;
- /* Only reschedule if some events remain. */
- if (atomic_read(&slt->count))
- next = min(next, trigger);
+ do {
+ reqp=&lopt->syn_table[i];
+ while ((req = *reqp) != NULL) {
+ if ((long)(now - req->expires) >= 0) {
+ if ((req->retrans < thresh ||
+ (req->acked && req->retrans < max_retries))
+ && !req->class->rtx_syn_ack(sk, req, NULL)) {
+ unsigned long timeo;
+
+ if (req->retrans++ == 0)
+ lopt->qlen_young--;
+ timeo = min((TCP_TIMEOUT_INIT << req->retrans),
+ TCP_RTO_MAX);
+ req->expires = now + timeo;
+ reqp = &req->dl_next;
+ continue;
+ }
+
+ /* Drop this request */
+ write_lock(&tp->syn_wait_lock);
+ *reqp = req->dl_next;
+ write_unlock(&tp->syn_wait_lock);
+ lopt->qlen--;
+ if (req->retrans == 0)
+ lopt->qlen_young--;
+ tcp_openreq_free(req);
+ }
+ reqp = &req->dl_next;
}
- }
- if (next != ~0UL)
- mod_timer(&tcp_slow_timer, (now + next));
-}
-/* __tcp_inc_slow_timer is called when an slow timer is started
- * first time (slt->count was 0). There is race condition between
- * timer creation and deletion and if we do not force adding timer here,
- * we might lose timer. We could avoid it with global spinlock, but
- * it is apparently overkill, so that we restart timer ALWAYS when
- * this function is entered, it guarantees that timer will not lost.
- */
+ i = (i+1)&(TCP_SYNQ_HSIZE-1);
-void __tcp_inc_slow_timer(struct tcp_sl_timer *slt)
-{
- unsigned long now = jiffies;
- unsigned long when;
-
- slt->last = now;
+ } while (--budget > 0);
- when = now + slt->period;
+ lopt->clock_hand = i;
- if (tcp_slow_timer.prev &&
- (long)(tcp_slow_timer.expires - when) < 0)
- when = tcp_slow_timer.expires;
-
- mod_timer(&tcp_slow_timer, when);
+ if (lopt->qlen)
+ tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}
void tcp_delete_keepalive_timer (struct sock *sk)
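tcp_synack_timer() replaces the old global SYN-RECV scan: it is driven from the listening socket's own keepalive timer every TCP_SYNQ_INTERVAL, walks a bounded budget of hash buckets per tick, retransmits SYN-ACKs with exponential backoff, and, once the request queue is more than half full, keeps lowering the retransmission threshold so that old entries are dropped before young (never retransmitted) ones are squeezed out. A runnable model of that threshold reduction; the field names follow the patch, the sample numbers are made up:

#include <stdio.h>

/* Model of the young/old pruning in tcp_synack_timer():
 * if the accept backlog is more than half full, keep lowering the
 * retransmission threshold until the "young" (never retransmitted)
 * entries would dominate, but never below 2. */
static int pick_thresh(int qlen, int qlen_young, int max_qlen_log,
                       int max_retries)
{
    int thresh = max_retries;

    if (qlen >> (max_qlen_log - 1)) {     /* more than half full? */
        int young = qlen_young << 1;

        while (thresh > 2) {
            if (qlen < young)
                break;
            thresh--;
            young <<= 1;
        }
    }
    return thresh;
}

int main(void)
{
    /* Illustrative queue of 2^8 = 256 slots, default of 5 retries. */
    printf("idle listener:         thresh=%d\n", pick_thresh(10, 10, 8, 5));
    printf("half full, all young:  thresh=%d\n", pick_thresh(200, 180, 8, 5));
    printf("half full, mostly old: thresh=%d\n", pick_thresh(200, 10, 8, 5));
    return 0;
}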
@@ -595,6 +679,9 @@
void tcp_set_keepalive(struct sock *sk, int val)
{
+ if ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))
+ return;
+
if (val && !sk->keepopen)
tcp_reset_keepalive_timer(sk, keepalive_time_when(&sk->tp_pinfo.af_tcp));
else if (!val)
@@ -602,7 +689,7 @@
}
-void tcp_keepalive_timer (unsigned long data)
+static void tcp_keepalive_timer (unsigned long data)
{
struct sock *sk = (struct sock *) data;
struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
@@ -616,14 +703,31 @@
goto out;
}
- if (sk->state == TCP_FIN_WAIT2 && sk->dead)
+ if (sk->state == TCP_LISTEN) {
+ tcp_synack_timer(sk);
+ goto out;
+ }
+
+ if (sk->state == TCP_FIN_WAIT2 && sk->dead) {
+ if (tp->linger2 >= 0) {
+ int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+
+ if (tmo > 0) {
+ tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+ goto out;
+ }
+ }
+ tcp_send_active_reset(sk, GFP_ATOMIC);
goto death;
+ }
- if (!sk->keepopen)
+ if (!sk->keepopen || sk->state == TCP_CLOSE)
goto out;
elapsed = keepalive_time_when(tp);
- if (!((1<<sk->state) & (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT2)))
+
+ /* It is alive without keepalive 8) */
+ if (tp->packets_out || tp->send_head)
goto resched;
elapsed = tcp_time_stamp - tp->rcv_tstamp;
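For plain keepalive the timer now re-arms relative to the last segment received: with data still in flight the connection is trivially alive and the probe is simply pushed a full keepalive period out, otherwise the idle time decides between probing now and sleeping for the remainder. A small runnable model of that decision, in illustrative jiffies:

#include <stdio.h>

#define HZ             100
#define KEEPALIVE_TIME (7200 * HZ)   /* illustrative 2 h idle threshold */

/* Returns how long to sleep before the next keepalive action,
 * roughly mirroring the elapsed/resched logic in tcp_keepalive_timer(). */
static unsigned long next_keepalive(unsigned long now,
                                    unsigned long rcv_tstamp,
                                    int packets_in_flight)
{
    unsigned long elapsed;

    if (packets_in_flight)           /* "alive without keepalive 8)" */
        return KEEPALIVE_TIME;

    elapsed = now - rcv_tstamp;
    if (elapsed >= KEEPALIVE_TIME)
        return 0;                    /* idle long enough: probe now  */
    return KEEPALIVE_TIME - elapsed; /* sleep for the remainder      */
}

int main(void)
{
    unsigned long now = 20000UL * HZ;

    printf("data in flight : sleep %lu s\n",
           next_keepalive(now, now - 100 * HZ, 1) / HZ);
    printf("idle 1 hour    : sleep %lu s\n",
           next_keepalive(now, now - 3600UL * HZ, 0) / HZ);
    printf("idle 3 hours   : probe in %lu s\n",
           next_keepalive(now, now - 10800UL * HZ, 0) / HZ);
    return 0;
}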
@@ -632,28 +736,30 @@
if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
(tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_write_err(sk, 1);
+ tcp_write_err(sk);
goto out;
}
- tp->probes_out++;
- tp->pending = TIME_KEEPOPEN;
- tcp_write_wakeup(sk);
- elapsed = keepalive_intvl_when(tp);
+ if (tcp_write_wakeup(sk) <= 0) {
+ tp->probes_out++;
+ elapsed = keepalive_intvl_when(tp);
+ } else {
+ /* If keepalive was lost due to local congestion,
+ * try harder.
+ */
+ elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+ }
} else {
/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
- if (keepalive_time_when(tp) > elapsed)
- elapsed = keepalive_time_when(tp) - elapsed;
- else
- elapsed = 0;
+ elapsed = keepalive_time_when(tp) - elapsed;
}
+ TCP_CHECK_TIMER(sk);
+
resched:
tcp_reset_keepalive_timer (sk, elapsed);
goto out;
death:
- tcp_set_state(sk, TCP_CLOSE);
- tcp_clear_xmit_timers(sk);
tcp_done(sk);
out:
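The last hunks tighten the probe accounting: probes_out is only charged when tcp_write_wakeup() actually got a probe out, a locally failed send (out of memory, busy device) is retried after the much shorter TCP_RESOURCE_PROBE_INTERVAL without counting against the limit, and once the limit is hit the connection is reset and torn down through the simplified tcp_write_err()/tcp_done() path. A runnable model of that accounting; the constants and the send stub are invented for illustration:

#include <stdio.h>

#define HZ                           100
#define KEEPALIVE_INTVL              (75 * HZ)  /* illustrative probe spacing */
#define TCP_RESOURCE_PROBE_INTERVAL  (HZ / 2)   /* illustrative short retry   */
#define KEEPALIVE_PROBES             9

/* Stub for tcp_write_wakeup(): <= 0 means the probe left the box,
 * > 0 means it failed locally (e.g. out of memory).               */
static int write_wakeup(int attempt)
{
    return (attempt == 2) ? 1 : 0;   /* pretend attempt 2 fails locally */
}

int main(void)
{
    int probes_out = 0;
    int attempt;

    for (attempt = 1; attempt <= 4; attempt++) {
        unsigned long next;

        if (probes_out >= KEEPALIVE_PROBES) {
            printf("attempt %d: limit reached, reset connection\n", attempt);
            break;
        }
        if (write_wakeup(attempt) <= 0) {
            probes_out++;                        /* probe really went out */
            next = KEEPALIVE_INTVL;
        } else {
            next = TCP_RESOURCE_PROBE_INTERVAL;  /* try harder            */
        }
        printf("attempt %d: probes_out=%d, next probe in %.1f s\n",
               attempt, probes_out, (double)next / HZ);
    }
    return 0;
}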