patch-1.3.42 linux/net/ipv4/route.c

diff -u --recursive --new-file v1.3.41/linux/net/ipv4/route.c linux/net/ipv4/route.c
@@ -35,6 +35,8 @@
  *		Alan Cox	:	Aligned routing errors more closely with BSD
  *					our system is still very different.
  *		Alan Cox	:	Faster /proc handling
+ *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
+ *					routing caches and better behaviour.
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -42,8 +44,10 @@
  *		2 of the License, or (at your option) any later version.
  */
 
+#include <linux/config.h>
 #include <asm/segment.h>
 #include <asm/system.h>
+#include <asm/bitops.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -65,102 +69,246 @@
 #include <net/netlink.h>
 
 /*
- *	The routing table list
+ * Forwarding Information Base definitions.
  */
 
-static struct rtable *rt_base = NULL;
-unsigned long rt_stamp = 1;		/* Routing table version stamp for caches ( 0 is 'unset' ) */
+struct fib_node
+{
+	struct fib_node		*fib_next;
+	__u32			fib_dst;
+	unsigned long		fib_use;
+	struct fib_info		*fib_info;
+	short			fib_metric;
+	unsigned char		fib_tos;
+};
+
+/*
+ * This structure contains data shared by many routes.
+ */
+
+struct fib_info
+{
+	struct fib_info		*fib_next;
+	struct fib_info		*fib_prev;
+	__u32			fib_gateway;
+	struct device		*fib_dev;
+	int			fib_refcnt;
+	unsigned long		fib_window;
+	unsigned short		fib_flags;
+	unsigned short		fib_mtu;
+	unsigned short		fib_irtt;
+};
+
+struct fib_zone
+{
+	struct fib_zone	*fz_next;
+	struct fib_node	**fz_hash_table;
+	struct fib_node	*fz_list;
+	int		fz_nent;
+	int		fz_logmask;
+	__u32		fz_mask;
+};
+
+static struct fib_zone 	*fib_zones[33];
+static struct fib_zone 	*fib_zone_list;
+static struct fib_node 	*fib_loopback = NULL;
+static struct fib_info 	*fib_info_list;
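
The layout above splits the FIB into one fib_zone per netmask length: fib_zones[] gives direct access by the number of host bits (0 for host routes up to 32 for the default route), while fib_zone_list chains the populated zones most-specific-first so a lookup can stop at the first matching zone. A minimal userspace sketch of the insertion that maintains both views, mirroring the walk fib_add_1() performs further down (zone_link and the reduced types are hypothetical, and the cli/sti locking is omitted):

#include <stddef.h>

/* Reduced zone record; logmask = number of host (zero) bits in the mask. */
struct zone {
	struct zone *next;
	int logmask;
};

static struct zone *zones[33];		/* direct index by mask length */
static struct zone *zone_list;		/* sorted, most specific first */

/* Link a new zone into both views: find the nearest more-specific
 * zone already present and splice in right after it. */
static void zone_link(struct zone *fz)
{
	int i;

	for (i = fz->logmask - 1; i >= 0; i--)
		if (zones[i])
			break;
	if (i < 0) {
		fz->next = zone_list;
		zone_list = fz;
	} else {
		fz->next = zones[i]->next;
		zones[i]->next = fz;
	}
	zones[fz->logmask] = fz;
}

int main(void)
{
	static struct zone def = { NULL, 32 }, host = { NULL, 0 }, net = { NULL, 8 };

	zone_link(&def);	/* default-route zone                 */
	zone_link(&host);	/* host routes: goes to the list head */
	zone_link(&net);	/* /24 zone: lands between the two    */
	return (zone_list == &host && host.next == &net && net.next == &def) ? 0 : 1;
}
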
 
 /*
- *	Pointer to the loopback route
+ * Backlogging.
  */
- 
-static struct rtable *rt_loopback = NULL;
+
+#define RT_BH_REDIRECT		0
+#define RT_BH_GARBAGE_COLLECT 	1
+#define RT_BH_FREE	 	2
+
+struct rt_req
+{
+	struct rt_req * rtr_next;
+	struct device *dev;
+	__u32 dst;
+	__u32 gw;
+	unsigned char tos;
+};
+
+int		    	ip_rt_lock;
+unsigned		ip_rt_bh_mask;
+static struct rt_req 	*rt_backlog;
 
 /*
- *	Remove a routing table entry.
+ * Route cache.
  */
 
-static int rt_del(__u32 dst, __u32 mask,
-		char *devname, __u32 gtw, short rt_flags, short metric)
+struct rtable 		*ip_rt_hash_table[RT_HASH_DIVISOR];
+static int		rt_cache_size;
+static struct rtable 	*rt_free_queue;
+struct wait_queue	*rt_wait;
+
+static void rt_kick_backlog(void);
+static void rt_cache_add(unsigned hash, struct rtable * rth);
+static void rt_cache_flush(void);
+static void rt_garbage_collect_1(void);
+
+/* 
+ * Evaluate mask length.
+ */
+
+static __inline__ int rt_logmask(__u32 mask)
 {
-	struct rtable *r, **rp;
-	unsigned long flags;
-	int found=0;
+	if (!(mask = ntohl(mask)))
+		return 32;
+	return ffz(~mask);
+}
 
-	rp = &rt_base;
-	
-	/*
-	 *	This must be done with interrupts off because we could take
-	 *	an ICMP_REDIRECT.
-	 */
-	 
-	save_flags(flags);
-	cli();
-	while((r = *rp) != NULL) 
+/* 
+ * Create mask from length.
+ */
+
+static __inline__ __u32 rt_mask(int logmask)
+{
+	if (logmask >= 32)
+		return 0;
+	return htonl(~((1<<logmask)-1));
+}
+
+static __inline__ unsigned fz_hash_code(__u32 dst, int logmask)
+{
+	return ip_rt_hash_code(ntohl(dst)>>logmask);
+}
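
rt_logmask() converts a netmask (network byte order) into its count of host bits via ffz(), rt_mask() is its inverse, and fz_hash_code() then hashes only the network part of an address. A portable round-trip check of the pair; the only liberty taken is replacing ffz() from asm/bitops.h with a plain loop so the sketch builds in userspace:

#include <arpa/inet.h>
#include <stdio.h>
#include <stdint.h>

static int ffz32(uint32_t x)			/* stand-in for the kernel ffz() */
{
	int i;

	for (i = 0; i < 32; i++)
		if (!(x & (1u << i)))
			return i;
	return 32;
}

static int rt_logmask(uint32_t mask)		/* mask in network byte order */
{
	if (!(mask = ntohl(mask)))
		return 32;
	return ffz32(~mask);
}

static uint32_t rt_mask(int logmask)
{
	if (logmask >= 32)
		return 0;
	return htonl(~((1u << logmask) - 1));
}

int main(void)
{
	int l;

	for (l = 0; l <= 32; l++)
		if (rt_logmask(rt_mask(l)) != l)
			printf("round trip broke at %d\n", l);
	printf("/24 mask -> logmask %d\n", rt_logmask(htonl(0xffffff00)));	/* 8 */
	return 0;
}
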
+
+/*
+ * Free FIB node.
+ */
+
+static void fib_free_node(struct fib_node * f)
+{
+	struct fib_info * fi = f->fib_info;
+	if (!--fi->fib_refcnt)
 	{
-		/*
-		 *	Make sure the destination and netmask match.
-		 *	metric, gateway and device are also checked
-		 *	if they were specified.
-		 */
-		if (r->rt_dst != dst ||
-		    (mask && r->rt_mask != mask) ||
-		    (gtw && r->rt_gateway != gtw) ||
-		    (metric >= 0 && r->rt_metric != metric) ||
-		    (devname && strcmp((r->rt_dev)->name,devname) != 0) )
+#if RT_CACHE_DEBUG >= 2
+		printk("fib_free_node: fi %08x/%s is free\n", fi->fib_gateway, fi->fib_dev->name);
+#endif
+		if (fi->fib_next)
+			fi->fib_next->fib_prev = fi->fib_prev;
+		if (fi->fib_prev)
+			fi->fib_prev->fib_next = fi->fib_next;
+		if (fi == fib_info_list)
+			fib_info_list = fi->fib_next;
+	}
+	kfree_s(f, sizeof(struct fib_node));
+}
+
+/*
+ * Find gateway route by address.
+ */
+
+static struct fib_node * fib_lookup_gateway(__u32 dst)
+{
+	struct fib_zone * fz;
+	struct fib_node * f;
+
+	for (fz = fib_zone_list; fz; fz = fz->fz_next) 
+	{
+		if (fz->fz_hash_table)
+			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
+		else
+			f = fz->fz_list;
+		
+		for ( ; f; f = f->fib_next)
 		{
-			rp = &r->rt_next;
-			continue;
+			if ((dst ^ f->fib_dst) & fz->fz_mask)
+				continue;
+			if (f->fib_info->fib_flags & RTF_GATEWAY)
+				return NULL;
+			return f;
 		}
-		*rp = r->rt_next;
-		
-		/*
-		 *	If we delete the loopback route update its pointer.
-		 */
-		 
-		if (rt_loopback == r)
-			rt_loopback = NULL;
-		ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, rt_flags, metric, r->rt_dev->name);
-		kfree_s(r, sizeof(struct rtable));
-		found=1;
-	} 
-	rt_stamp++;		/* New table revision */
-	
-	restore_flags(flags);
-	
-	if(found)
-		return 0;
-	return -ESRCH;
+	}
+	return NULL;
 }
 
+/*
+ * Find local route by address.
+ * FIXME: I use the "longest match" principle. If the destination
+ *	  has some non-local route, I will not search shorter matches.
+ *	  I may be wrong, but I wanted to prevent the following
+ *	  situation:
+ *	route add 193.233.7.128 netmask 255.255.255.192 gw xxxxxx
+ *	route add 193.233.7.0	netmask 255.255.255.0 eth1
+ *	  (Two ethernets connected by a serial line, one small and the other large.)
+ *	  Host 193.233.7.129 is locally unreachable,
+ *	  but the old (<=1.3.37) code would send packets destined for it to eth1.
+ *
+ */
+
+static struct fib_node * fib_lookup_local(__u32 dst)
+{
+	struct fib_zone * fz;
+	struct fib_node * f;
+
+	for (fz = fib_zone_list; fz; fz = fz->fz_next) 
+	{
+		int longest_match_found = 0;
+
+		if (fz->fz_hash_table)
+			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
+		else
+			f = fz->fz_list;
+		
+		for ( ; f; f = f->fib_next)
+		{
+			if ((dst ^ f->fib_dst) & fz->fz_mask)
+				continue;
+			if (!(f->fib_info->fib_flags & RTF_GATEWAY))
+				return f;
+			longest_match_found = 1;
+		}
+		if (longest_match_found)
+			return NULL;
+	}
+	return NULL;
+}
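
Concretely, for the FIXME example above: 193.233.7.129 falls inside the more specific 193.233.7.128/26 gateway route, so the search stops before the directly attached /24 is ever considered, and local delivery fails as intended. A condensed userspace rendering of that control flow (hypothetical types, one candidate per zone, addresses as plain 32-bit values):

#include <stdio.h>
#include <stdint.h>

struct route { uint32_t dst, mask; int gateway; const char *via; };

/* One candidate per zone, already ordered most specific first. */
static struct route table[] = {
	{ 0xc1e90780, 0xffffffc0, 1, "gw"   },	/* 193.233.7.128/26 via gateway */
	{ 0xc1e90700, 0xffffff00, 0, "eth1" },	/* 193.233.7.0/24 on-link       */
};

static const char *lookup_local(uint32_t dst)
{
	int i;

	for (i = 0; i < 2; i++) {
		if ((dst ^ table[i].dst) & table[i].mask)
			continue;
		if (!table[i].gateway)
			return table[i].via;	/* local route found             */
		return NULL;			/* longest match non-local: stop */
	}
	return NULL;
}

int main(void)
{
	/* 193.233.7.129 == 0xc1e90781 matches the /26 first, so it is
	 * reported locally unreachable although the /24 would match too. */
	printf("%s\n", lookup_local(0xc1e90781) ? "local" : "not local");
	return 0;
}
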
 
 /*
- *	Remove all routing table entries for a device. This is called when
- *	a device is downed.
+ * Main lookup routine.
+ *	IMPORTANT NOTE: this algorithm has a small user-visible difference
+ *	from <=1.3.37: it doesn't route non-CIDR broadcasts by default.
+ *
+ *	E.g.
+ *		ifconfig eth0 193.233.7.65 netmask 255.255.255.192 broadcast 193.233.7.255
+ *	is valid, but if you really are not able (not allowed to, or do not
+ *	want to) use the CIDR-compliant broadcast 193.233.7.127, you should
+ *	add a host route:
+ *		route add -host 193.233.7.255 eth0
  */
- 
-void ip_rt_flush(struct device *dev)
+
+static struct fib_node * fib_lookup(__u32 dst)
 {
-	struct rtable *r;
-	struct rtable **rp;
-	unsigned long flags;
+	struct fib_zone * fz;
+	struct fib_node * f;
 
-	rp = &rt_base;
-	save_flags(flags);
-	cli();
-	while ((r = *rp) != NULL) {
-		if (r->rt_dev != dev) {
-			rp = &r->rt_next;
-			continue;
+	for (fz = fib_zone_list; fz; fz = fz->fz_next) 
+	{
+		if (fz->fz_hash_table)
+			f = fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
+		else
+			f = fz->fz_list;
+		
+		for ( ; f; f = f->fib_next)
+		{
+			if ((dst ^ f->fib_dst) & fz->fz_mask)
+				continue;
+			return f;
 		}
-		*rp = r->rt_next;
-		if (rt_loopback == r)
-			rt_loopback = NULL;
-		kfree_s(r, sizeof(struct rtable));
-	} 
-	rt_stamp++;		/* New table revision */
-	restore_flags(flags);
+	}
+	return NULL;
+}
+
+static __inline__ struct device * get_gw_dev(__u32 gw)
+{
+	struct fib_node * f;
+	f = fib_lookup_gateway(gw);
+	if (f)
+		return f->fib_info->fib_dev;
+	return NULL;
 }
 
 /*
@@ -199,228 +347,1244 @@
 }
 
 
-/*
- *	Find the route entry through which our gateway will be reached
- */
- 
-static inline struct device * get_gw_dev(__u32 gw)
+/*
+ *	Check if a mask is acceptable.
+ */
+ 
+static inline int bad_mask(__u32 mask, __u32 addr)
+{
+	if (addr & (mask = ~mask))
+		return 1;
+	mask = ntohl(mask);
+	if (mask & (mask+1))
+		return 1;
+	return 0;
+}
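
bad_mask() rejects a netmask on two grounds: the address has host bits set outside the mask, or the mask itself is non-contiguous (its complement is not of the form 2^n - 1). A userspace spot check of both rules, with the same logic and fixed-width types assumed:

#include <arpa/inet.h>
#include <stdio.h>
#include <stdint.h>

/* Same logic as bad_mask() above; mask and addr in network byte order. */
static int bad_mask(uint32_t mask, uint32_t addr)
{
	if (addr & (mask = ~mask))
		return 1;		/* host bits set outside the mask  */
	mask = ntohl(mask);
	if (mask & (mask + 1))
		return 1;		/* ~mask not 2^n - 1: gaps in mask */
	return 0;
}

int main(void)
{
	printf("%d\n", bad_mask(htonl(0xffffff00), htonl(0xc0a80100)));	/* 0: net addr ok   */
	printf("%d\n", bad_mask(htonl(0xffffff00), htonl(0xc0a80101)));	/* 1: host bits set */
	printf("%d\n", bad_mask(htonl(0xff00ff00), htonl(0xc0000000)));	/* 1: gappy mask    */
	return 0;
}
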
+
+
+static int fib_del_list(struct fib_node **fp, __u32 dst,
+		struct device * dev, __u32 gtw, short flags, short metric, __u32 mask)
+{
+	struct fib_node *f;
+	int found=0;
+
+	while((f = *fp) != NULL) 
+	{
+		struct fib_info * fi = f->fib_info;
+
+		/*
+		 *	Make sure the destination and netmask match.
+		 *	metric, gateway and device are also checked
+		 *	if they were specified.
+		 */
+		if (f->fib_dst != dst ||
+		    (gtw && fi->fib_gateway != gtw) ||
+		    (metric >= 0 && f->fib_metric != metric) ||
+		    (dev && fi->fib_dev != dev) )
+		{
+			fp = &f->fib_next;
+			continue;
+		}
+		cli();
+		*fp = f->fib_next;
+		if (fib_loopback == f)
+			fib_loopback = NULL;
+		sti();
+		ip_netlink_msg(RTMSG_DELROUTE, dst, gtw, mask, flags, metric, fi->fib_dev->name);
+		fib_free_node(f);
+		found++;
+	}
+	return found;
+}
+
+static __inline__ int fib_del_1(__u32 dst, __u32 mask,
+		struct device * dev, __u32 gtw, short flags, short metric)
+{
+	struct fib_node **fp;
+	struct fib_zone *fz;
+	int found=0;
+
+	if (!mask)
+	{
+		for (fz=fib_zone_list; fz; fz = fz->fz_next)
+		{
+			int tmp;
+			if (fz->fz_hash_table)
+				fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
+			else
+				fp = &fz->fz_list;
+
+			tmp = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
+			fz->fz_nent -= tmp;
+			found += tmp;
+		}
+	} 
+	else
+	{
+		if ((fz = fib_zones[rt_logmask(mask)]) != NULL)
+		{
+			if (fz->fz_hash_table)
+				fp = &fz->fz_hash_table[fz_hash_code(dst, fz->fz_logmask)];
+			else
+				fp = &fz->fz_list;
+	
+			found = fib_del_list(fp, dst, dev, gtw, flags, metric, mask);
+			fz->fz_nent -= found;
+		}
+	}
+
+	if (found)
+	{
+		rt_cache_flush();
+		return 0;
+	}
+	return -ESRCH;
+}
+
+
+static struct fib_info * fib_create_info(__u32 gw, struct device * dev,
+					 unsigned short flags, unsigned short mss,
+					 unsigned long window, unsigned short irtt)
+{
+	struct fib_info * fi;
+
+	if (!(flags & RTF_MSS))
+	{
+		mss = dev->mtu;
+#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
+		/*
+		 *	If MTU was not specified, use default.
+		 *	If you want to increase MTU for some net (local subnet)
+		 *	use "route add .... mss xxx".
+		 *
+		 * 	The MTU isn't currently always used and computed as it
+		 *	should be, as far as I can tell. [Still verifying this is right.]
+		 */
+		if ((flags & RTF_GATEWAY) && mss > 576)
+			mss = 576;
+#endif
+	}
+	if (!(flags & RTF_WINDOW))
+		window = 0;
+	if (!(flags & RTF_IRTT))
+		irtt = 0;
+
+	for (fi=fib_info_list; fi; fi = fi->fib_next)
+	{
+		if (fi->fib_gateway != gw ||
+		    fi->fib_dev != dev  ||
+		    fi->fib_flags != flags ||
+		    fi->fib_mtu != mss ||
+		    fi->fib_window != window ||
+		    fi->fib_irtt != irtt)
+			continue;
+		fi->fib_refcnt++;
+#if RT_CACHE_DEBUG >= 2
+		printk("fib_create_info: fi %08x/%s is duplicate\n", fi->fib_gateway, fi->fib_dev->name);
+#endif
+		return fi;
+	}
+	fi = (struct fib_info*)kmalloc(sizeof(struct fib_info), GFP_KERNEL);
+	if (!fi)
+		return NULL;
+	memset(fi, 0, sizeof(struct fib_info));
+	fi->fib_flags = flags;
+	fi->fib_dev = dev;
+	fi->fib_gateway = gw;
+	fi->fib_mtu = mss;
+	fi->fib_window = window;
+	fi->fib_refcnt++;
+	fi->fib_next = fib_info_list;
+	fi->fib_prev = NULL;
+	if (fib_info_list)
+		fib_info_list->fib_prev = fi;
+	fib_info_list = fi;
+#if RT_CACHE_DEBUG >= 2
+	printk("fib_create_info: fi %08x/%s is created\n", fi->fib_gateway, fi->fib_dev->name);
+#endif
+	return fi;
+}
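
fib_create_info() deduplicates per-nexthop attributes: routes whose (gateway, device, flags, mtu, window, irtt) tuples are identical share one reference-counted fib_info, so thousands of routes through a single gateway cost one record. The sharing pattern in miniature (hypothetical reduced type keyed on the gateway alone, no locking):

#include <stdlib.h>
#include <stdint.h>

/* Reduced fib_info: dedup by value, share by refcount. */
struct info { struct info *next; uint32_t gw; int refcnt; };

static struct info *info_list;

static struct info *info_get(uint32_t gw)
{
	struct info *fi;

	for (fi = info_list; fi; fi = fi->next)
		if (fi->gw == gw) {
			fi->refcnt++;		/* share the existing record */
			return fi;
		}
	fi = calloc(1, sizeof(*fi));
	if (!fi)
		return NULL;
	fi->gw = gw;
	fi->refcnt = 1;
	fi->next = info_list;
	info_list = fi;
	return fi;
}

static void info_put(struct info *fi)
{
	struct info **fp;

	if (--fi->refcnt)
		return;
	for (fp = &info_list; *fp && *fp != fi; fp = &(*fp)->next)
		;				/* singly linked here for brevity */
	if (*fp)
		*fp = fi->next;
	free(fi);
}

int main(void)
{
	struct info *a = info_get(0x0a000001);	/* 10.0.0.1           */
	struct info *b = info_get(0x0a000001);	/* same tuple: shared */
	int shared = (a == b);

	info_put(b);
	info_put(a);
	return shared ? 0 : 1;
}
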
+
+
+static __inline__ void fib_add_1(short flags, __u32 dst, __u32 mask,
+	__u32 gw, struct device *dev, unsigned short mss,
+	unsigned long window, unsigned short irtt, short metric)
+{
+	struct fib_node *f, *f1;
+	struct fib_node **fp;
+	struct fib_node **dup_fp = NULL;
+	struct fib_zone * fz;
+	struct fib_info * fi;
+	int logmask;
+
+	if (flags & RTF_HOST) 
+		mask = 0xffffffff;
+	/*
+	 * If mask is not specified, try to guess it.
+	 */
+	else if (!mask)
+	{
+		if (!((dst ^ dev->pa_addr) & dev->pa_mask)) 
+		{
+			mask = dev->pa_mask;
+			flags &= ~RTF_GATEWAY;
+			if (flags & RTF_DYNAMIC) 
+			{
+				printk("Dynamic route to my own net rejected\n");
+				return;
+			}
+		} 
+		else
+			mask = guess_mask(dst, dev);
+		dst &= mask;
+	}
+	
+	/*
+	 *	A gateway must be reachable and not a local address
+	 */
+	 
+	if (gw == dev->pa_addr)
+		flags &= ~RTF_GATEWAY;
+		
+	if (flags & RTF_GATEWAY) 
+	{
+		/*
+		 *	Don't try to add a gateway we can't reach.. 
+		 */
+		 
+		if (dev != get_gw_dev(gw))
+			return;
+			
+		flags |= RTF_GATEWAY;
+	} 
+	else
+		gw = 0;
+		
+	/*
+	 *	Allocate an entry and fill it in.
+	 */
+	 
+	f = (struct fib_node *) kmalloc(sizeof(struct fib_node), GFP_KERNEL);
+	if (f == NULL)
+		return;
+
+	memset(f, 0, sizeof(struct fib_node));
+	f->fib_dst = dst;
+	f->fib_metric = metric;
+	f->fib_tos    = 0;
+
+	if  ((fi = fib_create_info(gw, dev, flags, mss, window, irtt)) == NULL)
+	{
+		kfree_s(f, sizeof(struct fib_node));
+		return;
+	}
+	f->fib_info = fi;
+
+	logmask = rt_logmask(mask);
+	fz = fib_zones[logmask];
+
+
+	if (!fz)
+	{
+		int i;
+		fz = kmalloc(sizeof(struct fib_zone), GFP_KERNEL);
+		if (!fz)
+		{
+			fib_free_node(f);
+			return;
+		}
+		memset(fz, 0, sizeof(struct fib_zone));
+		fz->fz_logmask = logmask;
+		fz->fz_mask = mask;
+		for (i=logmask-1; i>=0; i--)
+			if (fib_zones[i])
+				break;
+		cli();
+		if (i<0)
+		{
+			fz->fz_next = fib_zone_list;
+			fib_zone_list = fz;
+		}
+		else
+		{
+			fz->fz_next = fib_zones[i]->fz_next;
+			fib_zones[i]->fz_next = fz;
+		}
+		fib_zones[logmask] = fz;
+		sti();
+	}
+
+	/*
+	 * If zone overgrows RTZ_HASHING_LIMIT, create hash table.
+	 */
+
+	if (fz->fz_nent >= RTZ_HASHING_LIMIT && !fz->fz_hash_table && logmask<32)
+	{
+		struct fib_node ** ht;
+#if RT_CACHE_DEBUG
+		printk("fib_add_1: hashing for zone %d started\n", logmask);
+#endif
+		ht = kmalloc(RTZ_HASH_DIVISOR*sizeof(struct fib_node*), GFP_KERNEL);
+
+		if (ht)
+		{
+			memset(ht, 0, RTZ_HASH_DIVISOR*sizeof(struct fib_node*));
+			cli();
+			f1 = fz->fz_list;
+			while (f1)
+			{
+				struct fib_node * next;
+				unsigned hash = fz_hash_code(f1->fib_dst, logmask);
+				next = f1->fib_next;
+				f1->fib_next = ht[hash];
+				ht[hash] = f1;
+				f1 = next;
+			}
+			fz->fz_list = NULL;
+			fz->fz_hash_table = ht; 
+			sti();
+		}
+	}
+
+	if (fz->fz_hash_table)
+		fp = &fz->fz_hash_table[fz_hash_code(dst, logmask)];
+	else
+		fp = &fz->fz_list;
+
+	/*
+	 * Scan list to find the first route with the same destination
+	 */
+	while ((f1 = *fp) != NULL)
+	{
+		if (f1->fib_dst == dst)
+			break;
+		fp = &f1->fib_next;
+	}
+
+	/*
+	 * Find route with the same destination and less (or equal) metric.
+	 */
+	while ((f1 = *fp) != NULL && f1->fib_dst == dst)
+	{
+		if (f1->fib_metric >= metric)
+			break;
+		/*
+		 *	Record route with the same destination and gateway,
+		 *	but less metric. We'll delete it 
+		 *	after instantiation of new route.
+		 */
+		if (f1->fib_info->fib_gateway == gw)
+			dup_fp = fp;
+		fp = &f1->fib_next;
+	}
+
+	/*
+	 * Is it already present?
+	 */
+
+	if (f1 && f1->fib_metric == metric && f1->fib_info == fi)
+	{
+		fib_free_node(f);
+		return;
+	}
+	
+	/*
+	 * Insert new entry to the list.
+	 */
+
+	cli();
+	f->fib_next = f1;
+	*fp = f;
+	if (!fib_loopback && (fi->fib_dev->flags & IFF_LOOPBACK))
+		fib_loopback = f;
+	sti();
+	fz->fz_nent++;
+	ip_netlink_msg(RTMSG_NEWROUTE, dst, gw, mask, flags, metric, fi->fib_dev->name);
+
+	/*
+	 *	Delete route with the same destination and gateway.
+	 *	Note that we should have at most one such route.
+	 */
+	if (dup_fp)
+		fp = dup_fp;
+	else
+		fp = &f->fib_next;
+
+	while ((f1 = *fp) != NULL && f1->fib_dst == dst)
+	{
+		if (f1->fib_info->fib_gateway == gw)
+		{
+			cli();
+			*fp = f1->fib_next;
+			if (fib_loopback == f1)
+				fib_loopback = NULL;
+			sti();
+			ip_netlink_msg(RTMSG_DELROUTE, dst, gw, mask, flags, metric, f1->fib_info->fib_dev->name);
+			fib_free_node(f1);
+			fz->fz_nent--;
+			break;
+		}
+		fp = &f1->fib_next;
+	}
+	rt_cache_flush();
+	return;
+}
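
Throughout fib_add_1() the chain is walked with a struct fib_node ** cursor, the pointer-to-pointer idiom: *fp is always "the link to patch", whether that link is the chain head or some fib_next field, so neither the metric-ordered insert nor the later duplicate removal needs a head special case. The core of the insert, extracted into a standalone sketch (sorted_insert and the node type are hypothetical):

#include <stddef.h>
#include <stdint.h>

struct node { struct node *next; uint32_t dst; short metric; };

/* Insert n keeping entries grouped by dst and, within a group,
 * sorted by ascending metric. */
static void sorted_insert(struct node **fp, struct node *n)
{
	struct node *f;

	while ((f = *fp) != NULL && f->dst != n->dst)
		fp = &f->next;				/* find the dst group   */
	while ((f = *fp) != NULL && f->dst == n->dst && f->metric < n->metric)
		fp = &f->next;				/* find the metric slot */
	n->next = f;
	*fp = n;
}

int main(void)
{
	struct node a = { NULL, 1, 5 }, b = { NULL, 1, 1 }, *head = &a;

	sorted_insert(&head, &b);	/* lower metric: becomes the head */
	return head->metric == 1 ? 0 : 1;
}
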
+
+static int rt_flush_list(struct fib_node ** fp, struct device *dev)
+{
+	int found = 0;
+	struct fib_node *f;
+
+	while ((f = *fp) != NULL) {
+		if (f->fib_info->fib_dev != dev) {
+			fp = &f->fib_next;
+			continue;
+		}
+		cli();
+		*fp = f->fib_next;
+		if (fib_loopback == f)
+			fib_loopback = NULL;
+		sti();
+		fib_free_node(f);
+		found++;
+	}
+	return found;
+}
+
+static __inline__ void fib_flush_1(struct device *dev)
+{
+	struct fib_zone *fz;
+	int found = 0;
+
+	for (fz = fib_zone_list; fz; fz = fz->fz_next)
+	{
+		if (fz->fz_hash_table)
+		{
+			int i;
+			int tmp = 0;
+			for (i=0; i<RTZ_HASH_DIVISOR; i++)
+				tmp += rt_flush_list(&fz->fz_hash_table[i], dev);
+			fz->fz_nent -= tmp;
+			found += tmp;
+		}
+		else
+		{
+			int tmp;
+			tmp = rt_flush_list(&fz->fz_list, dev);
+			fz->fz_nent -= tmp;
+			found += tmp;
+		}
+	}
+		
+	if (found)
+		rt_cache_flush();
+}
+
+
+/* 
+ *	Called from the PROCfs module. This outputs /proc/net/route.
+ *
+ *	We preserve the old format but pad the buffers out. This means that
+ *	we can spin over the other entries as we read them. Remember the
+ *	gated BGP4 code could need to read 60,000+ routes on occasion (that's
+ *	about 7MB of data). To do that OK we will need to also cache the
+ *	last route we got to (reads will generally be following on from
+ *	one another without gaps).
+ */
+ 
+int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+	struct fib_zone *fz;
+	struct fib_node *f;
+	int len=0;
+	off_t pos=0;
+	char temp[129];
+	int i;
+	
+	pos = 128;
+
+	if (offset<128)
+	{
+		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
+		len = 128;
+  	}
+  	
+	while  (ip_rt_lock)
+		sleep_on(&rt_wait);
+	ip_rt_fast_lock();
+
+	for (fz=fib_zone_list; fz; fz = fz->fz_next)
+	{
+		int maxslot;
+		struct fib_node ** fp;
+
+		if (fz->fz_nent == 0)
+			continue;
+
+		if (pos + 128*fz->fz_nent <= offset)
+		{
+			pos += 128*fz->fz_nent;
+			len = 0;
+			continue;
+		}
+
+		if (fz->fz_hash_table)
+		{
+			maxslot = RTZ_HASH_DIVISOR;
+			fp	= fz->fz_hash_table;
+		}
+		else
+		{
+			maxslot	= 1;
+			fp	= &fz->fz_list;
+		}
+			
+		for (i=0; i < maxslot; i++, fp++)
+		{
+			
+			for (f = *fp; f; f = f->fib_next) 
+			{
+				struct fib_info * fi;
+				/*
+				 *	Spin through entries until we are ready
+				 */
+				pos += 128;
+
+				if (pos <= offset)
+				{
+					len=0;
+					continue;
+				}
+					
+				fi = f->fib_info;
+				sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
+					fi->fib_dev->name, (unsigned long)f->fib_dst, (unsigned long)fi->fib_gateway,
+					fi->fib_flags, 0, f->fib_use, f->fib_metric,
+					(unsigned long)fz->fz_mask, (int)fi->fib_mtu, fi->fib_window, (int)fi->fib_irtt);
+				sprintf(buffer+len,"%-127s\n",temp);
+
+				len += 128;
+				if (pos >= offset+length)
+					goto done;
+			}
+		}
+        }
+
+done:
+	ip_rt_unlock();
+	wake_up(&rt_wait);
+  	
+  	*start = buffer+len-(pos-offset);
+  	len = pos - offset;
+  	if (len>length)
+  		len = length;
+  	return len;
+}
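
Every line emitted above, the header included, is padded to exactly 128 bytes, so a reader's file offset maps to a record index by plain division; that is what lets the loop skip an entire zone at a time with pos += 128*fz->fz_nent instead of formatting entries nobody asked for. The arithmetic spelled out with illustrative values:

#include <stdio.h>

#define REC 128			/* fixed record size, header included */

int main(void)
{
	long offset = 65536;	/* caller's current read position         */
	long pos = REC;		/* consumed so far: just the header line  */
	long fz_nent = 400;	/* entries in the next zone               */

	if (pos + REC * fz_nent <= offset)
		pos += REC * fz_nent;	/* skip the whole zone in O(1) */

	printf("offset %ld = record %ld; pos after skip = %ld\n",
	       offset, offset / REC, pos);
	/* 60,000 routes * 128 bytes is about 7.3 MB, matching the
	 * comment at the top of rt_get_info(). */
	return 0;
}
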
+
+int rt_cache_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
+{
+	int len=0;
+	off_t pos=0;
+	char temp[129];
+	struct rtable *r;
+	int i;
+
+	pos = 128;
+
+	if (offset<128)
+	{
+		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tSource\t\tMTU\tWindow\tIRTT\tHH\tARP\n");
+		len = 128;
+  	}
+	
+  	
+	while  (ip_rt_lock)
+		sleep_on(&rt_wait);
+	ip_rt_fast_lock();
+
+	for (i = 0; i<RT_HASH_DIVISOR; i++)
+	{
+		for (r = ip_rt_hash_table[i]; r; r = r->rt_next) 
+		{
+			/*
+			 *	Spin through entries until we are ready
+			 */
+			pos += 128;
+
+			if (pos <= offset)
+			{
+				len = 0;
+				continue;
+			}
+					
+			sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%ld\t%lu\t%d\t%08lX\t%d\t%lu\t%u\t%ld\t%1d",
+				r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
+				r->rt_flags, r->rt_refcnt, r->rt_use, 0,
+				(unsigned long)r->rt_src, (int)r->rt_mtu, r->rt_window, (int)r->rt_irtt, r->rt_hh ? r->rt_hh->hh_refcnt : -1, r->rt_hh ? r->rt_hh->hh_uptodate : 0);
+			sprintf(buffer+len,"%-127s\n",temp);
+			len += 128;
+			if (pos >= offset+length)
+				goto done;
+		}
+        }
+
+done:
+	ip_rt_unlock();
+	wake_up(&rt_wait);
+  	
+  	*start = buffer+len-(pos-offset);
+  	len = pos-offset;
+  	if (len>length)
+  		len = length;
+  	return len;
+}
+
+
+static void rt_free(struct rtable * rt)
+{
+	unsigned long flags;
+
+	save_flags(flags);
+	cli();
+	if (!rt->rt_refcnt)
+	{
+		struct hh_cache * hh = rt->rt_hh;
+		rt->rt_hh = NULL;
+		if (hh && !--hh->hh_refcnt)
+		{
+			restore_flags(flags);
+			kfree_s(hh, sizeof(struct hh_cache));
+		}
+		restore_flags(flags);
+		kfree_s(rt, sizeof(struct rt_table));
+		return;
+	}
+	rt->rt_next = rt_free_queue;
+	rt->rt_flags &= ~RTF_UP;
+	rt_free_queue = rt;
+	ip_rt_bh_mask |= RT_BH_FREE;
+#if RT_CACHE_DEBUG >= 2
+	printk("rt_free: %08x\n", rt->rt_dst);
+#endif
+	restore_flags(flags);
+}
+
+/*
+ * RT "bottom half" handlers. Called with masked inetrrupts.
+ */
+
+static __inline__ void rt_kick_free_queue(void)
+{
+	struct rtable *rt, **rtp;
+
+	rtp = &rt_free_queue;
+
+	while ((rt = *rtp) != NULL)
+	{
+		if  (!rt->rt_refcnt)
+		{
+			struct hh_cache * hh = rt->rt_hh;
+#if RT_CACHE_DEBUG >= 2
+			__u32 daddr = rt->rt_dst;
+#endif
+			*rtp = rt->rt_next;
+			rt->rt_hh = NULL;
+			if (hh && !--hh->hh_refcnt)
+			{
+				sti();
+				kfree_s(hh, sizeof(struct hh_cache));
+			}
+			sti();
+			kfree_s(rt, sizeof(struct rt_table));
+#if RT_CACHE_DEBUG >= 2
+			printk("rt_kick_free_queue: %08x is free\n", daddr);
+#endif
+			cli();
+			continue;
+		}
+		rtp = &rt->rt_next;
+	}
+}
+
+void ip_rt_run_bh()
+{
+	unsigned long flags;
+	save_flags(flags);
+	cli();
+	if (ip_rt_bh_mask && !ip_rt_lock)
+	{
+		if (ip_rt_bh_mask & RT_BH_REDIRECT)
+			rt_kick_backlog();
+
+		if (ip_rt_bh_mask & RT_BH_GARBAGE_COLLECT)
+		{
+			ip_rt_fast_lock();
+			ip_rt_bh_mask &= ~RT_BH_GARBAGE_COLLECT;
+			sti();
+			rt_garbage_collect_1();
+			cli();
+			ip_rt_fast_unlock();
+		}
+
+		if (ip_rt_bh_mask & RT_BH_FREE)
+			rt_kick_free_queue();
+	}
+	restore_flags(flags);
+}
+
+
+void ip_rt_check_expire()
+{
+	ip_rt_fast_lock();
+	if (ip_rt_lock == 1)
+	{
+		int i;
+		struct rtable *rth, **rthp;
+		unsigned long flags;
+		unsigned long now = jiffies;
+
+		save_flags(flags);
+		for (i=0; i<RT_HASH_DIVISOR; i++)
+		{
+			rthp = &ip_rt_hash_table[i];
+
+			while ((rth = *rthp) != NULL)
+			{
+				struct rtable * rth_next = rth->rt_next;
+
+				/*
+				 * Cleanup aged off entries.
+				 */
+
+				cli();
+				if (!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
+				{
+					*rthp = rth_next;
+					sti();
+					rt_cache_size--;
+#if RT_CACHE_DEBUG >= 2
+					printk("rt_check_expire clean %02x@%08x\n", i, rth->rt_dst);
+#endif
+					rt_free(rth);
+					continue;
+				}
+				sti();
+
+				if (!rth_next)
+					break;
+
+				/*
+				 * LRU ordering.
+				 */
+
+				if (rth->rt_lastuse + RT_CACHE_BUBBLE_THRESHOULD < rth_next->rt_lastuse ||
+				    (rth->rt_lastuse < rth_next->rt_lastuse &&
+				     rth->rt_use < rth_next->rt_use))
+				{
+#if RT_CACHE_DEBUG >= 2
+					printk("rt_check_expire bubbled %02x@%08x<->%08x\n", i, rth->rt_dst, rth_next->rt_dst);
+#endif
+					cli();
+					*rthp = rth_next;
+					rth->rt_next = rth_next->rt_next;
+					rth_next->rt_next = rth;
+					sti();
+					rthp = &rth_next->rt_next;
+					continue;
+				}
+				rthp = &rth->rt_next;
+			}
+		}
+		restore_flags(flags);
+		rt_kick_free_queue();
+	}
+	ip_rt_unlock();
+}
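
The "LRU ordering" branch above is a lazy bubble sort: on each timer scan an entry swaps with its successor when the successor is markedly hotter, or both newer and busier, so popular routes drift toward the head of their hash chain one slot per pass. The three-pointer swap in isolation (hypothetical type; the real code brackets it with cli/sti):

#include <stdio.h>

struct rt { struct rt *next; unsigned long lastuse, use; };

/* Make b = a->next precede a; *rthp is the link currently aimed at a.
 * Mirrors the pointer surgery inside ip_rt_check_expire(). */
static void bubble_step(struct rt **rthp, struct rt *a)
{
	struct rt *b = a->next;		/* caller verified b exists    */

	*rthp = b;			/* incoming link now targets b */
	a->next = b->next;		/* a inherits b's successor    */
	b->next = a;			/* b precedes a                */
}

int main(void)
{
	struct rt b = { NULL, 10, 1 };
	struct rt a = { &b, 0, 0 };
	struct rt *head = &a;

	bubble_step(&head, &a);
	printf("chain head is now %s\n",
	       head == &b ? "the hotter entry" : "unchanged");
	return 0;
}
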
+
+static void rt_redirect_1(__u32 dst, __u32 gw, struct device *dev)
+{
+	struct rtable *rt;
+	unsigned long hash = ip_rt_hash_code(dst);
+
+	if (gw == dev->pa_addr)
+		return;
+	if (dev != get_gw_dev(gw))
+		return;
+	rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC);
+	if (rt == NULL) 
+		return;
+	memset(rt, 0, sizeof(struct rtable));
+	rt->rt_flags = RTF_DYNAMIC | RTF_MODIFIED | RTF_HOST | RTF_GATEWAY | RTF_UP;
+	rt->rt_dst = dst;
+	rt->rt_dev = dev;
+	rt->rt_gateway = gw;
+	rt->rt_src = dev->pa_addr;
+	rt->rt_mtu = dev->mtu;
+#ifdef CONFIG_NO_PATH_MTU_DISCOVERY
+	if (dev->mtu > 576)
+		rt->rt_mtu = 576;
+#endif
+	rt->rt_lastuse  = jiffies;
+	rt->rt_refcnt  = 1;
+	rt_cache_add(hash, rt);
+	ip_rt_put(rt);
+	return;
+}
+
+static void rt_cache_flush(void)
+{
+	int i;
+	struct rtable * rth, * next;
+
+	for (i=0; i<RT_HASH_DIVISOR; i++)
+	{
+		int nr=0;
+
+		cli();
+		if (!(rth = ip_rt_hash_table[i]))
+		{
+			sti();
+			continue;
+		}
+
+		ip_rt_hash_table[i] = NULL;
+		sti();
+
+		for (; rth; rth=next)
+		{
+			next = rth->rt_next;
+			rt_cache_size--;
+			nr++;
+			rth->rt_next = NULL;
+			rt_free(rth);
+		}
+#if RT_CACHE_DEBUG >= 2
+		if (nr > 0)
+			printk("rt_cache_flush: %d@%02x\n", nr, i);
+#endif
+	}
+#if RT_CACHE_DEBUG >= 1
+	if (rt_cache_size)
+	{
+		printk("rt_cache_flush: bug rt_cache_size=%d\n", rt_cache_size);
+		rt_cache_size = 0;
+	}
+#endif
+}
+
+static void rt_garbage_collect_1(void)
+{
+	int i;
+	unsigned expire = RT_CACHE_TIMEOUT>>1;
+	struct rtable * rth, **rthp;
+	unsigned long now = jiffies;
+
+	for (;;)
+	{
+		for (i=0; i<RT_HASH_DIVISOR; i++)
+		{
+			if (!ip_rt_hash_table[i])
+				continue;
+			for (rthp=&ip_rt_hash_table[i]; (rth=*rthp); rthp=&rth->rt_next)
+			{
+				if (rth->rt_lastuse + expire*(rth->rt_refcnt+1) > now)
+					continue;
+				rt_cache_size--;
+				cli();
+				*rthp=rth->rt_next;
+				rth->rt_next = NULL;
+				sti();
+				rt_free(rth);
+				break;
+			}
+		}
+		if (rt_cache_size < RT_CACHE_SIZE_MAX)
+			return;
+		expire >>= 1;
+	}
+}
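
The collector's policy: start with expire at half of RT_CACHE_TIMEOUT, evict any entry idle longer than expire*(refcnt+1) so that referenced entries get a proportionally longer grace period, and halve expire until the cache fits. A compressed userspace model of the loop (hypothetical types; unlike the kernel code, which frees at most one victim per bucket per pass and defers busy entries to rt_free(), this sweeps everything eligible):

#include <stdlib.h>

struct ent { struct ent *next; unsigned long lastuse; int refcnt; };

static int cache_size;

/* Sweep until the cache is under budget, halving the idle threshold
 * between rounds, as rt_garbage_collect_1() does. */
static void garbage_collect(struct ent **buckets, int nbuckets,
			    int budget, unsigned expire, unsigned long now)
{
	while (cache_size >= budget && expire) {
		int i;

		for (i = 0; i < nbuckets; i++) {
			struct ent **ep = &buckets[i], *e;

			while ((e = *ep) != NULL) {
				if (e->lastuse + expire * (e->refcnt + 1) <= now) {
					*ep = e->next;	/* unlink the victim */
					cache_size--;
					free(e);
					continue;
				}
				ep = &e->next;
			}
		}
		expire >>= 1;		/* still too big: be more aggressive */
	}
}

int main(void)
{
	struct ent *buckets[4] = { NULL };

	garbage_collect(buckets, 4, 1, 150, 1000);	/* empty cache: no-op */
	return cache_size;
}
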
+
+static __inline__ void rt_req_enqueue(struct rt_req **q, struct rt_req *rtr)
+{
+	unsigned long flags;
+	struct rt_req * tail;
+
+	save_flags(flags);
+	cli();
+	tail = *q;
+	if (!tail)
+		rtr->rtr_next = rtr;
+	else
+	{
+		rtr->rtr_next = tail->rtr_next;
+		tail->rtr_next = rtr;
+	}
+	*q = rtr;
+	restore_flags(flags);
+	return;
+}
+
+/*
+ * Caller should mask interrupts.
+ */
+
+static __inline__ struct rt_req * rt_req_dequeue(struct rt_req **q)
+{
+	struct rt_req * rtr;
+
+	if (*q)
+	{
+		rtr = (*q)->rtr_next;
+		(*q)->rtr_next = rtr->rtr_next;
+		if (rtr->rtr_next == rtr)
+			*q = NULL;
+		rtr->rtr_next = NULL;
+		return rtr;
+	}
+	return NULL;
+}
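
The backlog is a circular singly linked list addressed by its tail: *q is the tail and (*q)->rtr_next is the head, which makes both enqueue-at-tail and dequeue-at-head O(1) while storing a single pointer. The pair in standalone form (locking stripped; the kernel versions bracket the pointer updates with save_flags/cli):

#include <stddef.h>

struct req { struct req *next; };

/* Enqueue at the tail: *q is the tail, (*q)->next is the head. */
static void enqueue(struct req **q, struct req *r)
{
	struct req *tail = *q;

	if (!tail)
		r->next = r;		/* lone element points at itself */
	else {
		r->next = tail->next;	/* new tail -> old head          */
		tail->next = r;		/* old tail -> new tail          */
	}
	*q = r;				/* r becomes the tail            */
}

/* Dequeue from the head, emptying *q when the last element goes. */
static struct req *dequeue(struct req **q)
{
	struct req *r;

	if (!*q)
		return NULL;
	r = (*q)->next;			/* the head                */
	(*q)->next = r->next;		/* tail skips over it      */
	if (r->next == r)		/* it was the only element */
		*q = NULL;
	r->next = NULL;
	return r;
}

int main(void)
{
	struct req a, b, *q = NULL;

	enqueue(&q, &a);
	enqueue(&q, &b);
	return (dequeue(&q) == &a && dequeue(&q) == &b && !q) ? 0 : 1;
}
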
+
+/*
+   Called with masked interrupts
+ */
+
+static void rt_kick_backlog()
+{
+	if (!ip_rt_lock)
+	{
+		struct rt_req * rtr;
+
+		ip_rt_fast_lock();
+
+		while ((rtr = rt_req_dequeue(&rt_backlog)) != NULL)
+		{
+			sti();
+			rt_redirect_1(rtr->dst, rtr->gw, rtr->dev);
+			kfree_s(rtr, sizeof(struct rt_req));
+			cli();
+		}
+
+		ip_rt_bh_mask &= ~RT_BH_REDIRECT;
+
+		ip_rt_fast_unlock();
+	}
+}
+
+/*
+ * rt_{del|add|flush} called only from USER process. Waiting is OK.
+ */
+
+static int rt_del(__u32 dst, __u32 mask,
+		struct device * dev, __u32 gtw, short rt_flags, short metric)
+{
+	int retval;
+
+	while (ip_rt_lock)
+		sleep_on(&rt_wait);
+	ip_rt_fast_lock();
+	retval = fib_del_1(dst, mask, dev, gtw, rt_flags, metric);
+	ip_rt_unlock();
+	wake_up(&rt_wait);
+	return retval;
+}
+
+static void rt_add(short flags, __u32 dst, __u32 mask,
+	__u32 gw, struct device *dev, unsigned short mss,
+	unsigned long window, unsigned short irtt, short metric)
+{
+	while (ip_rt_lock)
+		sleep_on(&rt_wait);
+	ip_rt_fast_lock();
+	fib_add_1(flags, dst, mask, gw, dev, mss, window, irtt, metric);
+	ip_rt_unlock();
+	wake_up(&rt_wait);
+}
+
+void ip_rt_flush(struct device *dev)
+{
+	while (ip_rt_lock)
+		sleep_on(&rt_wait);
+	ip_rt_fast_lock();
+	fib_flush_1(dev);
+	ip_rt_unlock();
+	wake_up(&rt_wait);
+}
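
The three user-context entry points above share one discipline: sleep while the counting lock ip_rt_lock is nonzero, take the fast lock, do the table work, then drop the lock and wake waiters. Interrupt-side code never sleeps; it either finds ip_rt_lock == 1 (it is the sole holder) and works inline, or sets a bit in ip_rt_bh_mask for ip_rt_run_bh() to pick up later. A single-threaded model of that shape (sleep_on/wake_up and cli/sti elided; all names and values illustrative):

#include <stdio.h>

static int ip_rt_lock;			/* counting fast lock       */
static unsigned ip_rt_bh_mask;		/* pending bottom-half work */
#define RT_BH_GARBAGE_COLLECT	1

static void noop(void) {}

/* rt_del/rt_add/ip_rt_flush shape: wait, lock, work, unlock, wake. */
static void user_context(void (*op)(void))
{
	while (ip_rt_lock)
		;			/* sleep_on(&rt_wait) in the kernel */
	ip_rt_lock++;			/* ip_rt_fast_lock()                */
	op();
	ip_rt_lock--;			/* ip_rt_unlock(); also triggers    */
					/* any deferred ip_rt_run_bh() work */
	/* wake_up(&rt_wait) would go here */
}

/* rt_garbage_collect() shape: called with the fast lock already held. */
static void interrupt_context(void)
{
	if (ip_rt_lock == 1)		/* sole holder: safe to work inline */
		puts("collect now");
	else
		ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT;	/* defer to the bh */
}

int main(void)
{
	ip_rt_lock = 1;			/* as if inside rt_cache_add()   */
	interrupt_context();		/* runs inline                   */
	ip_rt_lock = 2;			/* someone else holds it too     */
	interrupt_context();		/* defers via the mask           */
	ip_rt_lock = 0;
	user_context(noop);		/* user path: lock, work, unlock */
	printf("bh mask = %u, lock = %d\n", ip_rt_bh_mask, ip_rt_lock);
	return 0;
}
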
+
+/*
+   Called by ICMP module.
+ */
+
+void ip_rt_redirect(__u32 src, __u32 dst, __u32 gw, struct device *dev)
+{
+	struct rt_req * rtr;
+	struct rtable * rt;
+
+	rt = ip_rt_route(dst, 0);
+	if (!rt)
+		return;
+
+	if (rt->rt_gateway != src ||
+	    rt->rt_dev != dev ||
+	    ((gw^dev->pa_addr)&dev->pa_mask) ||
+	    ip_chk_addr(gw))
+	{
+		ip_rt_put(rt);
+		return;
+	}
+	ip_rt_put(rt);
+
+	ip_rt_fast_lock();
+	if (ip_rt_lock == 1)
+	{
+		rt_redirect_1(dst, gw, dev);
+		ip_rt_unlock();
+		return;
+	}
+
+	rtr = kmalloc(sizeof(struct rt_req), GFP_ATOMIC);
+	if (rtr)
+	{
+		rtr->dst = dst;
+		rtr->gw = gw;
+		rtr->dev = dev;
+		rt_req_enqueue(&rt_backlog, rtr);
+		ip_rt_bh_mask |= RT_BH_REDIRECT;
+	}
+	ip_rt_unlock();
+}
+
+
+static __inline__ void rt_garbage_collect(void)
 {
-	struct rtable * rt;
-
-	for (rt = rt_base ; ; rt = rt->rt_next) 
+	if (ip_rt_lock == 1)
 	{
-		if (!rt)
-			return NULL;
-		if ((gw ^ rt->rt_dst) & rt->rt_mask)
-			continue;
-		/* 
-		 *	Gateways behind gateways are a no-no 
-		 */
-		 
-		if (rt->rt_flags & RTF_GATEWAY)
-			return NULL;
-		return rt->rt_dev;
+		rt_garbage_collect_1();
+		return;
 	}
+	ip_rt_bh_mask |= RT_BH_GARBAGE_COLLECT;
 }
 
-/*
- *	Rewrote rt_add(), as the old one was weird - Linus
- *
- *	This routine is used to update the IP routing table, either
- *	from the kernel (ICMP_REDIRECT) or via an ioctl call issued
- *	by the superuser.
- */
- 
-void ip_rt_add(short flags, __u32 dst, __u32 mask,
-	__u32 gw, struct device *dev, unsigned short mtu,
-	unsigned long window, unsigned short irtt, short metric)
+static void rt_cache_add(unsigned hash, struct rtable * rth)
 {
-	struct rtable *r, *rt;
-	struct rtable **rp;
-	unsigned long cpuflags;
-	int duplicate = 0;
+	unsigned long	flags;
+	struct rtable	**rthp;
+	__u32		daddr = rth->rt_dst;
+	unsigned long	now = jiffies;
 
-	/*
-	 *	A host is a unique machine and has no network bits.
-	 */
-	 
-	if (flags & RTF_HOST) 
+#if RT_CACHE_DEBUG >= 2
+	if (ip_rt_lock != 1)
 	{
-		mask = 0xffffffff;
-	} 
-	
-	/*
-	 *	Calculate the network mask
-	 */
-	 
-	else if (!mask) 
+		printk("rt_cache_add: ip_rt_lock==%d\n", ip_rt_lock);
+		return;
+	}
+#endif
+
+	save_flags(flags);
+
+	if (rth->rt_dev->header_cache_bind)
 	{
-		if (!((dst ^ dev->pa_addr) & dev->pa_mask)) 
+		struct rtable * rtg = rth;
+
+		if (rth->rt_gateway != daddr)
 		{
-			mask = dev->pa_mask;
-			flags &= ~RTF_GATEWAY;
-			if (flags & RTF_DYNAMIC) 
+			ip_rt_fast_unlock();
+			rtg = ip_rt_route(rth->rt_gateway, 0);
+			ip_rt_fast_lock();
+		}
+
+		if (rtg)
+		{
+			if (rtg == rth)
+				rtg->rt_dev->header_cache_bind(&rtg->rt_hh, rtg->rt_dev, ETH_P_IP, rtg->rt_dst);
+			else
 			{
-				/*printk("Dynamic route to my own net rejected\n");*/
-				return;
+				if (rtg->rt_hh)
+					ATOMIC_INCR(&rtg->rt_hh->hh_refcnt);
+				rth->rt_hh = rtg->rt_hh;
+				ip_rt_put(rtg);
 			}
-		} 
-		else
-			mask = guess_mask(dst, dev);
-		dst &= mask;
+		}
 	}
-	
-	/*
-	 *	A gateway must be reachable and not a local address
-	 */
-	 
-	if (gw == dev->pa_addr)
-		flags &= ~RTF_GATEWAY;
-		
-	if (flags & RTF_GATEWAY) 
+
+	if (rt_cache_size >= RT_CACHE_SIZE_MAX)
+		rt_garbage_collect();
+
+	cli();
+	rth->rt_next = ip_rt_hash_table[hash];
+#if RT_CACHE_DEBUG >= 2
+	if (rth->rt_next)
 	{
-		/*
-		 *	Don't try to add a gateway we can't reach.. 
-		 */
-		 
-		if (dev != get_gw_dev(gw))
-			return;
-			
-		flags |= RTF_GATEWAY;
-	} 
-	else
-		gw = 0;
-		
+		struct rtable * trth;
+		printk("rt_cache @%02x: %08x", hash, daddr);
+		for (trth=rth->rt_next; trth; trth=trth->rt_next)
+			printk(" . %08x", trth->rt_dst);
+		printk("\n");
+	}
+#endif
+	ip_rt_hash_table[hash] = rth;
+	rthp = &rth->rt_next;
+	sti();
+	rt_cache_size++;
+
 	/*
-	 *	Allocate an entry and fill it in.
+	 * Cleanup duplicate (and aged off) entries.
 	 */
-	 
-	rt = (struct rtable *) kmalloc(sizeof(struct rtable), GFP_ATOMIC);
-	if (rt == NULL) 
+
+	while ((rth = *rthp) != NULL)
 	{
-		return;
+
+		cli();
+		if ((!rth->rt_refcnt && rth->rt_lastuse + RT_CACHE_TIMEOUT < now)
+		    || rth->rt_dst == daddr)
+		{
+			*rthp = rth->rt_next;
+			rt_cache_size--;
+			sti();
+#if RT_CACHE_DEBUG >= 2
+			printk("rt_cache clean %02x@%08x\n", hash, rth->rt_dst);
+#endif
+			rt_free(rth);
+			continue;
+		}
+		sti();
+		rthp = &rth->rt_next;
 	}
-	memset(rt, 0, sizeof(struct rtable));
-	rt->rt_flags = flags | RTF_UP;
-	rt->rt_dst = dst;
-	rt->rt_dev = dev;
-	rt->rt_gateway = gw;
-	rt->rt_mask = mask;
-	rt->rt_mss = dev->mtu - HEADER_SIZE;
-	rt->rt_metric = metric;
-	rt->rt_window = 0;	/* Default is no clamping */
+	restore_flags(flags);
+}
 
-	/* Are the MSS/Window valid ? */
+/*
+   RT should already be locked.
+
+   We could improve this by keeping a chain of, say, 32 struct rtables
+   last freed, for fast recycling.
+
+ */
 
-	if(rt->rt_flags & RTF_MSS)
-		rt->rt_mss = mtu;
-		
-	if(rt->rt_flags & RTF_WINDOW)
-		rt->rt_window = window;
-	if(rt->rt_flags & RTF_IRTT)
-		rt->rt_irtt = irtt;
+struct rtable * ip_rt_slow_route (__u32 daddr, int local)
+{
+	unsigned hash = ip_rt_hash_code(daddr)^local;
+	struct rtable * rth;
+	struct fib_node * f;
+	struct fib_info * fi;
+	__u32 saddr;
+
+#if RT_CACHE_DEBUG >= 2
+	printk("rt_cache miss @%08x\n", daddr);
+#endif
 
-	/*
-	 *	What we have to do is loop though this until we have
-	 *	found the first address which has a higher generality than
-	 *	the one in rt.  Then we can put rt in right before it.
-	 *	The interrupts must be off for this process.
-	 */
+	rth = kmalloc(sizeof(struct rtable), GFP_ATOMIC);
+	if (!rth)
+	{
+		ip_rt_unlock();
+		return NULL;
+	}
 
-	save_flags(cpuflags);
-	cli();
+	if (local)
+		f = fib_lookup_local(daddr);
+	else
+		f = fib_lookup (daddr);
 
-	/*
-	 *	Remove old route if we are getting a duplicate. 
-	 */
-	 
-	rp = &rt_base;
-	while ((r = *rp) != NULL) 
+	if (f)
 	{
-		if (r->rt_dst != dst || 
-		    r->rt_mask != mask)
-		{
-			rp = &r->rt_next;
-			continue;
-		}
-		if (r->rt_metric != metric && r->rt_gateway != gw)
+		fi = f->fib_info;
+		f->fib_use++;
+	}
+
+	if (!f || (fi->fib_flags & RTF_REJECT))
+	{
+#if RT_CACHE_DEBUG >= 2
+		printk("rt_route failed @%08x\n", daddr);
+#endif
+		ip_rt_unlock();
+		kfree_s(rth, sizeof(struct rtable));
+		return NULL;
+	}
+
+	saddr = fi->fib_dev->pa_addr;
+
+	if (daddr == fi->fib_dev->pa_addr)
+	{
+		f->fib_use--;
+		if ((f = fib_loopback) != NULL)
 		{
-			duplicate = 1;
-			rp = &r->rt_next;
-			continue;
+			f->fib_use++;
+			fi = f->fib_info;
 		}
-		*rp = r->rt_next;
-		if (rt_loopback == r)
-			rt_loopback = NULL;
-		ip_netlink_msg(RTMSG_DELROUTE, dst,gw, mask, flags, metric, rt->rt_dev->name);
-		kfree_s(r, sizeof(struct rtable));
 	}
-	
-	/*
-	 *	Add the new route 
-	 */
-	 
-	rp = &rt_base;
-	while ((r = *rp) != NULL) {
-		/*
-		 * When adding a duplicate route, add it before
-		 * the route with a higher metric.
-		 */
-		if (duplicate &&
-		    r->rt_dst == dst &&
-		    r->rt_mask == mask &&
-		    r->rt_metric > metric)
-			break;
-		else
-		/*
-		 * Otherwise, just add it before the
-		 * route with a higher generality.
-		 */
-			if ((r->rt_mask & mask) != mask)
-				break;
-		rp = &r->rt_next;
+
+	if (!f)
+	{
+		ip_rt_unlock();
+		kfree_s(rth, sizeof(struct rtable));
+		return NULL;
 	}
-	rt->rt_next = r;
-	*rp = rt;
-	
-	/*
-	 *	Update the loopback route
-	 */
-	 
-	if ((rt->rt_dev->flags & IFF_LOOPBACK) && !rt_loopback)
-		rt_loopback = rt;
 
-	rt_stamp++;		/* New table revision */
-		
-	/*
-	 *	Restore the interrupts and return
-	 */
+	rth->rt_dst	= daddr;
+	rth->rt_src	= saddr;
+	rth->rt_lastuse	= jiffies;
+	rth->rt_refcnt	= 1;
+	rth->rt_use	= 1;
+	rth->rt_next	= NULL;
+	rth->rt_hh	= NULL;
+	rth->rt_gateway	= fi->fib_gateway;
+	rth->rt_dev	= fi->fib_dev;
+	rth->rt_mtu	= fi->fib_mtu;
+	rth->rt_window	= fi->fib_window;
+	rth->rt_irtt	= fi->fib_irtt;
+	rth->rt_tos	= f->fib_tos;
+	rth->rt_flags   = fi->fib_flags | RTF_HOST;
+	if (local)
+		rth->rt_flags   |= RTF_LOCAL;
 
-	restore_flags(cpuflags);
-	ip_netlink_msg(RTMSG_NEWROUTE, dst,gw, mask, flags, metric, rt->rt_dev->name);
-	return;
+	if (!(rth->rt_flags & RTF_GATEWAY))
+		rth->rt_gateway = rth->rt_dst;
+
+	if (ip_rt_lock == 1)
+		rt_cache_add(hash, rth);
+	else
+	{
+		rt_free(rth);
+#if RT_CACHE_DEBUG >= 1
+		printk("rt_cache: route to %08x was born dead\n", daddr);
+#endif
+	}
+
+	ip_rt_unlock();
+	return rth;
 }
 
+void ip_rt_put(struct rtable * rt)
+{
+	if (rt)
+		ATOMIC_DECR(&rt->rt_refcnt);
+}
 
-/*
- *	Check if a mask is acceptable.
- */
- 
-static inline int bad_mask(__u32 mask, __u32 addr)
+struct rtable * ip_rt_route(__u32 daddr, int local)
 {
-	if (addr & (mask = ~mask))
-		return 1;
-	mask = ntohl(mask);
-	if (mask & (mask+1))
-		return 1;
-	return 0;
+	struct rtable * rth;
+
+	ip_rt_fast_lock();
+
+	for (rth=ip_rt_hash_table[ip_rt_hash_code(daddr)^local]; rth; rth=rth->rt_next)
+	{
+		if (rth->rt_dst == daddr)
+		{
+			rth->rt_lastuse = jiffies;
+			ATOMIC_INCR(&rth->rt_use);
+			ATOMIC_INCR(&rth->rt_refcnt);
+			ip_rt_unlock();
+			return rth;
+		}
+	}
+	return ip_rt_slow_route (daddr, local);
 }
 
+
 /*
- *	Process a route add request from the user
+ *	Process a route add request from the user, or from a kernel
+ *	task.
  */
  
-static int rt_new(struct rtentry *r)
+int ip_rt_new(struct rtentry *r)
 {
 	int err;
 	char * devname;
@@ -465,7 +1629,7 @@
 	/*
 	 *	BSD emulation: Permits route add someroute gw one-of-my-addresses
 	 *	to indicate which iface. Not as clean as the nice Linux dev technique
-	 *	but people keep using it... 
+	 *	but people keep using it...  (and gated likes it ;))
 	 */
 	 
 	if (!dev && (flags & RTF_GATEWAY)) 
@@ -522,8 +1686,8 @@
 	/*
 	 *	Add the route
 	 */
-	 
-	ip_rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
+
+	rt_add(flags, daddr, mask, gw, dev, r->rt_mss, r->rt_window, r->rt_irtt, metric);
 	return 0;
 }
 
@@ -539,6 +1703,7 @@
 	struct sockaddr_in *gtw;
 	char *devname;
 	int err;
+	struct device * dev = NULL;
 
 	trg = (struct sockaddr_in *) &r->rt_dst;
 	msk = (struct sockaddr_in *) &r->rt_genmask;
@@ -548,159 +1713,20 @@
 		err = getname(devname, &devname);
 		if (err)
 			return err;
+		dev = dev_get(devname);
+		putname(devname);
+		if (!dev)
+			return -ENODEV;
 	}
 	/*
 	 * metric can become negative here if it wasn't filled in
 	 * but that's a fortunate accident; we really use that in rt_del.
 	 */
-	err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, devname,
+	err=rt_del((__u32)trg->sin_addr.s_addr, (__u32)msk->sin_addr.s_addr, dev,
 		(__u32)gtw->sin_addr.s_addr, r->rt_flags, r->rt_metric - 1);
-	if ( devname != NULL )
-		putname(devname);
 	return err;
 }
 
-
-/* 
- *	Called from the PROCfs module. This outputs /proc/net/route.
- *
- *	We preserve the old format but pad the buffers out. This means that
- *	we can spin over the other entries as we read them. Remember the
- *	gated BGP4 code could need to read 60,000+ routes on occasion (thats
- *	about 7Mb of data). To do that ok we will need to also cache the
- *	last route we got to (reads will generally be following on from
- *	one another without gaps).
- */
- 
-int rt_get_info(char *buffer, char **start, off_t offset, int length, int dummy)
-{
-	struct rtable *r;
-	int len=128;
-	off_t pos=0;
-	off_t begin=0;
-	char temp[129];
-
-	if(offset<128)
-		sprintf(buffer,"%-127s\n","Iface\tDestination\tGateway \tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU\tWindow\tIRTT");
-	pos=128;
-  	
-	for (r = rt_base; r != NULL; r = r->rt_next) 
-	{
-		/*
-		 *	Spin through entries until we are ready
-		 */
-		if(pos+128<offset)
-		{
-			pos+=128;
-			continue;
-		}
-					
-        	sprintf(temp, "%s\t%08lX\t%08lX\t%02X\t%d\t%lu\t%d\t%08lX\t%d\t%lu\t%u",
-			r->rt_dev->name, (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
-			r->rt_flags, r->rt_refcnt, r->rt_use, r->rt_metric,
-			(unsigned long)r->rt_mask, (int)r->rt_mss, r->rt_window, (int)r->rt_irtt);
-		sprintf(buffer+len,"%-127s\n",temp);
-		len+=128;
-		pos+=128;
-		if(pos<offset)
-		{
-			len=0;
-			begin=pos;
-		}
-		if(pos>offset+length)
-			break;
-  	}
-  	
-  	*start=buffer+(offset-begin);
-  	len-=(offset-begin);
-  	if(len>length)
-  		len=length;
-  	return len;
-}
-
-/*
- *	This is hackish, but results in better code. Use "-S" to see why.
- */
- 
-#define early_out ({ goto no_route; 1; })
-
-/*
- *	Route a packet. This needs to be fairly quick. Florian & Co. 
- *	suggested a unified ARP and IP routing cache. Done right its
- *	probably a brilliant idea. I'd actually suggest a unified
- *	ARP/IP routing/Socket pointer cache. Volunteers welcome
- */
- 
-struct rtable * ip_rt_route(__u32 daddr, struct options *opt, __u32 *src_addr)
-{
-	struct rtable *rt;
-
-	for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next) 
-	{
-		if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
-			break;
-		/*
-		 *	broadcast addresses can be special cases.. 
-		 */
-		if (rt->rt_flags & RTF_GATEWAY)
-			continue;		 
-		if ((rt->rt_dev->flags & IFF_BROADCAST) &&
-		    (rt->rt_dev->pa_brdaddr == daddr))
-			break;
-	}
-	
-	if(rt->rt_flags&RTF_REJECT)
-		return NULL;
-	
-	if(src_addr!=NULL)
-		*src_addr= rt->rt_dev->pa_addr;
-		
-	if (daddr == rt->rt_dev->pa_addr) {
-		if ((rt = rt_loopback) == NULL)
-			goto no_route;
-	}
-	rt->rt_use++;
-	return rt;
-no_route:
-	return NULL;
-}
-
-struct rtable * ip_rt_local(__u32 daddr, struct options *opt, __u32 *src_addr)
-{
-	struct rtable *rt;
-
-	for (rt = rt_base; rt != NULL || early_out ; rt = rt->rt_next) 
-	{
-		/*
-		 *	No routed addressing.
-		 */
-		if (rt->rt_flags&RTF_GATEWAY)
-			continue;
-			
-		if (!((rt->rt_dst ^ daddr) & rt->rt_mask))
-			break;
-		/*
-		 *	broadcast addresses can be special cases.. 
-		 */
-		 
-		if ((rt->rt_dev->flags & IFF_BROADCAST) &&
-		     rt->rt_dev->pa_brdaddr == daddr)
-			break;
-	}
-	
-	if(src_addr!=NULL)
-		*src_addr= rt->rt_dev->pa_addr;
-		
-	if (daddr == rt->rt_dev->pa_addr) {
-		if ((rt = rt_loopback) == NULL)
-			goto no_route;
-	}
-	rt->rt_use++;
-	return rt;
-no_route:
-	return NULL;
-}
-
 /*
  *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
  */
@@ -720,8 +1746,15 @@
 			if (err)
 				return err;
 			memcpy_fromfs(&rt, arg, sizeof(struct rtentry));
-			return (cmd == SIOCDELRT) ? rt_kill(&rt) : rt_new(&rt);
+			return (cmd == SIOCDELRT) ? rt_kill(&rt) : ip_rt_new(&rt);
 	}
 
 	return -EINVAL;
 }
+
+void ip_rt_advice(struct rtable **rp, int advice)
+{
+	/* Thanks! */
+	return;
+}
+
