patch-2.0.30 linux/net/ipv4/tcp.c

Next file: linux/net/ipv4/tcp_input.c
Previous file: linux/net/ipv4/sysctl_net_ipv4.c
Back to the patch index
Back to the overall index

diff -u --recursive --new-file v2.0.29/linux/net/ipv4/tcp.c linux/net/ipv4/tcp.c
@@ -203,6 +203,8 @@
  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
  *	Willy Konynenberg	:	Transparent proxying support.
  *		Theodore Ts'o	:	Do secure TCP sequence numbers.
+ *		David S. Miller	:	New socket lookup architecture for ISS.
+ *					This code is dedicated to John Dyson.
  *					
  * To Fix:
  *		Fast path the code. Two things here - fix the window calculation
@@ -438,6 +440,203 @@
 unsigned long seq_offset;
 struct tcp_mib	tcp_statistics;
 
+/* This is for sockets with full identity only.  Sockets here will always
+ * be without wildcards and will have the following invariant:
+ *          TCP_ESTABLISHED <= sk->state < TCP_CLOSE
+ */
+struct sock *tcp_established_hash[TCP_HTABLE_SIZE];
+
+/* All sockets in TCP_LISTEN state will be in here.  This is the only table
+ * where wildcard'd TCP sockets can exist.  Hash function here is just local
+ * port number.  XXX Fix or we'll lose with thousands of IP aliases...
+ */
+struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];
+
+/* Ok, let's try this, I give up, we do need a local binding
+ * TCP hash as well as the others for fast bind/connect.
+ */
+struct sock *tcp_bound_hash[TCP_BHTABLE_SIZE];
+
+extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport);
+
+static int tcp_v4_verify_bind(struct sock *sk, unsigned short snum)
+{
+	struct sock *sk2;
+	int retval = 0, sk_reuse = sk->reuse;
+
+	SOCKHASH_LOCK();
+	sk2 = tcp_bound_hash[tcp_bhashfn(snum)];
+	for(; sk2 != NULL; sk2 = sk2->bind_next) {
+		if((sk2->num == snum) && (sk2 != sk)) {
+			unsigned char state = sk2->state;
+			int sk2_reuse = sk2->reuse;
+
+			if(!sk2->rcv_saddr || !sk->rcv_saddr) {
+				if((!sk2_reuse)			||
+				   (!sk_reuse)			||
+				   (state == TCP_LISTEN)) {
+					retval = 1;
+					break;
+				}
+			} else if(sk2->rcv_saddr == sk->rcv_saddr) {
+				if((!sk_reuse)			||
+				   (!sk2_reuse)			||
+				   (state == TCP_LISTEN)) {
+					retval = 1;
+					break;
+				}
+			}
+		}
+	}
+	SOCKHASH_UNLOCK();
+
+	return retval;
+}
+
+static __inline__ int tcp_lport_inuse(int num)
+{
+	struct sock *sk = tcp_bound_hash[tcp_bhashfn(num)];
+
+	for(; sk != NULL; sk = sk->bind_next) {
+		if(sk->num == num)
+			return 1;
+	}
+	return 0;
+}
+
+/* Find a "good" local port; this is family independent.
+ * There are several strategies working in unison here to
+ * get the best possible performance.  The current socket
+ * load is kept track of, if it is zero there is a strong
+ * likelihood that there is a zero-length chain we will
+ * find with a small amount of searching, else the load is
+ * what we shoot for when the chains all have at least
+ * one entry.  The base helps us walk the chains in an
+ * order such that a good chain is found as quickly as possible.  -DaveM
+ */
+unsigned short tcp_good_socknum(void)
+{
+	static int start = PROT_SOCK;
+	static int binding_contour = 0;
+	int best = 0;
+	int size = 32767; /* a big num. */
+	int retval = 0, i, end, bc;
+
+	SOCKHASH_LOCK();
+	i = tcp_bhashfn(start);
+	end = i + TCP_BHTABLE_SIZE;
+	bc = binding_contour;
+	do {
+		struct sock *sk = tcp_bound_hash[tcp_bhashfn(i)];
+		if(!sk) {
+			retval = (start + i);
+			start  = (retval + 1);
+
+			/* Check for decreasing load. */
+			if(bc != 0)
+				binding_contour = 0;
+			goto done;
+		} else {
+			int j = 0;
+			do { sk = sk->bind_next; } while(++j < size && sk);
+			if(j < size) {
+				best = (start + i);
+				size = j;
+				if(bc && size <= bc) {
+					start = best + 1;
+					goto verify;
+				}
+			}
+		}
+	} while(++i != end);
+
+	/* Socket load is increasing, adjust our load average. */
+	binding_contour = size;
+verify:
+	if(size < binding_contour)
+		binding_contour = size;
+
+	if(best > 32767)
+		best -= (32768 - PROT_SOCK);
+
+	while(tcp_lport_inuse(best))
+		best += TCP_BHTABLE_SIZE;
+	retval = best;
+done:
+	if(start > 32767)
+		start -= (32768 - PROT_SOCK);
+
+	SOCKHASH_UNLOCK();
+
+	return retval;
+}
+
+void tcp_v4_hash(struct sock *sk)
+{
+	unsigned char state;
+
+	SOCKHASH_LOCK();
+	state = sk->state;
+	if(state != TCP_CLOSE || !sk->dead) {
+		struct sock **skp;
+
+		if(state == TCP_LISTEN)
+			skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
+		else
+			skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
+
+		if((sk->next = *skp) != NULL)
+			(*skp)->pprev = &sk->next;
+		*skp = sk;
+		sk->pprev = skp;
+		tcp_sk_bindify(sk);
+	}
+	SOCKHASH_UNLOCK();
+}
+
+void tcp_v4_unhash(struct sock *sk)
+{
+	SOCKHASH_LOCK();
+	if(sk->pprev) {
+		if(sk->next)
+			sk->next->pprev = sk->pprev;
+		*sk->pprev = sk->next;
+		sk->pprev = NULL;
+		tcp_sk_unbindify(sk);
+	}
+	SOCKHASH_UNLOCK();
+}
+
+void tcp_v4_rehash(struct sock *sk)
+{
+	unsigned char state;
+
+	SOCKHASH_LOCK();
+	state = sk->state;
+	if(sk->pprev) {
+		if(sk->next)
+			sk->next->pprev = sk->pprev;
+		*sk->pprev = sk->next;
+		sk->pprev = NULL;
+		tcp_sk_unbindify(sk);
+	}
+	if(state != TCP_CLOSE || !sk->dead) {
+		struct sock **skp;
+
+		if(state == TCP_LISTEN)
+			skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
+		else
+			skp = &tcp_established_hash[tcp_sk_hashfn(sk)];
+
+		if((sk->next = *skp) != NULL)
+			(*skp)->pprev = &sk->next;
+		*skp = sk;
+		sk->pprev = skp;
+		tcp_sk_bindify(sk);
+	}
+	SOCKHASH_UNLOCK();
+}
+
 static void tcp_close(struct sock *sk, unsigned long timeout);
 
 /*
@@ -518,8 +717,7 @@
 	if (len < 8)	/* NOT sizeof(struct tcphdr) */
 		return;
 		
-	sk = get_sock(&tcp_prot, th->source, daddr, th->dest, saddr, 0, 0);
-
+	sk = tcp_v4_lookup(daddr, th->dest, saddr, th->source);
 	if (sk == NULL)
 		return;
 
@@ -1640,6 +1838,7 @@
 		case TCP_CLOSE:
 		case TCP_LISTEN:
 			break;
+		case TCP_LAST_ACK:	/* Could have shutdown() then close()!*/
 		case TCP_CLOSE_WAIT:	/* They have FIN'd us. We send our FIN and
 					   wait only for the ACK */
 			ns=TCP_LAST_ACK;
@@ -1753,7 +1952,6 @@
 
 	lock_sock(sk);
 
-	tcp_cache_zap();
 	if(sk->state == TCP_LISTEN)
 	{
 		/* Special case */
@@ -1761,6 +1959,7 @@
 		tcp_close_pending(sk);
 		release_sock(sk);
 		sk->dead = 1;
+		tcp_v4_unhash(sk);
 		return;
 	}
 
@@ -1813,12 +2012,6 @@
 		sti();
 	}
 
-	/*
-	 * This will destroy it. The timers will take care of actually
-	 * free'ing up the memory.
-	 */
-	tcp_cache_zap();	/* Kill the cache again. */
-
 	/* Now that the socket is dead, if we are in the FIN_WAIT2 state
 	 * we may need to set up a timer.
          */
@@ -1833,6 +2026,9 @@
 
 	release_sock(sk);
 	sk->dead = 1;
+
+	if(sk->state == TCP_CLOSE)
+		tcp_v4_unhash(sk);
 }
 
 
@@ -1916,27 +2112,24 @@
  */
 static int tcp_unique_address(u32 saddr, u16 snum, u32 daddr, u16 dnum)
 {
-	int retval = 1;
+	int retval = 1, hashent = tcp_hashfn(saddr, snum, daddr, dnum);
 	struct sock * sk;
 
-	/* Make sure we are allowed to connect here. */
-	cli();
-	for (sk = tcp_prot.sock_array[snum & (SOCK_ARRAY_SIZE -1)];
-			sk != NULL; sk = sk->next)
-	{
-		/* hash collision? */
-		if (sk->num != snum)
-			continue;
-		if (sk->saddr != saddr)
-			continue;
-		if (sk->daddr != daddr)
-			continue;
-		if (sk->dummy_th.dest != dnum)
-			continue;
-		retval = 0;
-		break;
+	/* Make sure we are allowed to connect here.
+	 * But freeze the hash while we snoop around.
+	 */
+	SOCKHASH_LOCK();
+	sk = tcp_established_hash[hashent];
+	for (; sk != NULL; sk = sk->next) {
+		if(sk->daddr		== daddr		&& /* remote address */
+		   sk->dummy_th.dest	== dnum			&& /* remote port */
+		   sk->num		== snum			&& /* local port */
+		   sk->saddr		== saddr) {		   /* local address */
+			retval = 0;
+			break;
+		}
 	}
-	sti();
+	SOCKHASH_UNLOCK();
 	return retval;
 }
 
@@ -2077,12 +2270,13 @@
 	tcp_send_check(t1, sk->saddr, sk->daddr,
 		  sizeof(struct tcphdr) + 4, buff);
 
-	/*
-	 *	This must go first otherwise a really quick response will get reset.
+	tcp_set_state(sk,TCP_SYN_SENT);
+
+	/* Socket identity change complete, no longer
+	 * in TCP_CLOSE, so rehash.
 	 */
+	tcp_v4_rehash(sk);
 
-	tcp_cache_zap();
-	tcp_set_state(sk,TCP_SYN_SENT);
 	if(rt&&rt->rt_flags&RTF_IRTT)
 		sk->rto = rt->rt_irtt;
 	else
@@ -2174,27 +2368,34 @@
 
 
 struct proto tcp_prot = {
-	tcp_close,
-	ip_build_header,
-	tcp_connect,
-	tcp_accept,
-	ip_queue_xmit,
-	tcp_retransmit,
-	tcp_write_wakeup,
-	tcp_read_wakeup,
-	tcp_rcv,
-	tcp_select,
-	tcp_ioctl,
-	NULL,
-	tcp_shutdown,
-	tcp_setsockopt,
-	tcp_getsockopt,
-	tcp_sendmsg,
-	tcp_recvmsg,
-	NULL,		/* No special bind() */
-	128,
-	0,
-	"TCP",
-	0, 0,
-	{NULL,}
+	(struct sock *)&tcp_prot,	/* sklist_next */
+	(struct sock *)&tcp_prot,	/* sklist_prev */
+	tcp_close,			/* close */
+	ip_build_header,		/* build_header */
+	tcp_connect,			/* connect */
+	tcp_accept,			/* accept */
+	ip_queue_xmit,			/* queue_xmit */
+	tcp_retransmit,			/* retransmit */
+	tcp_write_wakeup,		/* write_wakeup */
+	tcp_read_wakeup,		/* read_wakeup */
+	tcp_rcv,			/* rcv */
+	tcp_select,			/* select */
+	tcp_ioctl,			/* ioctl */
+	NULL,				/* init */
+	tcp_shutdown,			/* shutdown */
+	tcp_setsockopt,			/* setsockopt */
+	tcp_getsockopt,			/* getsockopt */
+	tcp_sendmsg,			/* sendmsg */
+	tcp_recvmsg,			/* recvmsg */
+	NULL,				/* bind */
+	tcp_v4_hash,			/* hash */
+	tcp_v4_unhash,			/* unhash */
+	tcp_v4_rehash,			/* rehash */
+	tcp_good_socknum,		/* good_socknum */
+	tcp_v4_verify_bind,		/* verify_bind */
+	128,				/* max_header */
+	0,				/* retransmits */
+	"TCP",				/* name */
+	0,				/* inuse */
+	0				/* highestinuse */
 };

FUNET's LINUX-ADM group, linux-adm@nic.funet.fi
TCL-scripts by Sam Shen, slshen@lbl.gov