Line data Source code
1 : /*
2 : * INET An implementation of the TCP/IP protocol suite for the LINUX
3 : * operating system. INET is implemented using the BSD Socket
4 : * interface as the means of communication with the user level.
5 : *
6 : * Authors: Lotsa people, from code originally in tcp
7 : *
8 : * This program is free software; you can redistribute it and/or
9 : * modify it under the terms of the GNU General Public License
10 : * as published by the Free Software Foundation; either version
11 : * 2 of the License, or (at your option) any later version.
12 : */
13 :
14 : #ifndef _INET_HASHTABLES_H
15 : #define _INET_HASHTABLES_H
16 :
17 :
18 : #include <linux/interrupt.h>
19 : #include <linux/ip.h>
20 : #include <linux/ipv6.h>
21 : #include <linux/list.h>
22 : #include <linux/slab.h>
23 : #include <linux/socket.h>
24 : #include <linux/spinlock.h>
25 : #include <linux/types.h>
26 : #include <linux/wait.h>
27 : #include <linux/vmalloc.h>
28 :
29 : #include <net/inet_connection_sock.h>
30 : #include <net/inet_sock.h>
31 : #include <net/sock.h>
32 : #include <net/route.h>
33 : #include <net/tcp_states.h>
34 : #include <net/netns/hash.h>
35 :
36 : #include <asm/atomic.h>
37 : #include <asm/byteorder.h>
38 :
39 : /* This is for all connections with a full identity, no wildcards.
40 : * One chain is dedicated to TIME_WAIT sockets.
41 : * I'll experiment with dynamic table growth later.
42 : */
43 : struct inet_ehash_bucket {
44 : struct hlist_nulls_head chain;
45 : struct hlist_nulls_head twchain;
46 : };
47 1 :
48 : /* There are a few simple rules, which allow for local port reuse by
49 : * an application. In essence:
50 : *
51 : * 1) Sockets bound to different interfaces may share a local port.
52 : * Failing that, goto test 2.
53 : * 2) If all sockets have sk->sk_reuse set, and none of them are in
54 : * TCP_LISTEN state, the port may be shared.
55 : * Failing that, goto test 3.
56 : * 3) If all sockets are bound to a specific inet_sk(sk)->rcv_saddr local
57 : * address, and none of them are the same, the port may be
58 : * shared.
59 : * Failing this, the port cannot be shared.
60 : *
61 : * The interesting point, is test #2. This is what an FTP server does
62 : * all day. To optimize this case we use a specific flag bit defined
63 : * below. As we add sockets to a bind bucket list, we perform a
64 : * check of: (newsk->sk_reuse && (newsk->sk_state != TCP_LISTEN))
65 : * As long as all sockets added to a bind bucket pass this test,
66 : * the flag bit will be set.
67 : * The resulting situation is that tcp_v[46]_verify_bind() can just check
68 : * for this flag bit, if it is set and the socket trying to bind has
69 : * sk->sk_reuse set, we don't even have to walk the owners list at all,
70 : * we return that it is ok to bind this socket to the requested local port.
71 : *
72 : * Sounds like a lot of work, but it is worth it. In a more naive
73 : * implementation (ie. current FreeBSD etc.) the entire list of ports
74 : * must be walked for each data port opened by an ftp server. Needless
75 : * to say, this does not scale at all. With a couple thousand FTP
76 : * users logged onto your box, isn't it nice to know that new data
77 : * ports are created in O(1) time? I thought so. ;-) -DaveM
78 : */
79 : struct inet_bind_bucket {
80 : #ifdef CONFIG_NET_NS
81 : struct net *ib_net;
82 : #endif
83 : unsigned short port;
84 : signed short fastreuse;
85 : int num_owners;
86 : struct hlist_node node;
87 : struct hlist_head owners;
88 : };
89 :
90 : static inline struct net *ib_net(struct inet_bind_bucket *ib)
91 : {
92 : return read_pnet(&ib->ib_net);
93 : }
94 1 :
/*
 * Walk every inet_bind_bucket linked on @head through its 'node' member;
 * @tb is the entry cursor and @pos the hlist cursor.
 */
#define inet_bind_bucket_for_each(tb, pos, head)	\
	hlist_for_each_entry(tb, pos, head, node)
97 :
98 : struct inet_bind_hashbucket {
99 : spinlock_t lock;
100 : struct hlist_head chain;
101 : };
102 1 :
103 : /*
104 : * Sockets can be hashed in established or listening table
105 : * We must use different 'nulls' end-of-chain value for listening
106 : * hash table, or we might find a socket that was closed and
107 : * reallocated/inserted into established hash table
108 : */
109 : #define LISTENING_NULLS_BASE (1U << 29)
110 : struct inet_listen_hashbucket {
111 : spinlock_t lock;
112 : struct hlist_nulls_head head;
113 : };
114 1 :
/*
 * Number of slots in the listening hash, i.e. the table for all sockets
 * which possess wildcards.  Yes, really, this is all you need.
 */
#define INET_LHTABLE_SIZE	32
117 :
118 : struct inet_hashinfo {
119 : /* This is for sockets with full identity only. Sockets here will
120 : * always be without wildcards and will have the following invariant:
121 : *
122 : * TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
123 : *
124 : * TIME_WAIT sockets use a separate chain (twchain).
125 : */
126 : struct inet_ehash_bucket *ehash;
127 : spinlock_t *ehash_locks;
128 : unsigned int ehash_mask;
129 : unsigned int ehash_locks_mask;
130 :
131 : /* Ok, let's try this, I give up, we do need a local binding
132 : * TCP hash as well as the others for fast bind/connect.
133 : */
134 : struct inet_bind_hashbucket *bhash;
135 :
136 : unsigned int bhash_size;
137 : /* 4 bytes hole on 64 bit */
138 :
139 : struct kmem_cache *bind_bucket_cachep;
140 :
141 : /* All the above members are written once at bootup and
142 : * never written again _or_ are predominantly read-access.
143 : *
144 : * Now align to a new cache line as all the following members
145 : * might be often dirty.
146 : */
147 : /* All sockets in TCP_LISTEN state will be in here. This is the only
148 : * table where wildcard'd TCP sockets can exist. Hash function here
149 : * is just local port number.
150 : */
151 : struct inet_listen_hashbucket listening_hash[INET_LHTABLE_SIZE]
152 : ____cacheline_aligned_in_smp;
153 :
154 : atomic_t bsockets;
155 : };
156 :
157 : static inline struct inet_ehash_bucket *inet_ehash_bucket(
158 : struct inet_hashinfo *hashinfo,
159 : unsigned int hash)
160 : {
161 : return &hashinfo->ehash[hash & hashinfo->ehash_mask];
162 : }
163 :
164 : static inline spinlock_t *inet_ehash_lockp(
165 : struct inet_hashinfo *hashinfo,
166 : unsigned int hash)
167 : {
168 : return &hashinfo->ehash_locks[hash & hashinfo->ehash_locks_mask];
169 : }
170 :
171 : static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
172 : {
173 : unsigned int i, size = 256;
174 : #if defined(CONFIG_PROVE_LOCKING)
175 : unsigned int nr_pcpus = 2;
176 : #else
177 : unsigned int nr_pcpus = num_possible_cpus();
178 : #endif
179 : if (nr_pcpus >= 4)
180 : size = 512;
181 : if (nr_pcpus >= 8)
182 : size = 1024;
183 : if (nr_pcpus >= 16)
184 : size = 2048;
185 : if (nr_pcpus >= 32)
186 : size = 4096;
187 : if (sizeof(spinlock_t) != 0) {
188 : #ifdef CONFIG_NUMA
189 : if (size * sizeof(spinlock_t) > PAGE_SIZE)
190 : hashinfo->ehash_locks = vmalloc(size * sizeof(spinlock_t));
191 : else
192 : #endif
193 : hashinfo->ehash_locks = kmalloc(size * sizeof(spinlock_t),
194 : GFP_KERNEL);
195 : if (!hashinfo->ehash_locks)
196 : return ENOMEM;
197 : for (i = 0; i < size; i++)
198 : spin_lock_init(&hashinfo->ehash_locks[i]);
199 : }
200 : hashinfo->ehash_locks_mask = size - 1;
201 : return 0;
202 : }
203 :
204 : static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo)
205 : {
206 : if (hashinfo->ehash_locks) {
207 : #ifdef CONFIG_NUMA
208 : unsigned int size = (hashinfo->ehash_locks_mask + 1) *
209 : sizeof(spinlock_t);
210 : if (size > PAGE_SIZE)
211 : vfree(hashinfo->ehash_locks);
212 : else
213 : #endif
214 : kfree(hashinfo->ehash_locks);
215 : hashinfo->ehash_locks = NULL;
216 : }
217 : }
218 :
/* Bind bucket constructor/destructor; both are defined outside this header. */
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum);
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
			      struct inet_bind_bucket *tb);
226 :
227 : static inline int inet_bhashfn(struct net *net,
228 : const __u16 lport, const int bhash_size)
229 : {
230 : return (lport + net_hash_mix(net)) & (bhash_size - 1);
231 : }
232 :
/* Defined outside this header. */
void inet_bind_hash(struct sock *sk,
		    struct inet_bind_bucket *tb,
		    const unsigned short snum);
235 :
236 : /* These can have wildcards, don't try too hard. */
237 : static inline int inet_lhashfn(struct net *net, const unsigned short num)
238 : {
239 : return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
240 : }
241 :
242 : static inline int inet_sk_listen_hashfn(const struct sock *sk)
243 : {
244 : return inet_lhashfn(sock_net(sk), inet_sk(sk)->inet_num);
245 : }
246 :
/* Caller must disable local BH processing. */
void __inet_inherit_port(struct sock *sk, struct sock *child);

void inet_put_port(struct sock *sk);

void inet_hashinfo_init(struct inet_hashinfo *h);

int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw);
void inet_hash(struct sock *sk);
void inet_unhash(struct sock *sk);
257 :
258 : extern struct sock *__inet_lookup_listener(struct net *net,
259 : struct inet_hashinfo *hashinfo,
260 : const __be32 daddr,
261 : const unsigned short hnum,
262 : const int dif);
263 :
264 : static inline struct sock *inet_lookup_listener(struct net *net,
265 : struct inet_hashinfo *hashinfo,
266 : __be32 daddr, __be16 dport, int dif)
267 : {
268 : return __inet_lookup_listener(net, hashinfo, daddr, ntohs(dport), dif);
269 : }
270 :
271 : /* Socket demux engine toys. */
272 : /* What happens here is ugly; there's a pair of adjacent fields in
273 : struct inet_sock; __be16 dport followed by __u16 num. We want to
274 : search by pair, so we combine the keys into a single 32bit value
275 : and compare with 32bit value read from &...->dport. Let's at least
276 : make sure that it's not mixed with anything else...
277 : On 64bit targets we combine comparisons with pair of adjacent __be32
278 : fields in the same way.
279 : */
280 : typedef __u32 __bitwise __portpair;
281 : #ifdef __BIG_ENDIAN
282 : #define INET_COMBINED_PORTS(__sport, __dport) \
283 : ((__force __portpair)(((__force __u32)(__be16)(__sport) << 16) | (__u32)(__dport)))
284 : #else /* __LITTLE_ENDIAN */
285 : #define INET_COMBINED_PORTS(__sport, __dport) \
286 : ((__force __portpair)(((__u32)(__dport) << 16) | (__force __u32)(__be16)(__sport)))
287 : #endif
288 :
#if (BITS_PER_LONG == 64)
/*
 * 64-bit: both IPv4 addresses are compared with a single 64-bit load that
 * starts at the socket's daddr field, so @__saddr/@__daddr are folded into
 * a cookie up front and are not used by the match macros themselves.
 */
typedef __u64 __bitwise __addrpair;

/* Declares 'const __addrpair __name'; the expansion carries its own ';'. */
#ifdef __BIG_ENDIAN
#define INET_ADDR_COOKIE(__name, __saddr, __daddr)			\
	const __addrpair __name = (__force __addrpair) (		\
				   (((__force __u64)(__be32)(__saddr)) << 32) | \
				   ((__force __u64)(__be32)(__daddr)));
#else /* __LITTLE_ENDIAN */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr)			\
	const __addrpair __name = (__force __addrpair) (		\
				   (((__force __u64)(__be32)(__daddr)) << 32) | \
				   ((__force __u64)(__be32)(__saddr)));
#endif /* __BIG_ENDIAN */

/*
 * True when @__sk has hash @__hash, belongs to @__net, matches the address
 * cookie and port pair, and is either not bound to a device or bound to
 * @__dif.
 */
#define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
	 ((*((__addrpair *)&(inet_sk(__sk)->inet_daddr))) == (__cookie)) &&	\
	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) &&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

/* Same test as INET_MATCH, reading the inet_twsk() fields instead. */
#define INET_TW_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
	 ((*((__addrpair *)&(inet_twsk(__sk)->tw_daddr))) == (__cookie)) &&	\
	 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#else /* 32-bit arch */
/*
 * 32-bit: no cookie is built (INET_ADDR_COOKIE expands to nothing and
 * @__cookie is unused); the two addresses are compared one by one.
 */
#define INET_ADDR_COOKIE(__name, __saddr, __daddr)

/*
 * True when @__sk has hash @__hash, belongs to @__net, has daddr equal to
 * @__saddr and rcv_saddr equal to @__daddr, matches the port pair, and is
 * either not bound to a device or bound to @__dif.
 */
#define INET_MATCH(__sk, __net, __hash, __cookie, __saddr, __daddr, __ports, __dif)	\
	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
	 (inet_sk(__sk)->inet_daddr == (__saddr)) &&				\
	 (inet_sk(__sk)->inet_rcv_saddr == (__daddr)) &&			\
	 ((*((__portpair *)&(inet_sk(__sk)->inet_dport))) == (__ports)) &&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

/* Same test as INET_MATCH, reading the inet_twsk() fields instead. */
#define INET_TW_MATCH(__sk, __net, __hash,__cookie, __saddr, __daddr, __ports, __dif)	\
	(((__sk)->sk_hash == (__hash)) && net_eq(sock_net(__sk), (__net)) &&	\
	 (inet_twsk(__sk)->tw_daddr == (__saddr)) &&				\
	 (inet_twsk(__sk)->tw_rcv_saddr == (__daddr)) &&			\
	 ((*((__portpair *)&(inet_twsk(__sk)->tw_dport))) == (__ports)) &&	\
	 (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))
#endif /* 64-bit arch */
327 :
328 : /*
329 : * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so we need
330 : * not check it for lookups anymore, thanks Alexey. -DaveM
331 : *
332 : * Local BH must be disabled here.
333 : */
334 : extern struct sock * __inet_lookup_established(struct net *net,
335 : struct inet_hashinfo *hashinfo,
336 : const __be32 saddr, const __be16 sport,
337 : const __be32 daddr, const u16 hnum, const int dif);
338 :
339 : static inline struct sock *
340 : inet_lookup_established(struct net *net, struct inet_hashinfo *hashinfo,
341 : const __be32 saddr, const __be16 sport,
342 : const __be32 daddr, const __be16 dport,
343 : const int dif)
344 : {
345 : return __inet_lookup_established(net, hashinfo, saddr, sport, daddr,
346 : ntohs(dport), dif);
347 : }
348 :
349 : static inline struct sock *__inet_lookup(struct net *net,
350 : struct inet_hashinfo *hashinfo,
351 : const __be32 saddr, const __be16 sport,
352 : const __be32 daddr, const __be16 dport,
353 : const int dif)
354 : {
355 : u16 hnum = ntohs(dport);
356 : struct sock *sk = __inet_lookup_established(net, hashinfo,
357 : saddr, sport, daddr, hnum, dif);
358 :
359 : return sk ? : __inet_lookup_listener(net, hashinfo, daddr, hnum, dif);
360 : }
361 :
362 : static inline struct sock *inet_lookup(struct net *net,
363 : struct inet_hashinfo *hashinfo,
364 : const __be32 saddr, const __be16 sport,
365 : const __be32 daddr, const __be16 dport,
366 : const int dif)
367 : {
368 : struct sock *sk;
369 :
370 : local_bh_disable();
371 : sk = __inet_lookup(net, hashinfo, saddr, sport, daddr, dport, dif);
372 : local_bh_enable();
373 :
374 : return sk;
375 : }
376 :
377 : static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo,
378 : struct sk_buff *skb,
379 : const __be16 sport,
380 : const __be16 dport)
381 : {
382 : struct sock *sk;
383 : const struct iphdr *iph = ip_hdr(skb);
384 :
385 : if (unlikely(sk = skb_steal_sock(skb)))
386 : return sk;
387 : else
388 : return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
389 : iph->saddr, sport,
390 : iph->daddr, dport, inet_iif(skb));
391 : }
392 :
393 : extern int __inet_hash_connect(struct inet_timewait_death_row *death_row,
394 : struct sock *sk,
395 : u32 port_offset,
396 : int (*check_established)(struct inet_timewait_death_row *,
397 : struct sock *, __u16, struct inet_timewait_sock **),
398 : int (*hash)(struct sock *sk, struct inet_timewait_sock *twp));
399 :
400 : extern int inet_hash_connect(struct inet_timewait_death_row *death_row,
401 : struct sock *sk);
402 : #endif /* _INET_HASHTABLES_H */
|