2 * COPYRIGHT: See COPYING in the top level directory
3 * PROJECT: ReactOS TCP/IP protocol driver
4 * FILE: include/tcpcore.h
5 * PURPOSE: Transmission Control Protocol definitions
7 * CSH 01/01-2003 Ported from linux kernel 2.4.20
11 * INET An implementation of the TCP/IP protocol suite for the LINUX
12 * operating system. INET is implemented using the BSD Socket
13 * interface as the means of communication with the user level.
15 * Definitions for the TCP module.
17 * Version: @(#)tcp.h 1.0.5 05/23/93
19 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
20 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
22 * This program is free software; you can redistribute it and/or
23 * modify it under the terms of the GNU General Public License
24 * as published by the Free Software Foundation; either version
25 * 2 of the License, or (at your option) any later version.
39 #define HAVE_ALLOC_SKB /* For the drivers to know */
40 #define HAVE_ALIGNABLE_SKB /* Ditto 8) */
41 #define SLAB_SKB /* Slabified skbuffs */
43 #define CHECKSUM_NONE 0
45 #define CHECKSUM_UNNECESSARY 2
47 #define SKB_DATA_ALIGN(X) (((X) + (SMP_CACHE_BYTES-1)) & ~(SMP_CACHE_BYTES-1))
48 #define SKB_MAX_ORDER(X,ORDER) (((PAGE_SIZE<<(ORDER)) - (X) - sizeof(struct skb_shared_info))&~(SMP_CACHE_BYTES-1))
49 #define SKB_MAX_HEAD(X) (SKB_MAX_ORDER((X),0))
50 #define SKB_MAX_ALLOC (SKB_MAX_ORDER(0,2))
52 /* A. Checksumming of received packets by device.
54 * NONE: device failed to checksum this packet.
55 * skb->csum is undefined.
57 * UNNECESSARY: device parsed packet and wouldbe verified checksum.
58 * skb->csum is undefined.
59 * It is bad option, but, unfortunately, many of vendors do this.
60 * Apparently with secret goal to sell you new device, when you
61 * will add new protocol to your host. F.e. IPv6. 8)
63 * HW: the most generic way. Device supplied checksum of _all_
64 * the packet as seen by netif_rx in skb->csum.
65 * NOTE: Even if device supports only some protocols, but
66 * is able to produce some skb->csum, it MUST use HW,
69 * B. Checksumming on output.
71 * NONE: skb is checksummed by protocol or csum is not required.
73 * HW: device is required to csum packet as seen by hard_start_xmit
74 * from skb->h.raw to the end and to record the checksum
75 * at skb->h.raw+skb->csum.
77 * Device must show its capabilities in dev->features, set
78 * at device setup time.
79 * NETIF_F_HW_CSUM - it is clever device, it is able to checksum
81 * NETIF_F_NO_CSUM - loopback or reliable single hop media.
82 * NETIF_F_IP_CSUM - device is dumb. It is able to csum only
83 * TCP/UDP over IPv4. Sigh. Vendors like this
84 * way by an unknown reason. Though, see comment above
85 * about CHECKSUM_UNNECESSARY. 8)
87 * Any questions? No questions, good. --ANK
91 #define NET_CALLER(arg) (*(((void**)&arg)-1))
93 #define NET_CALLER(arg) __builtin_return_address(0)
#ifdef CONFIG_NETFILTER
/* Reference-counted connection-tracking handle shared by sk_buffs. */
struct nf_conntrack {
	atomic_t use;					/* reference count */
	void (*destroy)(struct nf_conntrack *);		/* called when use drops to 0 */
};

/* Per-skb pointer to its tracked connection, if any. */
struct nf_ct_info {
	struct nf_conntrack *master;
};
#endif
107 struct sk_buff_head
{
108 /* These two members must be first. */
109 struct sk_buff
* next
;
110 struct sk_buff
* prev
;
118 #define MAX_SKB_FRAGS 6
120 typedef struct skb_frag_struct skb_frag_t
;
122 struct skb_frag_struct
129 /* This data is invariant across clones and lives at
130 * the end of the header data, ie. at skb->end.
132 struct skb_shared_info
{
134 unsigned int nr_frags
;
135 struct sk_buff
*frag_list
;
136 skb_frag_t frags
[MAX_SKB_FRAGS
];
140 /* These two members must be first. */
141 struct sk_buff
* next
; /* Next buffer in list */
142 struct sk_buff
* prev
; /* Previous buffer in list */
144 struct sk_buff_head
* list
; /* List we are on */
145 struct sock
*sk
; /* Socket we are owned by */
146 struct timeval stamp
; /* Time we arrived */
147 struct net_device
*dev
; /* Device we arrived on/are leaving by */
149 /* Transport layer header */
154 struct icmphdr
*icmph
;
155 struct igmphdr
*igmph
;
161 /* Network layer header */
165 struct ipv6hdr
*ipv6h
;
171 /* Link layer header */
174 struct ethhdr
*ethernet
;
178 struct dst_entry
*dst
;
181 * This is the control buffer. It is free to use for every
182 * layer. Please put your private variables there. If you
183 * want to keep them across layers you have to do a skb_clone()
184 * first. This is owned by whoever has the skb queued ATM.
188 unsigned int len
; /* Length of actual data */
189 unsigned int data_len
;
190 unsigned int csum
; /* Checksum */
191 unsigned char __unused
, /* Dead field, may be reused */
192 cloned
, /* head may be cloned (check refcnt to be sure). */
193 pkt_type
, /* Packet class */
194 ip_summed
; /* Driver fed us an IP checksum */
195 __u32 priority
; /* Packet queueing priority */
196 atomic_t users
; /* User count - see datagram.c,tcp.c */
197 unsigned short protocol
; /* Packet protocol from driver. */
198 unsigned short security
; /* Security level of packet */
199 unsigned int truesize
; /* Buffer size */
201 unsigned char *head
; /* Head of buffer */
202 unsigned char *data
; /* Data head pointer */
203 unsigned char *tail
; /* Tail pointer */
204 unsigned char *end
; /* End pointer */
206 void (*destructor
)(struct sk_buff
*); /* Destruct function */
207 #ifdef CONFIG_NETFILTER
208 /* Can be used for communication between hooks. */
209 unsigned long nfmark
;
212 /* Associated connection, if any */
213 struct nf_ct_info
*nfct
;
214 #ifdef CONFIG_NETFILTER_DEBUG
215 unsigned int nf_debug
;
217 #endif /*CONFIG_NETFILTER*/
219 #if defined(CONFIG_HIPPI)
225 #ifdef CONFIG_NET_SCHED
226 __u32 tc_index
; /* traffic control index */
230 #define SK_WMEM_MAX 65535
231 #define SK_RMEM_MAX 65535
236 * Handling routines are only of interest to the kernel
239 extern void __kfree_skb(struct sk_buff
*skb
);
240 extern struct sk_buff
* alloc_skb(unsigned int size
, int priority
);
241 extern void kfree_skbmem(struct sk_buff
*skb
);
242 extern struct sk_buff
* skb_clone(struct sk_buff
*skb
, int priority
);
243 extern struct sk_buff
* skb_copy(const struct sk_buff
*skb
, int priority
);
244 extern struct sk_buff
* pskb_copy(struct sk_buff
*skb
, int gfp_mask
);
245 extern int pskb_expand_head(struct sk_buff
*skb
, int nhead
, int ntail
, int gfp_mask
);
246 extern struct sk_buff
* skb_realloc_headroom(struct sk_buff
*skb
, unsigned int headroom
);
247 extern struct sk_buff
* skb_copy_expand(const struct sk_buff
*skb
,
251 #define dev_kfree_skb(a) kfree_skb(a)
252 extern void skb_over_panic(struct sk_buff
*skb
, int len
, void *here
);
253 extern void skb_under_panic(struct sk_buff
*skb
, int len
, void *here
);
256 #define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end))
259 * skb_queue_empty - check if a queue is empty
262 * Returns true if the queue is empty, false otherwise.
265 static __inline
int skb_queue_empty(struct sk_buff_head
*list
)
267 return (list
->next
== (struct sk_buff
*) list
);
271 * skb_get - reference buffer
272 * @skb: buffer to reference
274 * Makes another reference to a socket buffer and returns a pointer
278 static __inline
struct sk_buff
*skb_get(struct sk_buff
*skb
)
280 atomic_inc(&skb
->users
);
285 * If users==1, we are the only owner and are can avoid redundant
290 * kfree_skb - free an sk_buff
291 * @skb: buffer to free
293 * Drop a reference to the buffer and free it if the usage count has
297 static __inline
void kfree_skb(struct sk_buff
*skb
)
299 if (atomic_read(&skb
->users
) == 1 || atomic_dec_and_test(&skb
->users
))
303 /* Use this if you didn't touch the skb state [for fast switching] */
304 static __inline
void kfree_skb_fast(struct sk_buff
*skb
)
306 if (atomic_read(&skb
->users
) == 1 || atomic_dec_and_test(&skb
->users
))
311 * skb_cloned - is the buffer a clone
312 * @skb: buffer to check
314 * Returns true if the buffer was generated with skb_clone() and is
315 * one of multiple shared copies of the buffer. Cloned buffers are
316 * shared data so must not be written to under normal circumstances.
319 static __inline
int skb_cloned(struct sk_buff
*skb
)
321 return skb
->cloned
&& atomic_read(&skb_shinfo(skb
)->dataref
) != 1;
325 * skb_shared - is the buffer shared
326 * @skb: buffer to check
328 * Returns true if more than one person has a reference to this
332 static __inline
int skb_shared(struct sk_buff
*skb
)
334 return (atomic_read(&skb
->users
) != 1);
/**
 *	skb_share_check - check if buffer is shared and if so clone it
 *	@skb: buffer to check
 *	@pri: priority for memory allocation
 *
 *	If the buffer is shared the buffer is cloned and the old copy
 *	drops a reference. A new clone with a single reference is returned.
 *	If the buffer is not shared the original buffer is returned. When
 *	being called from interrupt status or with spinlocks held pri must
 *	be %GFP_ATOMIC.
 *
 *	NULL is returned on a memory allocation failure.
 */
static __inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri)
{
	if (skb_shared(skb)) {
		struct sk_buff *nskb;
		nskb = skb_clone(skb, pri);
		kfree_skb(skb);		/* drop our reference to the shared copy */
		skb = nskb;		/* may be NULL on allocation failure */
	}
	return skb;
}
/*
 *	Copy shared buffers into a new sk_buff. We effectively do COW on
 *	packets to handle cases where we have a local reader and forward
 *	and a couple of other messy ones. The normal one is tcpdumping
 *	a packet thats being forwarded.
 */

/**
 *	skb_unshare - make a copy of a shared buffer
 *	@skb: buffer to check
 *	@pri: priority for memory allocation
 *
 *	If the socket buffer is a clone then this function creates a new
 *	copy of the data, drops a reference count on the old copy and returns
 *	the new copy with the reference count at 1. If the buffer is not a clone
 *	the original buffer is returned. When called with a spinlock held or
 *	from interrupt state @pri must be %GFP_ATOMIC
 *
 *	%NULL is returned on a memory allocation failure.
 */
static __inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri)
{
	struct sk_buff *nskb;

	if (!skb_cloned(skb))
		return skb;
	nskb = skb_copy(skb, pri);
	kfree_skb(skb);		/* Free our shared copy */
	return nskb;
}
396 * @list_: list to peek at
398 * Peek an &sk_buff. Unlike most other operations you _MUST_
399 * be careful with this one. A peek leaves the buffer on the
400 * list and someone else may run off with it. You must hold
401 * the appropriate locks or have a private queue to do this.
403 * Returns %NULL for an empty list or a pointer to the head element.
404 * The reference count is not incremented and the reference is therefore
405 * volatile. Use with caution.
408 static __inline
struct sk_buff
*skb_peek(struct sk_buff_head
*list_
)
410 struct sk_buff
*list
= ((struct sk_buff
*)list_
)->next
;
411 if (list
== (struct sk_buff
*)list_
)
418 * @list_: list to peek at
420 * Peek an &sk_buff. Unlike most other operations you _MUST_
421 * be careful with this one. A peek leaves the buffer on the
422 * list and someone else may run off with it. You must hold
423 * the appropriate locks or have a private queue to do this.
425 * Returns %NULL for an empty list or a pointer to the tail element.
426 * The reference count is not incremented and the reference is therefore
427 * volatile. Use with caution.
430 static __inline
struct sk_buff
*skb_peek_tail(struct sk_buff_head
*list_
)
432 struct sk_buff
*list
= ((struct sk_buff
*)list_
)->prev
;
433 if (list
== (struct sk_buff
*)list_
)
439 * skb_queue_len - get queue length
440 * @list_: list to measure
442 * Return the length of an &sk_buff queue.
445 static __inline __u32
skb_queue_len(struct sk_buff_head
*list_
)
450 static __inline
void skb_queue_head_init(struct sk_buff_head
*list
)
452 spin_lock_init(&list
->lock
);
453 list
->prev
= (struct sk_buff
*)list
;
454 list
->next
= (struct sk_buff
*)list
;
459 * Insert an sk_buff at the start of a list.
461 * The "__skb_xxxx()" functions are the non-atomic ones that
462 * can only be called with interrupts disabled.
466 * __skb_queue_head - queue a buffer at the list head
468 * @newsk: buffer to queue
470 * Queue a buffer at the start of a list. This function takes no locks
471 * and you must therefore hold required locks before calling it.
473 * A buffer cannot be placed on two lists at the same time.
476 static __inline
void __skb_queue_head(struct sk_buff_head
*list
, struct sk_buff
*newsk
)
478 struct sk_buff
*prev
, *next
;
482 prev
= (struct sk_buff
*)list
;
492 * skb_queue_head - queue a buffer at the list head
494 * @newsk: buffer to queue
496 * Queue a buffer at the start of the list. This function takes the
497 * list lock and can be used safely with other locking &sk_buff functions
500 * A buffer cannot be placed on two lists at the same time.
503 static __inline
void skb_queue_head(struct sk_buff_head
*list
, struct sk_buff
*newsk
)
507 spin_lock_irqsave(&list
->lock
, flags
);
508 __skb_queue_head(list
, newsk
);
509 spin_unlock_irqrestore(&list
->lock
, flags
);
513 * __skb_queue_tail - queue a buffer at the list tail
515 * @newsk: buffer to queue
517 * Queue a buffer at the end of a list. This function takes no locks
518 * and you must therefore hold required locks before calling it.
520 * A buffer cannot be placed on two lists at the same time.
524 static __inline
void __skb_queue_tail(struct sk_buff_head
*list
, struct sk_buff
*newsk
)
526 struct sk_buff
*prev
, *next
;
530 next
= (struct sk_buff
*)list
;
539 * skb_queue_tail - queue a buffer at the list tail
541 * @newsk: buffer to queue
543 * Queue a buffer at the tail of the list. This function takes the
544 * list lock and can be used safely with other locking &sk_buff functions
547 * A buffer cannot be placed on two lists at the same time.
550 static __inline
void skb_queue_tail(struct sk_buff_head
*list
, struct sk_buff
*newsk
)
554 spin_lock_irqsave(&list
->lock
, flags
);
555 __skb_queue_tail(list
, newsk
);
556 spin_unlock_irqrestore(&list
->lock
, flags
);
560 * __skb_dequeue - remove from the head of the queue
561 * @list: list to dequeue from
563 * Remove the head of the list. This function does not take any locks
564 * so must be used with appropriate locks held only. The head item is
565 * returned or %NULL if the list is empty.
568 static __inline
struct sk_buff
*__skb_dequeue(struct sk_buff_head
*list
)
570 struct sk_buff
*next
, *prev
, *result
;
572 prev
= (struct sk_buff
*) list
;
589 * skb_dequeue - remove from the head of the queue
590 * @list: list to dequeue from
592 * Remove the head of the list. The list lock is taken so the function
593 * may be used safely with other locking list functions. The head item is
594 * returned or %NULL if the list is empty.
597 static __inline
struct sk_buff
*skb_dequeue(struct sk_buff_head
*list
)
600 struct sk_buff
*result
;
602 spin_lock_irqsave(&list
->lock
, flags
);
603 result
= __skb_dequeue(list
);
604 spin_unlock_irqrestore(&list
->lock
, flags
);
609 * Insert a packet on a list.
612 static __inline
void __skb_insert(struct sk_buff
*newsk
,
613 struct sk_buff
* prev
, struct sk_buff
*next
,
614 struct sk_buff_head
* list
)
625 * skb_insert - insert a buffer
626 * @old: buffer to insert before
627 * @newsk: buffer to insert
629 * Place a packet before a given packet in a list. The list locks are taken
630 * and this function is atomic with respect to other list locked calls
631 * A buffer cannot be placed on two lists at the same time.
634 static __inline
void skb_insert(struct sk_buff
*old
, struct sk_buff
*newsk
)
638 spin_lock_irqsave(&old
->list
->lock
, flags
);
639 __skb_insert(newsk
, old
->prev
, old
, old
->list
);
640 spin_unlock_irqrestore(&old
->list
->lock
, flags
);
644 * Place a packet after a given packet in a list.
647 static __inline
void __skb_append(struct sk_buff
*old
, struct sk_buff
*newsk
)
649 __skb_insert(newsk
, old
, old
->next
, old
->list
);
653 * skb_append - append a buffer
654 * @old: buffer to insert after
655 * @newsk: buffer to insert
657 * Place a packet after a given packet in a list. The list locks are taken
658 * and this function is atomic with respect to other list locked calls.
659 * A buffer cannot be placed on two lists at the same time.
663 static __inline
void skb_append(struct sk_buff
*old
, struct sk_buff
*newsk
)
667 spin_lock_irqsave(&old
->list
->lock
, flags
);
668 __skb_append(old
, newsk
);
669 spin_unlock_irqrestore(&old
->list
->lock
, flags
);
673 * remove sk_buff from list. _Must_ be called atomically, and with
677 static __inline
void __skb_unlink(struct sk_buff
*skb
, struct sk_buff_head
*list
)
679 struct sk_buff
* next
, * prev
;
692 * skb_unlink - remove a buffer from a list
693 * @skb: buffer to remove
695 * Place a packet after a given packet in a list. The list locks are taken
696 * and this function is atomic with respect to other list locked calls
698 * Works even without knowing the list it is sitting on, which can be
699 * handy at times. It also means that THE LIST MUST EXIST when you
700 * unlink. Thus a list must have its contents unlinked before it is
704 static __inline
void skb_unlink(struct sk_buff
*skb
)
706 struct sk_buff_head
*list
= skb
->list
;
711 spin_lock_irqsave(&list
->lock
, flags
);
712 if(skb
->list
== list
)
713 __skb_unlink(skb
, skb
->list
);
714 spin_unlock_irqrestore(&list
->lock
, flags
);
718 /* XXX: more streamlined implementation */
/**
 *	__skb_dequeue_tail - remove from the tail of the queue
 *	@list: list to dequeue from
 *
 *	Remove the tail of the list. This function does not take any locks
 *	so must be used with appropriate locks held only. The tail item is
 *	returned or %NULL if the list is empty.
 */
static __inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
	struct sk_buff *skb = skb_peek_tail(list);
	if (skb)
		__skb_unlink(skb, list);
	return skb;
}
738 * skb_dequeue - remove from the head of the queue
739 * @list: list to dequeue from
741 * Remove the head of the list. The list lock is taken so the function
742 * may be used safely with other locking list functions. The tail item is
743 * returned or %NULL if the list is empty.
746 static __inline
struct sk_buff
*skb_dequeue_tail(struct sk_buff_head
*list
)
749 struct sk_buff
*result
;
751 spin_lock_irqsave(&list
->lock
, flags
);
752 result
= __skb_dequeue_tail(list
);
753 spin_unlock_irqrestore(&list
->lock
, flags
);
757 static __inline
int skb_is_nonlinear(const struct sk_buff
*skb
)
759 return skb
->data_len
;
762 static __inline
int skb_headlen(const struct sk_buff
*skb
)
764 return skb
->len
- skb
->data_len
;
767 #define SKB_PAGE_ASSERT(skb) do { if (skb_shinfo(skb)->nr_frags) out_of_line_bug(); } while (0)
768 #define SKB_FRAG_ASSERT(skb) do { if (skb_shinfo(skb)->frag_list) out_of_line_bug(); } while (0)
769 #define SKB_LINEAR_ASSERT(skb) do { if (skb_is_nonlinear(skb)) out_of_line_bug(); } while (0)
772 * Add data to an sk_buff
775 static __inline
unsigned char *__skb_put(struct sk_buff
*skb
, unsigned int len
)
777 unsigned char *tmp
=skb
->tail
;
778 SKB_LINEAR_ASSERT(skb
);
785 * skb_put - add data to a buffer
786 * @skb: buffer to use
787 * @len: amount of data to add
789 * This function extends the used data area of the buffer. If this would
790 * exceed the total buffer size the kernel will panic. A pointer to the
791 * first byte of the extra data is returned.
794 static __inline
unsigned char *skb_put(struct sk_buff
*skb
, unsigned int len
)
797 unsigned char *tmp
=skb
->tail
;
798 SKB_LINEAR_ASSERT(skb
);
801 if(skb
->tail
>skb
->end
) {
802 skb_over_panic(skb
, len
, current_text_addr());
810 static __inline
unsigned char *__skb_push(struct sk_buff
*skb
, unsigned int len
)
818 * skb_push - add data to the start of a buffer
819 * @skb: buffer to use
820 * @len: amount of data to add
822 * This function extends the used data area of the buffer at the buffer
823 * start. If this would exceed the total buffer headroom the kernel will
824 * panic. A pointer to the first byte of the extra data is returned.
827 static __inline
unsigned char *skb_push(struct sk_buff
*skb
, unsigned int len
)
832 if(skb
->data
<skb
->head
) {
833 skb_under_panic(skb
, len
, current_text_addr());
841 static __inline
char *__skb_pull(struct sk_buff
*skb
, unsigned int len
)
844 if (skb
->len
< skb
->data_len
)
846 return skb
->data
+=len
;
850 * skb_pull - remove data from the start of a buffer
851 * @skb: buffer to use
852 * @len: amount of data to remove
854 * This function removes data from the start of a buffer, returning
855 * the memory to the headroom. A pointer to the next data in the buffer
856 * is returned. Once the data has been pulled future pushes will overwrite
860 static __inline
unsigned char * skb_pull(struct sk_buff
*skb
, unsigned int len
)
864 return __skb_pull(skb
,len
);
867 extern unsigned char * __pskb_pull_tail(struct sk_buff
*skb
, int delta
);
869 static __inline
char *__pskb_pull(struct sk_buff
*skb
, unsigned int len
)
871 if (len
> skb_headlen(skb
) &&
872 __pskb_pull_tail(skb
, len
-skb_headlen(skb
)) == NULL
)
875 return skb
->data
+= len
;
878 static __inline
unsigned char * pskb_pull(struct sk_buff
*skb
, unsigned int len
)
882 return __pskb_pull(skb
,len
);
885 static __inline
int pskb_may_pull(struct sk_buff
*skb
, unsigned int len
)
887 if (len
<= skb_headlen(skb
))
891 return (__pskb_pull_tail(skb
, len
-skb_headlen(skb
)) != NULL
);
895 * skb_headroom - bytes at buffer head
896 * @skb: buffer to check
898 * Return the number of bytes of free space at the head of an &sk_buff.
901 static __inline
int skb_headroom(const struct sk_buff
*skb
)
903 return skb
->data
-skb
->head
;
907 * skb_tailroom - bytes at buffer end
908 * @skb: buffer to check
910 * Return the number of bytes of free space at the tail of an sk_buff
913 static __inline
int skb_tailroom(const struct sk_buff
*skb
)
915 return skb_is_nonlinear(skb
) ? 0 : skb
->end
-skb
->tail
;
919 * skb_reserve - adjust headroom
920 * @skb: buffer to alter
921 * @len: bytes to move
923 * Increase the headroom of an empty &sk_buff by reducing the tail
924 * room. This is only allowed for an empty buffer.
927 static __inline
void skb_reserve(struct sk_buff
*skb
, unsigned int len
)
933 extern int ___pskb_trim(struct sk_buff
*skb
, unsigned int len
, int realloc
);
935 static __inline
void __skb_trim(struct sk_buff
*skb
, unsigned int len
)
937 if (!skb
->data_len
) {
939 skb
->tail
= skb
->data
+len
;
941 ___pskb_trim(skb
, len
, 0);
946 * skb_trim - remove end from a buffer
947 * @skb: buffer to alter
950 * Cut the length of a buffer down by removing data from the tail. If
951 * the buffer is already under the length specified it is not modified.
954 static __inline
void skb_trim(struct sk_buff
*skb
, unsigned int len
)
956 if (skb
->len
> len
) {
957 __skb_trim(skb
, len
);
962 static __inline
int __pskb_trim(struct sk_buff
*skb
, unsigned int len
)
964 if (!skb
->data_len
) {
966 skb
->tail
= skb
->data
+len
;
969 return ___pskb_trim(skb
, len
, 1);
973 static __inline
int pskb_trim(struct sk_buff
*skb
, unsigned int len
)
976 return __pskb_trim(skb
, len
);
981 * skb_orphan - orphan a buffer
982 * @skb: buffer to orphan
984 * If a buffer currently has an owner then we call the owner's
985 * destructor function and make the @skb unowned. The buffer continues
986 * to exist but is no longer charged to its former owner.
990 static __inline
void skb_orphan(struct sk_buff
*skb
)
993 skb
->destructor(skb
);
994 skb
->destructor
= NULL
;
999 * skb_purge - empty a list
1000 * @list: list to empty
1002 * Delete all buffers on an &sk_buff list. Each buffer is removed from
1003 * the list and one reference dropped. This function takes the list
1004 * lock and is atomic with respect to other list locking functions.
1008 static __inline
void skb_queue_purge(struct sk_buff_head
*list
)
1010 struct sk_buff
*skb
;
1011 while ((skb
=skb_dequeue(list
))!=NULL
)
1016 * __skb_purge - empty a list
1017 * @list: list to empty
1019 * Delete all buffers on an &sk_buff list. Each buffer is removed from
1020 * the list and one reference dropped. This function does not take the
1021 * list lock and the caller must hold the relevant locks to use it.
1025 static __inline
void __skb_queue_purge(struct sk_buff_head
*list
)
1027 struct sk_buff
*skb
;
1028 while ((skb
=__skb_dequeue(list
))!=NULL
)
/**
 *	__dev_alloc_skb - allocate an skbuff for sending
 *	@length: length to allocate
 *	@gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *	Allocate a new &sk_buff and assign it a usage count of one. The
 *	buffer has unspecified headroom built in. Users should allocate
 *	the headroom they think they need without accounting for the
 *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned in there is no free memory.
 */
static __inline struct sk_buff *__dev_alloc_skb(unsigned int length,
						int gfp_mask)
{
	struct sk_buff *skb;

	/* 16 extra bytes give drivers room to prepend link-level headers. */
	skb = alloc_skb(length+16, gfp_mask);
	if (skb)
		skb_reserve(skb,16);
	return skb;
}
1057 * dev_alloc_skb - allocate an skbuff for sending
1058 * @length: length to allocate
1060 * Allocate a new &sk_buff and assign it a usage count of one. The
1061 * buffer has unspecified headroom built in. Users should allocate
1062 * the headroom they think they need without accounting for the
1063 * built in space. The built in space is used for optimisations.
1065 * %NULL is returned in there is no free memory. Although this function
1066 * allocates memory it can be called from an interrupt.
1069 static __inline
struct sk_buff
*dev_alloc_skb(unsigned int length
)
1072 return __dev_alloc_skb(length
, GFP_ATOMIC
);
1079 * skb_cow - copy header of skb when it is required
1080 * @skb: buffer to cow
1081 * @headroom: needed headroom
1083 * If the skb passed lacks sufficient headroom or its data part
1084 * is shared, data is reallocated. If reallocation fails, an error
1085 * is returned and original skb is not changed.
1087 * The result is skb with writable area skb->head...skb->tail
1088 * and at least @headroom of space at head.
1092 skb_cow(struct sk_buff
*skb
, unsigned int headroom
)
1095 int delta
= (headroom
> 16 ? headroom
: 16) - skb_headroom(skb
);
1100 if (delta
|| skb_cloned(skb
))
1101 return pskb_expand_head(skb
, (delta
+15)&~15, 0, GFP_ATOMIC
);
1109 * skb_linearize - convert paged skb to linear one
1110 * @skb: buffer to linarize
1111 * @gfp: allocation mode
1113 * If there is no free memory -ENOMEM is returned, otherwise zero
1114 * is returned and the old skb data released. */
1115 int skb_linearize(struct sk_buff
*skb
, int gfp
);
1117 static __inline
void *kmap_skb_frag(const skb_frag_t
*frag
)
1120 #ifdef CONFIG_HIGHMEM
1126 return kmap_atomic(frag
->page
, KM_SKB_DATA_SOFTIRQ
);
1132 static __inline
void kunmap_skb_frag(void *vaddr
)
1135 kunmap_atomic(vaddr
, KM_SKB_DATA_SOFTIRQ
);
1136 #ifdef CONFIG_HIGHMEM
/*
 * skb_queue_walk - iterate @skb over every buffer in @queue.
 * The queue head itself (cast to struct sk_buff *) terminates the walk.
 * The queue must not be modified while iterating.
 */
#define skb_queue_walk(queue, skb) \
        for (skb = (queue)->next;                       \
             (skb != (struct sk_buff *)(queue));        \
             skb = skb->next)
1148 extern struct sk_buff
* skb_recv_datagram(struct sock
*sk
,unsigned flags
,int noblock
, int *err
);
1149 extern unsigned int datagram_poll(struct file
*file
, struct socket
*sock
, struct poll_table_struct
*wait
);
1150 extern int skb_copy_datagram(const struct sk_buff
*from
, int offset
, char *to
,int size
);
1151 extern int skb_copy_datagram_iovec(const struct sk_buff
*from
, int offset
, struct iovec
*to
,int size
);
1152 extern int skb_copy_and_csum_datagram(const struct sk_buff
*skb
, int offset
, u8
*to
, int len
, unsigned int *csump
);
1153 extern int skb_copy_and_csum_datagram_iovec(const struct sk_buff
*skb
, int hlen
, struct iovec
*iov
);
1154 extern void skb_free_datagram(struct sock
* sk
, struct sk_buff
*skb
);
1156 extern unsigned int skb_checksum(const struct sk_buff
*skb
, int offset
, int len
, unsigned int csum
);
1157 extern int skb_copy_bits(const struct sk_buff
*skb
, int offset
, void *to
, int len
);
1158 extern unsigned int skb_copy_and_csum_bits(const struct sk_buff
*skb
, int offset
, u8
*to
, int len
, unsigned int csum
);
1159 extern void skb_copy_and_csum_dev(const struct sk_buff
*skb
, u8
*to
);
1161 extern void skb_init(void);
1162 extern void skb_add_mtu(int mtu
);
1164 #ifdef CONFIG_NETFILTER
1165 static __inline
void
1166 nf_conntrack_put(struct nf_ct_info
*nfct
)
1168 if (nfct
&& atomic_dec_and_test(&nfct
->master
->use
))
1169 nfct
->master
->destroy(nfct
->master
);
1171 static __inline
void
1172 nf_conntrack_get(struct nf_ct_info
*nfct
)
1175 atomic_inc(&nfct
->master
->use
);
1188 typedef struct sockaddr
1195 void * msg_name
; /* Socket name */
1196 int msg_namelen
; /* Length of name */
1197 struct iovec
* msg_iov
; /* Data blocks */
1198 __kernel_size_t msg_iovlen
; /* Number of blocks */
1199 void * msg_control
; /* Per protocol magic (eg BSD file descriptor passing) */
1200 __kernel_size_t msg_controllen
; /* Length of cmsg list */
1205 /* IP protocol blocks we attach to sockets.
1206 * socket layer -> transport layer interface
1207 * transport -> network interface is defined by struct inet_proto
1210 void (*close
)(struct sock
*sk
,
1212 int (*connect
)(struct sock
*sk
,
1213 struct sockaddr
*uaddr
,
1215 int (*disconnect
)(struct sock
*sk
, int flags
);
1217 struct sock
* (*accept
) (struct sock
*sk
, int flags
, int *err
);
1219 int (*ioctl
)(struct sock
*sk
, int cmd
,
1221 int (*init
)(struct sock
*sk
);
1222 int (*destroy
)(struct sock
*sk
);
1223 void (*shutdown
)(struct sock
*sk
, int how
);
1224 int (*setsockopt
)(struct sock
*sk
, int level
,
1225 int optname
, char *optval
, int optlen
);
1226 int (*getsockopt
)(struct sock
*sk
, int level
,
1227 int optname
, char *optval
,
1229 int (*sendmsg
)(struct sock
*sk
, struct msghdr
*msg
,
1231 int (*recvmsg
)(struct sock
*sk
, struct msghdr
*msg
,
1232 int len
, int noblock
, int flags
,
1234 int (*bind
)(struct sock
*sk
,
1235 struct sockaddr
*uaddr
, int addr_len
);
1237 int (*backlog_rcv
) (struct sock
*sk
,
1238 struct sk_buff
*skb
);
1240 /* Keeping track of sk's, looking them up, and port selection methods. */
1241 void (*hash
)(struct sock
*sk
);
1242 void (*unhash
)(struct sock
*sk
);
1243 int (*get_port
)(struct sock
*sk
, unsigned short snum
);
1250 // u8 __pad[SMP_CACHE_BYTES - sizeof(int)];
1251 // } stats[NR_CPUS];
1260 /* This defines a selective acknowledgement block. */
1261 struct tcp_sack_block
{
1268 int tcp_header_len
; /* Bytes of tcp header to send */
1271 * Header prediction flags
1272 * 0x5?10 << 16 + snd_wnd in net byte order
1277 * RFC793 variables by their proper names. This means you can
1278 * read the code and the spec side by side (and laugh ...)
1279 * See RFC793 and RFC1122. The RFC writes these in capitals.
1281 __u32 rcv_nxt
; /* What we want to receive next */
1282 __u32 snd_nxt
; /* Next sequence we send */
1284 __u32 snd_una
; /* First byte we want an ack for */
1285 __u32 snd_sml
; /* Last byte of the most recently transmitted small packet */
1286 __u32 rcv_tstamp
; /* timestamp of last received ACK (for keepalives) */
1287 __u32 lsndtime
; /* timestamp of last sent data packet (for restart window) */
1289 /* Delayed ACK control data */
1291 __u8 pending
; /* ACK is pending */
1292 __u8 quick
; /* Scheduled number of quick acks */
1293 __u8 pingpong
; /* The session is interactive */
1294 __u8 blocked
; /* Delayed ACK was blocked by socket lock*/
1295 __u32 ato
; /* Predicted tick of soft clock */
1296 unsigned long timeout
; /* Currently scheduled timeout */
1297 __u32 lrcvtime
; /* timestamp of last received data packet*/
1298 __u16 last_seg_size
; /* Size of last incoming segment */
1299 __u16 rcv_mss
; /* MSS used for delayed ACK decisions */
1302 /* Data for direct copy to user */
1304 //struct sk_buff_head prequeue;
1305 struct task_struct
*task
;
1311 __u32 snd_wl1
; /* Sequence for window update */
1312 __u32 snd_wnd
; /* The window we expect to receive */
1313 __u32 max_window
; /* Maximal window ever seen from peer */
1314 __u32 pmtu_cookie
; /* Last pmtu seen by socket */
1315 __u16 mss_cache
; /* Cached effective mss, not including SACKS */
1316 __u16 mss_clamp
; /* Maximal mss, negotiated at connection setup */
1317 __u16 ext_header_len
; /* Network protocol overhead (IP/IPv6 options) */
1318 __u8 ca_state
; /* State of fast-retransmit machine */
1319 __u8 retransmits
; /* Number of unrecovered RTO timeouts. */
1321 __u8 reordering
; /* Packet reordering metric. */
1322 __u8 queue_shrunk
; /* Write queue has been shrunk recently.*/
1323 __u8 defer_accept
; /* User waits for some data after accept() */
1325 /* RTT measurement */
1326 __u8 backoff
; /* backoff */
1327 __u32 srtt
; /* smothed round trip time << 3 */
1328 __u32 mdev
; /* medium deviation */
1329 __u32 mdev_max
; /* maximal mdev for the last rtt period */
1330 __u32 rttvar
; /* smoothed mdev_max */
1331 __u32 rtt_seq
; /* sequence number to update rttvar */
1332 __u32 rto
; /* retransmit timeout */
1334 __u32 packets_out
; /* Packets which are "in flight" */
1335 __u32 left_out
; /* Packets which leaved network */
1336 __u32 retrans_out
; /* Retransmitted packets out */
1340 * Slow start and congestion control (see also Nagle, and Karn & Partridge)
1342 __u32 snd_ssthresh
; /* Slow start size threshold */
1343 __u32 snd_cwnd
; /* Sending congestion window */
1344 __u16 snd_cwnd_cnt
; /* Linear increase counter */
1345 __u16 snd_cwnd_clamp
; /* Do not allow snd_cwnd to grow above this */
1346 __u32 snd_cwnd_used
;
1347 __u32 snd_cwnd_stamp
;
1349 /* Two commonly used timers in both sender and receiver paths. */
1350 unsigned long timeout
;
1351 struct timer_list retransmit_timer
; /* Resend (no ack) */
1352 struct timer_list delack_timer
; /* Ack delay */
1354 struct sk_buff_head out_of_order_queue
; /* Out of order segments go here */
1356 struct tcp_func
*af_specific
; /* Operations which are AF_INET{4,6} specific */
1357 struct sk_buff
*send_head
; /* Front of stuff to transmit */
1358 struct page
*sndmsg_page
; /* Cached page for sendmsg */
1359 u32 sndmsg_off
; /* Cached offset for sendmsg */
1361 __u32 rcv_wnd
; /* Current receiver window */
1362 __u32 rcv_wup
; /* rcv_nxt on last window update sent */
1363 __u32 write_seq
; /* Tail(+1) of data held in tcp send buffer */
1364 __u32 pushed_seq
; /* Last pushed seq, required to talk to windows */
1365 __u32 copied_seq
; /* Head of yet unread data */
1367 * Options received (usually on last packet, some only on SYN packets).
1369 char tstamp_ok
, /* TIMESTAMP seen on SYN packet */
1370 wscale_ok
, /* Wscale seen on SYN packet */
1371 sack_ok
; /* SACK seen on SYN packet */
1372 char saw_tstamp
; /* Saw TIMESTAMP on last packet */
1373 __u8 snd_wscale
; /* Window scaling received from sender */
1374 __u8 rcv_wscale
; /* Window scaling to send to receiver */
1375 __u8 nonagle
; /* Disable Nagle algorithm? */
1376 __u8 keepalive_probes
; /* num of allowed keep alive probes */
1378 /* PAWS/RTTM data */
1379 __u32 rcv_tsval
; /* Time stamp value */
1380 __u32 rcv_tsecr
; /* Time stamp echo reply */
1381 __u32 ts_recent
; /* Time stamp to echo next */
1382 long ts_recent_stamp
;/* Time we stored ts_recent (for aging) */
1385 __u16 user_mss
; /* mss requested by user in ioctl */
1386 __u8 dsack
; /* D-SACK is scheduled */
1387 __u8 eff_sacks
; /* Size of SACK array to send with next packet */
1388 struct tcp_sack_block duplicate_sack
[1]; /* D-SACK block */
1389 struct tcp_sack_block selective_acks
[4]; /* The SACKS themselves*/
1391 __u32 window_clamp
; /* Maximal window to advertise */
1392 __u32 rcv_ssthresh
; /* Current window clamp */
1393 __u8 probes_out
; /* unanswered 0 window probes */
1394 __u8 num_sacks
; /* Number of SACK blocks */
1395 __u16 advmss
; /* Advertised MSS */
1397 __u8 syn_retries
; /* num of allowed syn retries */
1398 __u8 ecn_flags
; /* ECN status bits. */
1399 __u16 prior_ssthresh
; /* ssthresh saved at recovery start */
1400 __u32 lost_out
; /* Lost packets */
1401 __u32 sacked_out
; /* SACK'd packets */
1402 __u32 fackets_out
; /* FACK'd packets */
1403 __u32 high_seq
; /* snd_nxt at onset of congestion */
1405 __u32 retrans_stamp
; /* Timestamp of the last retransmit,
1406 * also used in SYN-SENT to remember stamp of
1408 __u32 undo_marker
; /* tracking retrans started here. */
1409 int undo_retrans
; /* number of undoable retransmissions. */
1410 __u32 urg_seq
; /* Seq of received urgent pointer */
1411 __u16 urg_data
; /* Saved octet of OOB data and control flags */
1412 __u8 pending
; /* Scheduled timer event */
1413 __u8 urg_mode
; /* In urgent mode */
1414 __u32 snd_up
; /* Urgent pointer */
1416 /* The syn_wait_lock is necessary only to avoid tcp_get_info having
1417 * to grab the main lock sock while browsing the listening hash
1418 * (otherwise it's deadlock prone).
1419 * This lock is acquired in read mode only from tcp_get_info() and
1420 * it's acquired in write mode _only_ from code that is actively
1421 * changing the syn_wait_queue. All readers that are holding
1422 * the master sock lock don't need to grab this lock in read mode
1423 * too as the syn_wait_queue writes are always protected from
1424 * the main sock lock.
1426 rwlock_t syn_wait_lock
;
1427 struct tcp_listen_opt
*listen_opt
;
1429 /* FIFO of established children */
1430 struct open_request
*accept_queue
;
1431 struct open_request
*accept_queue_tail
;
1433 int write_pending
; /* A write to socket waits to start. */
1435 unsigned int keepalive_time
; /* time before keep alive takes place */
1436 unsigned int keepalive_intvl
; /* time interval between keep alive probes */
1439 unsigned long last_synq_overflow
;
1445 /* This is the per-socket lock. The spinlock provides a synchronization
1446 * between user contexts and software interrupt processing, whereas the
1447 * mini-semaphore synchronizes multiple users amongst themselves.
1452 wait_queue_head_t wq
;
1456 /* Socket demultiplex comparisons on incoming packets. */
1457 __u32 daddr
; /* Foreign IPv4 addr */
1458 __u32 rcv_saddr
; /* Bound local IPv4 addr */
1459 __u16 dport
; /* Destination port */
1460 unsigned short num
; /* Local port */
1461 int bound_dev_if
; /* Bound device index if != 0 */
1463 /* Main hash linkage for various protocol lookup tables. */
1465 struct sock
**pprev
;
1466 struct sock
*bind_next
;
1467 struct sock
**bind_pprev
;
1469 volatile unsigned char state
, /* Connection state */
1470 zapped
; /* In ax25 & ipx means not linked */
1471 __u16 sport
; /* Source port */
1473 unsigned short family
; /* Address family */
1474 unsigned char reuse
; /* SO_REUSEADDR setting */
1475 unsigned char shutdown
;
1476 atomic_t refcnt
; /* Reference count */
1478 socket_lock_t lock
; /* Synchronizer... */
1479 int rcvbuf
; /* Size of receive buffer in bytes */
1481 wait_queue_head_t
*sleep
; /* Sock wait queue */
1482 struct dst_entry
*dst_cache
; /* Destination cache */
1484 atomic_t rmem_alloc
; /* Receive queue bytes committed */
1485 struct sk_buff_head receive_queue
; /* Incoming packets */
1486 atomic_t wmem_alloc
; /* Transmit queue bytes committed */
1487 struct sk_buff_head write_queue
; /* Packet sending queue */
1488 atomic_t omem_alloc
; /* "o" is "option" or "other" */
1489 int wmem_queued
; /* Persistent queue size */
1490 int forward_alloc
; /* Space allocated forward. */
1491 __u32 saddr
; /* Sending source */
1492 unsigned int allocation
; /* Allocation mode */
1493 int sndbuf
; /* Size of send buffer in bytes */
1496 /* Not all are volatile, but some are, so we might as well say they all are.
1497 * XXX Make this a flag word -DaveM
1508 unsigned char debug
;
1509 unsigned char rcvtstamp
;
1510 unsigned char use_write_queue
;
1511 unsigned char userlocks
;
1512 /* Hole of 3 bytes. Try to pack. */
1515 unsigned long lingertime
;
1520 /* The backlog queue is special, it is always used with
1521 * the per-socket spinlock held and requires low latency
1522 * access. Therefore we special case it's implementation.
1525 struct sk_buff
*head
;
1526 struct sk_buff
*tail
;
1529 rwlock_t callback_lock
;
1531 /* Error queue, rarely used. */
1532 struct sk_buff_head error_queue
;
1536 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
1538 struct ipv6_pinfo af_inet6
;
1543 struct tcp_opt af_tcp
;
1544 #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
1545 struct raw_opt tp_raw4
;
1547 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
1548 struct raw6_opt tp_raw
;
1549 #endif /* CONFIG_IPV6 */
1550 #if defined(CONFIG_SPX) || defined (CONFIG_SPX_MODULE)
1551 struct spx_opt af_spx
;
1552 #endif /* CONFIG_SPX */
1556 int err
, err_soft
; /* Soft holds errors that don't
1557 cause failure but are the cause
1558 of a persistent failure not just
1560 unsigned short ack_backlog
;
1561 unsigned short max_ack_backlog
;
1563 unsigned short type
;
1564 unsigned char localroute
; /* Route locally only */
1565 unsigned char protocol
;
1566 // struct ucred peercred;
1571 #ifdef CONFIG_FILTER
1572 /* Socket Filtering Instructions */
1573 struct sk_filter
*filter
;
1574 #endif /* CONFIG_FILTER */
1576 /* This is where all the private (optional) areas that don't
1577 * overlap will eventually live.
1580 void *destruct_hook
;
1581 // struct unix_opt af_unix;
1582 #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
1583 struct inet_opt af_inet
;
1585 #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE)
1586 struct atalk_sock af_at
;
1588 #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE)
1589 struct ipx_opt af_ipx
;
1591 #if defined (CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE)
1594 #if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE)
1595 struct packet_opt
*af_packet
;
1597 #if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE)
1600 #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
1603 #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
1606 #if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE)
1609 #if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE)
1610 struct pppox_opt
*pppox
;
1612 struct netlink_opt
*af_netlink
;
1613 #if defined(CONFIG_ECONET) || defined(CONFIG_ECONET_MODULE)
1614 struct econet_opt
*af_econet
;
1616 #if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
1617 struct atm_vcc
*af_atm
;
1619 #if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE)
1620 struct irda_sock
*irda
;
1622 #if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE)
1623 struct wanpipe_opt
*af_wanpipe
;
1628 /* This part is used for the timeout functions. */
1629 struct timer_list timer
; /* This is the sock cleanup timer. */
1630 struct timeval stamp
;
1632 /* Identd and reporting IO signals */
1633 struct socket
*socket
;
1635 /* RPC layer private data */
1639 void (*state_change
)(struct sock
*sk
);
1640 void (*data_ready
)(struct sock
*sk
,int bytes
);
1641 void (*write_space
)(struct sock
*sk
);
1642 void (*error_report
)(struct sock
*sk
);
1644 int (*backlog_rcv
) (struct sock
*sk
,
1645 struct sk_buff
*skb
);
1646 void (*destruct
)(struct sock
*sk
);
1652 #if 1 /* dst (_NET_DST_H) */
1655 #include <linux/config.h>
1656 #include <net/neighbour.h>
1660 * 0 - no debugging messages
1661 * 1 - rare events and bugs (default)
1664 #define RT_CACHE_DEBUG 0
1666 #define DST_GC_MIN (1*HZ)
1667 #define DST_GC_INC (5*HZ)
1668 #define DST_GC_MAX (120*HZ)
1674 struct dst_entry
*next
;
1675 atomic_t __refcnt
; /* client references */
1677 struct net_device
*dev
;
1681 unsigned long lastuse
;
1682 unsigned long expires
;
1692 unsigned reordering
;
1694 unsigned long rate_last
; /* rate limiting for ICMP */
1695 unsigned long rate_tokens
;
1699 struct neighbour
*neighbour
;
1700 struct hh_cache
*hh
;
1702 int (*input
)(struct sk_buff
*);
1703 int (*output
)(struct sk_buff
*);
1705 #ifdef CONFIG_NET_CLS_ROUTE
1709 struct dst_ops
*ops
;
1717 unsigned short family
;
1718 unsigned short protocol
;
1722 struct dst_entry
* (*check
)(struct dst_entry
*, __u32 cookie
);
1723 struct dst_entry
* (*reroute
)(struct dst_entry
*,
1725 void (*destroy
)(struct dst_entry
*);
1726 struct dst_entry
* (*negative_advice
)(struct dst_entry
*);
1727 void (*link_failure
)(struct sk_buff
*);
1731 kmem_cache_t
*kmem_cachep
;
1736 static __inline
void dst_hold(struct dst_entry
* dst
)
1738 atomic_inc(&dst
->__refcnt
);
1742 struct dst_entry
* dst_clone(struct dst_entry
* dst
)
1745 atomic_inc(&dst
->__refcnt
);
1750 void dst_release(struct dst_entry
* dst
)
1753 atomic_dec(&dst
->__refcnt
);
1756 extern void * dst_alloc(struct dst_ops
* ops
);
1757 extern void __dst_free(struct dst_entry
* dst
);
1758 extern void dst_destroy(struct dst_entry
* dst
);
1761 void dst_free(struct dst_entry
* dst
)
1763 if (dst
->obsolete
> 1)
1765 if (!atomic_read(&dst
->__refcnt
)) {
1772 static __inline
void dst_confirm(struct dst_entry
*dst
)
1775 neigh_confirm(dst
->neighbour
);
1778 static __inline
void dst_negative_advice(struct dst_entry
**dst_p
)
1780 struct dst_entry
* dst
= *dst_p
;
1781 if (dst
&& dst
->ops
->negative_advice
)
1782 *dst_p
= dst
->ops
->negative_advice(dst
);
1785 static __inline
void dst_link_failure(struct sk_buff
*skb
)
1787 struct dst_entry
* dst
= skb
->dst
;
1788 if (dst
&& dst
->ops
&& dst
->ops
->link_failure
)
1789 dst
->ops
->link_failure(skb
);
1792 static __inline
void dst_set_expires(struct dst_entry
*dst
, int timeout
)
1794 unsigned long expires
= jiffies
+ timeout
;
1799 if (dst
->expires
== 0 || (long)(dst
->expires
- expires
) > 0)
1800 dst
->expires
= expires
;
1803 extern void dst_init(void);
1816 #define FASTRETRANS_DEBUG 1
1818 /* Cancel timers, when they are not required. */
1819 #undef TCP_CLEAR_TIMERS
1822 #include <linux/config.h>
1823 #include <linux/tcp.h>
1824 #include <linux/slab.h>
1825 #include <linux/cache.h>
1826 #include <net/checksum.h>
1827 #include <net/sock.h>
1832 /* This is for all connections with a full identity, no wildcards.
1833 * New scheme, half the table is for TIME_WAIT, the other half is
1834 * for the rest. I'll experiment with dynamic table growth later.
1836 struct tcp_ehash_bucket
{
1839 } __attribute__((__aligned__(8)));
1841 /* This is for listening sockets, thus all sockets which possess wildcards. */
1842 #define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
1844 /* There are a few simple rules, which allow for local port reuse by
1845 * an application. In essence:
1847 * 1) Sockets bound to different interfaces may share a local port.
1848 * Failing that, goto test 2.
1849 * 2) If all sockets have sk->reuse set, and none of them are in
1850 * TCP_LISTEN state, the port may be shared.
1851 * Failing that, goto test 3.
1852 * 3) If all sockets are bound to a specific sk->rcv_saddr local
1853 * address, and none of them are the same, the port may be
1855 * Failing this, the port cannot be shared.
1857 * The interesting point, is test #2. This is what an FTP server does
1858 * all day. To optimize this case we use a specific flag bit defined
1859 * below. As we add sockets to a bind bucket list, we perform a
1860 * check of: (newsk->reuse && (newsk->state != TCP_LISTEN))
1861 * As long as all sockets added to a bind bucket pass this test,
1862 * the flag bit will be set.
1863 * The resulting situation is that tcp_v[46]_verify_bind() can just check
1864 * for this flag bit, if it is set and the socket trying to bind has
1865 * sk->reuse set, we don't even have to walk the owners list at all,
1866 * we return that it is ok to bind this socket to the requested local port.
1868 * Sounds like a lot of work, but it is worth it. In a more naive
1869 * implementation (ie. current FreeBSD etc.) the entire list of ports
1870 * must be walked for each data port opened by an ftp server. Needless
1871 * to say, this does not scale at all. With a couple thousand FTP
1872 * users logged onto your box, isn't it nice to know that new data
1873 * ports are created in O(1) time? I thought so. ;-) -DaveM
1875 struct tcp_bind_bucket
{
1876 unsigned short port
;
1877 signed short fastreuse
;
1878 struct tcp_bind_bucket
*next
;
1879 struct sock
*owners
;
1880 struct tcp_bind_bucket
**pprev
;
1883 struct tcp_bind_hashbucket
{
1885 struct tcp_bind_bucket
*chain
;
1888 extern struct tcp_hashinfo
{
1889 /* This is for sockets with full identity only. Sockets here will
1890 * always be without wildcards and will have the following invariant:
1892 * TCP_ESTABLISHED <= sk->state < TCP_CLOSE
1894 * First half of the table is for sockets not in TIME_WAIT, second half
1895 * is for TIME_WAIT sockets only.
1897 struct tcp_ehash_bucket
*__tcp_ehash
;
1899 /* Ok, let's try this, I give up, we do need a local binding
1900 * TCP hash as well as the others for fast bind/connect.
1902 struct tcp_bind_hashbucket
*__tcp_bhash
;
1904 int __tcp_bhash_size
;
1905 int __tcp_ehash_size
;
1907 /* All sockets in TCP_LISTEN state will be in here. This is the only
1908 * table where wildcard'd TCP sockets can exist. Hash function here
1909 * is just local port number.
1911 struct sock
*__tcp_listening_hash
[TCP_LHTABLE_SIZE
];
1913 /* All the above members are written once at bootup and
1914 * never written again _or_ are predominantly read-access.
1916 * Now align to a new cache line as all the following members
1919 rwlock_t __tcp_lhash_lock ____cacheline_aligned
;
1920 atomic_t __tcp_lhash_users
;
1921 wait_queue_head_t __tcp_lhash_wait
;
1922 spinlock_t __tcp_portalloc_lock
;
1925 #define tcp_ehash (tcp_hashinfo.__tcp_ehash)
1926 #define tcp_bhash (tcp_hashinfo.__tcp_bhash)
1927 #define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size)
1928 #define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size)
1929 #define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)
1930 #define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock)
1931 #define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users)
1932 #define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait)
1933 #define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
1935 extern kmem_cache_t
*tcp_bucket_cachep
;
1936 extern struct tcp_bind_bucket
*tcp_bucket_create(struct tcp_bind_hashbucket
*head
,
1937 unsigned short snum
);
1938 extern void tcp_bucket_unlock(struct sock
*sk
);
1939 extern int tcp_port_rover
;
1940 extern struct sock
*tcp_v4_lookup_listener(u32 addr
, unsigned short hnum
, int dif
);
1942 /* These are AF independent. */
1943 static __inline
int tcp_bhashfn(__u16 lport
)
1945 return (lport
& (tcp_bhash_size
- 1));
1948 /* This is a TIME_WAIT bucket. It works around the memory consumption
1949 * problems of sockets in such a state on heavily loaded servers, but
1950 * without violating the protocol specification.
1952 struct tcp_tw_bucket
{
1953 /* These _must_ match the beginning of struct sock precisely.
1954 * XXX Yes I know this is gross, but I'd have to edit every single
1955 * XXX networking file if I created a "struct sock_header". -DaveM
1963 struct sock
**pprev
;
1964 struct sock
*bind_next
;
1965 struct sock
**bind_pprev
;
1966 unsigned char state
,
1967 substate
; /* "zapped" is replaced with "substate" */
1969 unsigned short family
;
1970 unsigned char reuse
,
1971 rcv_wscale
; /* It is also TW bucket specific */
1974 /* And these are ours. */
1981 long ts_recent_stamp
;
1983 struct tcp_bind_bucket
*tb
;
1984 struct tcp_tw_bucket
*next_death
;
1985 struct tcp_tw_bucket
**pprev_death
;
1987 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1988 struct in6_addr v6_daddr
;
1989 struct in6_addr v6_rcv_saddr
;
1993 extern kmem_cache_t
*tcp_timewait_cachep
;
1995 static __inline
void tcp_tw_put(struct tcp_tw_bucket
*tw
)
1997 if (atomic_dec_and_test(&tw
->refcnt
)) {
1998 #ifdef INET_REFCNT_DEBUG
1999 printk(KERN_DEBUG
"tw_bucket %p released\n", tw
);
2001 kmem_cache_free(tcp_timewait_cachep
, tw
);
2005 extern atomic_t tcp_orphan_count
;
2006 extern int tcp_tw_count
;
2007 extern void tcp_time_wait(struct sock
*sk
, int state
, int timeo
);
2008 extern void tcp_timewait_kill(struct tcp_tw_bucket
*tw
);
2009 extern void tcp_tw_schedule(struct tcp_tw_bucket
*tw
, int timeo
);
2010 extern void tcp_tw_deschedule(struct tcp_tw_bucket
*tw
);
2013 /* Socket demux engine toys. */
2015 #define TCP_COMBINED_PORTS(__sport, __dport) \
2016 (((__u32)(__sport)<<16) | (__u32)(__dport))
2017 #else /* __LITTLE_ENDIAN */
2018 #define TCP_COMBINED_PORTS(__sport, __dport) \
2019 (((__u32)(__dport)<<16) | (__u32)(__sport))
2022 #if (BITS_PER_LONG == 64)
2024 #define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
2025 __u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr));
2026 #else /* __LITTLE_ENDIAN */
2027 #define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
2028 __u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr));
2029 #endif /* __BIG_ENDIAN */
2030 #define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
2031 (((*((__u64 *)&((__sk)->daddr)))== (__cookie)) && \
2032 ((*((__u32 *)&((__sk)->dport)))== (__ports)) && \
2033 (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
2034 #else /* 32-bit arch */
2035 #define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr)
2036 #define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
2037 (((__sk)->daddr == (__saddr)) && \
2038 ((__sk)->rcv_saddr == (__daddr)) && \
2039 ((*((__u32 *)&((__sk)->dport)))== (__ports)) && \
2040 (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
2041 #endif /* 64-bit arch */
2043 #define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \
2044 (((*((__u32 *)&((__sk)->dport)))== (__ports)) && \
2045 ((__sk)->family == AF_INET6) && \
2046 !ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.daddr, (__saddr)) && \
2047 !ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.rcv_saddr, (__daddr)) && \
2048 (!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
2050 /* These can have wildcards, don't try too hard. */
2051 static __inline
int tcp_lhashfn(unsigned short num
)
2054 return num
& (TCP_LHTABLE_SIZE
- 1);
2060 static __inline
int tcp_sk_listen_hashfn(struct sock
*sk
)
2063 return tcp_lhashfn(sk
->num
);
2069 #define MAX_TCP_HEADER (128 + MAX_HEADER)
2072 * Never offer a window over 32767 without using window scaling. Some
2073 * poor stacks do signed 16bit maths!
2075 #define MAX_TCP_WINDOW 32767U
2077 /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
2078 #define TCP_MIN_MSS 88U
2080 /* Minimal RCV_MSS. */
2081 #define TCP_MIN_RCVMSS 536U
2083 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
2084 #define TCP_FASTRETRANS_THRESH 3
2086 /* Maximal reordering. */
2087 #define TCP_MAX_REORDERING 127
2089 /* Maximal number of ACKs sent quickly to accelerate slow-start. */
2090 #define TCP_MAX_QUICKACKS 16U
2092 /* urg_data states */
2093 #define TCP_URG_VALID 0x0100
2094 #define TCP_URG_NOTYET 0x0200
2095 #define TCP_URG_READ 0x0400
2097 #define TCP_RETR1 3 /*
2098 * This is how many retries it does before it
2099 * tries to figure out if the gateway is
2100 * down. Minimal RFC value is 3; it corresponds
2101 * to ~3sec-8min depending on RTO.
2104 #define TCP_RETR2 15 /*
2105 * This should take at least
2106 * 90 minutes to time out.
2107 * RFC1122 says that the limit is 100 sec.
2108 * 15 is ~13-30min depending on RTO.
2111 #define TCP_SYN_RETRIES 5 /* number of times to retry active opening a
2112 * connection: ~180sec is RFC minimum */
2114 #define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a
2115 * connection: ~180sec is RFC minimum */
2118 #define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned
2119 * socket. 7 is ~50sec-16min.
2123 #define TCP_TIMEWAIT_LEN (60*1000)
2124 //#define TCP_TIMEWAIT_LEN (60*HZ)
2125 /* how long to wait to destroy TIME-WAIT
2126 * state, about 60 seconds */
2127 #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN
2128 /* BSD style FIN_WAIT2 deadlock breaker.
2129 * It used to be 3min, new value is 60sec,
2130 * to combine FIN-WAIT-2 timeout with
2134 #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
2136 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
2137 #define TCP_ATO_MIN ((unsigned)(HZ/25))
2139 #define TCP_DELACK_MIN 4U
2140 #define TCP_ATO_MIN 4U
2142 #define TCP_RTO_MAX ((unsigned)(120*HZ))
2143 #define TCP_RTO_MIN ((unsigned)(HZ/5))
2144 #define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */
2146 #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
2147 * for local resources.
2150 #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
2151 #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
2152 #define TCP_KEEPALIVE_INTVL (75*HZ)
2154 #define MAX_TCP_KEEPIDLE 32767
2155 #define MAX_TCP_KEEPINTVL 32767
2156 #define MAX_TCP_KEEPCNT 127
2157 #define MAX_TCP_SYNCNT 127
2159 /* TIME_WAIT reaping mechanism. */
2160 #define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
2161 #define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
2163 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
2164 #define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
2166 #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
2167 #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
2168 * after this time. It should be equal
2169 * (or greater than) TCP_TIMEWAIT_LEN
2170 * to provide reliability equal to one
2171 * provided by timewait state.
2173 #define TCP_PAWS_WINDOW 1 /* Replay window for per-host
2174 * timestamps. It must be less than
2175 * minimal timewait lifetime.
2178 #define TCP_TW_RECYCLE_SLOTS_LOG 5
2179 #define TCP_TW_RECYCLE_SLOTS (1<<TCP_TW_RECYCLE_SLOTS_LOG)
2181 /* If time > 4sec, it is "slow" path, no recycling is required,
2182 so that we select tick to get range about 4 seconds.
2186 #if HZ <= 16 || HZ > 4096
2187 # error Unsupported: HZ <= 16 or HZ > 4096
2189 # define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
2191 # define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG)
2193 # define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG)
2195 # define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG)
2197 # define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG)
2199 # define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
2201 # define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
2203 # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
2206 #define TCP_TW_RECYCLE_TICK (0)
2213 #define TCPOPT_NOP 1 /* Padding */
2214 #define TCPOPT_EOL 0 /* End of options */
2215 #define TCPOPT_MSS 2 /* Segment size negotiating */
2216 #define TCPOPT_WINDOW 3 /* Window scaling */
2217 #define TCPOPT_SACK_PERM 4 /* SACK Permitted */
2218 #define TCPOPT_SACK 5 /* SACK Block */
2219 #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
2222 * TCP option lengths
2225 #define TCPOLEN_MSS 4
2226 #define TCPOLEN_WINDOW 3
2227 #define TCPOLEN_SACK_PERM 2
2228 #define TCPOLEN_TIMESTAMP 10
2230 /* But this is what stacks really send out. */
2231 #define TCPOLEN_TSTAMP_ALIGNED 12
2232 #define TCPOLEN_WSCALE_ALIGNED 4
2233 #define TCPOLEN_SACKPERM_ALIGNED 4
2234 #define TCPOLEN_SACK_BASE 2
2235 #define TCPOLEN_SACK_BASE_ALIGNED 4
2236 #define TCPOLEN_SACK_PERBLOCK 8
2238 #define TCP_TIME_RETRANS 1 /* Retransmit timer */
2239 #define TCP_TIME_DACK 2 /* Delayed ack timer */
2240 #define TCP_TIME_PROBE0 3 /* Zero window probe timer */
2241 #define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */
2244 /* sysctl variables for tcp */
2245 extern int sysctl_max_syn_backlog
;
2246 extern int sysctl_tcp_timestamps
;
2247 extern int sysctl_tcp_window_scaling
;
2248 extern int sysctl_tcp_sack
;
2249 extern int sysctl_tcp_fin_timeout
;
2250 extern int sysctl_tcp_tw_recycle
;
2251 extern int sysctl_tcp_keepalive_time
;
2252 extern int sysctl_tcp_keepalive_probes
;
2253 extern int sysctl_tcp_keepalive_intvl
;
2254 extern int sysctl_tcp_syn_retries
;
2255 extern int sysctl_tcp_synack_retries
;
2256 extern int sysctl_tcp_retries1
;
2257 extern int sysctl_tcp_retries2
;
2258 extern int sysctl_tcp_orphan_retries
;
2259 extern int sysctl_tcp_syncookies
;
2260 extern int sysctl_tcp_retrans_collapse
;
2261 extern int sysctl_tcp_stdurg
;
2262 extern int sysctl_tcp_rfc1337
;
2263 extern int sysctl_tcp_abort_on_overflow
;
2264 extern int sysctl_tcp_max_orphans
;
2265 extern int sysctl_tcp_max_tw_buckets
;
2266 extern int sysctl_tcp_fack
;
2267 extern int sysctl_tcp_reordering
;
2268 extern int sysctl_tcp_ecn
;
2269 extern int sysctl_tcp_dsack
;
2270 extern int sysctl_tcp_mem
[3];
2271 extern int sysctl_tcp_wmem
[3];
2272 extern int sysctl_tcp_rmem
[3];
2273 extern int sysctl_tcp_app_win
;
2274 extern int sysctl_tcp_adv_win_scale
;
2275 extern int sysctl_tcp_tw_reuse
;
2278 extern atomic_t tcp_memory_allocated
;
2279 extern atomic_t tcp_sockets_allocated
;
2280 extern int tcp_memory_pressure
;
2282 struct open_request
;
2284 struct or_calltable
{
2286 int (*rtx_syn_ack
) (struct sock
*sk
, struct open_request
*req
, struct dst_entry
*);
2287 void (*send_ack
) (struct sk_buff
*skb
, struct open_request
*req
);
2288 void (*destructor
) (struct open_request
*req
);
2289 void (*send_reset
) (struct sk_buff
*skb
);
2292 struct tcp_v4_open_req
{
2295 struct ip_options
*opt
;
2298 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
2299 struct tcp_v6_open_req
{
2300 struct in6_addr loc_addr
;
2301 struct in6_addr rmt_addr
;
2302 struct sk_buff
*pktopts
;
2307 /* this structure is too big */
2308 struct open_request
{
2309 struct open_request
*dl_next
; /* Must be first member! */
2316 __u16 snd_wscale
: 4,
2323 /* The following two fields can be easily recomputed I think -AK */
2324 __u32 window_clamp
; /* window clamp at creation time */
2325 __u32 rcv_wnd
; /* rcv_wnd offered first time */
2327 unsigned long expires
;
2328 struct or_calltable
*class;
2331 struct tcp_v4_open_req v4_req
;
2332 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
2333 struct tcp_v6_open_req v6_req
;
2338 /* SLAB cache for open requests. */
2339 extern kmem_cache_t
*tcp_openreq_cachep
;
2341 #define tcp_openreq_alloc() kmem_cache_alloc(tcp_openreq_cachep, SLAB_ATOMIC)
2342 #define tcp_openreq_fastfree(req) kmem_cache_free(tcp_openreq_cachep, req)
2344 static __inline
void tcp_openreq_free(struct open_request
*req
)
2346 req
->class->destructor(req
);
2347 tcp_openreq_fastfree(req
);
2350 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2351 #define TCP_INET_FAMILY(fam) ((fam) == AF_INET)
2353 #define TCP_INET_FAMILY(fam) 1
2357 * Pointers to address related TCP functions
2358 * (i.e. things that depend on the address family)
2360 * BUGGG_FUTURE: all the idea behind this struct is wrong.
2361 * It mixes socket frontend with transport function.
2362 * With port sharing between IPv6/v4 it gives the only advantage,
2363 * only poor IPv6 needs to permanently recheck, that it
2364 * is still IPv6 8)8) It must be cleaned up as soon as possible.
2369 int (*queue_xmit
) (struct sk_buff
*skb
);
2371 void (*send_check
) (struct sock
*sk
,
2374 struct sk_buff
*skb
);
2376 int (*rebuild_header
) (struct sock
*sk
);
2378 int (*conn_request
) (struct sock
*sk
,
2379 struct sk_buff
*skb
);
2381 struct sock
* (*syn_recv_sock
) (struct sock
*sk
,
2382 struct sk_buff
*skb
,
2383 struct open_request
*req
,
2384 struct dst_entry
*dst
);
2386 int (*remember_stamp
) (struct sock
*sk
);
2388 __u16 net_header_len
;
2390 int (*setsockopt
) (struct sock
*sk
,
2396 int (*getsockopt
) (struct sock
*sk
,
2403 void (*addr2sockaddr
) (struct sock
*sk
,
2410 * The next routines deal with comparing 32 bit unsigned ints
2411 * and worry about wraparound (automatic with unsigned arithmetic).
2414 extern __inline
int before(__u32 seq1
, __u32 seq2
)
2416 return (__s32
)(seq1
-seq2
) < 0;
2419 extern __inline
int after(__u32 seq1
, __u32 seq2
)
2421 return (__s32
)(seq2
-seq1
) < 0;
2425 /* is s2<=s1<=s3 ? */
2426 extern __inline
int between(__u32 seq1
, __u32 seq2
, __u32 seq3
)
2428 return seq3
- seq2
>= seq1
- seq2
;
2432 extern struct proto tcp_prot
;
2434 #ifdef ROS_STATISTICS
2435 extern struct tcp_mib tcp_statistics
[NR_CPUS
*2];
2437 #define TCP_INC_STATS(field) SNMP_INC_STATS(tcp_statistics, field)
2438 #define TCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(tcp_statistics, field)
2439 #define TCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(tcp_statistics, field)
2442 extern void tcp_put_port(struct sock
*sk
);
2443 extern void __tcp_put_port(struct sock
*sk
);
2444 extern void tcp_inherit_port(struct sock
*sk
, struct sock
*child
);
2446 extern void tcp_v4_err(struct sk_buff
*skb
, u32
);
2448 extern void tcp_shutdown (struct sock
*sk
, int how
);
2450 extern int tcp_v4_rcv(struct sk_buff
*skb
);
2452 extern int tcp_v4_remember_stamp(struct sock
*sk
);
2454 extern int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket
*tw
);
2456 extern int tcp_sendmsg(struct sock
*sk
, struct msghdr
*msg
, int size
);
2457 extern ssize_t
tcp_sendpage(struct socket
*sock
, struct page
*page
, int offset
, size_t size
, int flags
);
2459 extern int tcp_ioctl(struct sock
*sk
,
2463 extern int tcp_rcv_state_process(struct sock
*sk
,
2464 struct sk_buff
*skb
,
2468 extern int tcp_rcv_established(struct sock
*sk
,
2469 struct sk_buff
*skb
,
2473 enum tcp_ack_state_t
2480 static __inline
void tcp_schedule_ack(struct tcp_opt
*tp
)
2482 tp
->ack
.pending
|= TCP_ACK_SCHED
;
2485 static __inline
int tcp_ack_scheduled(struct tcp_opt
*tp
)
2487 return tp
->ack
.pending
&TCP_ACK_SCHED
;
2490 static __inline
void tcp_dec_quickack_mode(struct tcp_opt
*tp
)
2492 if (tp
->ack
.quick
&& --tp
->ack
.quick
== 0) {
2493 /* Leaving quickack mode we deflate ATO. */
2494 tp
->ack
.ato
= TCP_ATO_MIN
;
2498 extern void tcp_enter_quickack_mode(struct tcp_opt
*tp
);
2500 static __inline
void tcp_delack_init(struct tcp_opt
*tp
)
2502 memset(&tp
->ack
, 0, sizeof(tp
->ack
));
2505 static __inline
void tcp_clear_options(struct tcp_opt
*tp
)
2507 tp
->tstamp_ok
= tp
->sack_ok
= tp
->wscale_ok
= tp
->snd_wscale
= 0;
2519 extern enum tcp_tw_status
tcp_timewait_state_process(struct tcp_tw_bucket
*tw
,
2520 struct sk_buff
*skb
,
2524 extern struct sock
* tcp_check_req(struct sock
*sk
,struct sk_buff
*skb
,
2525 struct open_request
*req
,
2526 struct open_request
**prev
);
2527 extern int tcp_child_process(struct sock
*parent
,
2529 struct sk_buff
*skb
);
2530 extern void tcp_enter_loss(struct sock
*sk
, int how
);
2531 extern void tcp_clear_retrans(struct tcp_opt
*tp
);
2532 extern void tcp_update_metrics(struct sock
*sk
);
2534 extern void tcp_close(struct sock
*sk
,
2536 extern struct sock
* tcp_accept(struct sock
*sk
, int flags
, int *err
);
2537 extern unsigned int tcp_poll(struct file
* file
, struct socket
*sock
, struct poll_table_struct
*wait
);
2538 extern void tcp_write_space(struct sock
*sk
);
2540 extern int tcp_getsockopt(struct sock
*sk
, int level
,
2541 int optname
, char *optval
,
2543 extern int tcp_setsockopt(struct sock
*sk
, int level
,
2544 int optname
, char *optval
,
2546 extern void tcp_set_keepalive(struct sock
*sk
, int val
);
2547 extern int tcp_recvmsg(struct sock
*sk
,
2549 int len
, int nonblock
,
2550 int flags
, int *addr_len
);
2552 extern int tcp_listen_start(struct sock
*sk
);
2554 extern void tcp_parse_options(struct sk_buff
*skb
,
2559 * TCP v4 functions exported for the inet6 API
2562 extern int tcp_v4_rebuild_header(struct sock
*sk
);
2564 extern int tcp_v4_build_header(struct sock
*sk
,
2565 struct sk_buff
*skb
);
2567 extern void tcp_v4_send_check(struct sock
*sk
,
2568 struct tcphdr
*th
, int len
,
2569 struct sk_buff
*skb
);
2571 extern int tcp_v4_conn_request(struct sock
*sk
,
2572 struct sk_buff
*skb
);
2574 extern struct sock
* tcp_create_openreq_child(struct sock
*sk
,
2575 struct open_request
*req
,
2576 struct sk_buff
*skb
);
2578 extern struct sock
* tcp_v4_syn_recv_sock(struct sock
*sk
,
2579 struct sk_buff
*skb
,
2580 struct open_request
*req
,
2581 struct dst_entry
*dst
);
2583 extern int tcp_v4_do_rcv(struct sock
*sk
,
2584 struct sk_buff
*skb
);
2586 extern int tcp_v4_connect(struct sock
*sk
,
2587 struct sockaddr
*uaddr
,
2590 extern int tcp_connect(struct sock
*sk
);
2592 extern struct sk_buff
* tcp_make_synack(struct sock
*sk
,
2593 struct dst_entry
*dst
,
2594 struct open_request
*req
);
2596 extern int tcp_disconnect(struct sock
*sk
, int flags
);
2598 extern void tcp_unhash(struct sock
*sk
);
2600 extern int tcp_v4_hash_connecting(struct sock
*sk
);
2603 /* From syncookies.c */
2604 extern struct sock
*cookie_v4_check(struct sock
*sk
, struct sk_buff
*skb
,
2605 struct ip_options
*opt
);
2606 extern __u32
cookie_v4_init_sequence(struct sock
*sk
, struct sk_buff
*skb
,
2611 extern int tcp_write_xmit(struct sock
*, int nonagle
);
2612 extern int tcp_retransmit_skb(struct sock
*, struct sk_buff
*);
2613 extern void tcp_xmit_retransmit_queue(struct sock
*);
2614 extern void tcp_simple_retransmit(struct sock
*);
2616 extern void tcp_send_probe0(struct sock
*);
2617 extern void tcp_send_partial(struct sock
*);
2618 extern int tcp_write_wakeup(struct sock
*);
2619 extern void tcp_send_fin(struct sock
*sk
);
2620 extern void tcp_send_active_reset(struct sock
*sk
, int priority
);
2621 extern int tcp_send_synack(struct sock
*);
2622 extern int tcp_transmit_skb(struct sock
*, struct sk_buff
*);
2623 extern void tcp_send_skb(struct sock
*, struct sk_buff
*, int force_queue
, unsigned mss_now
);
2624 extern void tcp_push_one(struct sock
*, unsigned mss_now
);
2625 extern void tcp_send_ack(struct sock
*sk
);
2626 extern void tcp_send_delayed_ack(struct sock
*sk
);
2629 extern void tcp_init_xmit_timers(struct sock
*);
2630 extern void tcp_clear_xmit_timers(struct sock
*);
2632 extern void tcp_delete_keepalive_timer (struct sock
*);
2633 extern void tcp_reset_keepalive_timer (struct sock
*, unsigned long);
2634 extern int tcp_sync_mss(struct sock
*sk
, u32 pmtu
);
2636 extern const char timer_bug_msg
[];
2638 /* Read 'sendfile()'-style from a TCP socket */
2639 typedef int (*sk_read_actor_t
)(read_descriptor_t
*, struct sk_buff
*,
2640 unsigned int, size_t);
2641 extern int tcp_read_sock(struct sock
*sk
, read_descriptor_t
*desc
,
2642 sk_read_actor_t recv_actor
);
2644 static __inline
void tcp_clear_xmit_timer(struct sock
*sk
, int what
)
2647 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2650 case TCP_TIME_RETRANS
:
2651 case TCP_TIME_PROBE0
:
2654 #ifdef TCP_CLEAR_TIMERS
2655 if (timer_pending(&tp
->retransmit_timer
) &&
2656 del_timer(&tp
->retransmit_timer
))
2661 tp
->ack
.blocked
= 0;
2662 tp
->ack
.pending
= 0;
2664 #ifdef TCP_CLEAR_TIMERS
2665 if (timer_pending(&tp
->delack_timer
) &&
2666 del_timer(&tp
->delack_timer
))
2671 printk(timer_bug_msg
);
2678 * Reset the retransmission timer
2680 static __inline
void tcp_reset_xmit_timer(struct sock
*sk
, int what
, unsigned long when
)
2683 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2685 if (when
> TCP_RTO_MAX
) {
2687 printk(KERN_DEBUG
"reset_xmit_timer sk=%p %d when=0x%lx, caller=%p\n", sk
, what
, when
, current_text_addr());
2693 case TCP_TIME_RETRANS
:
2694 case TCP_TIME_PROBE0
:
2696 tp
->timeout
= jiffies
+when
;
2697 if (!mod_timer(&tp
->retransmit_timer
, tp
->timeout
))
2702 tp
->ack
.pending
|= TCP_ACK_TIMER
;
2703 tp
->ack
.timeout
= jiffies
+when
;
2704 if (!mod_timer(&tp
->delack_timer
, tp
->ack
.timeout
))
2709 printk(KERN_DEBUG
"bug: unknown timer value\n");
2714 /* Compute the current effective MSS, taking SACKs and IP options,
2715 * and even PMTU discovery events into account.
2718 static __inline
unsigned int tcp_current_mss(struct sock
*sk
)
2721 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2722 struct dst_entry
*dst
= __sk_dst_get(sk
);
2723 int mss_now
= tp
->mss_cache
;
2725 if (dst
&& dst
->pmtu
!= tp
->pmtu_cookie
)
2726 mss_now
= tcp_sync_mss(sk
, dst
->pmtu
);
2729 mss_now
-= (TCPOLEN_SACK_BASE_ALIGNED
+
2730 (tp
->eff_sacks
* TCPOLEN_SACK_PERBLOCK
));
2737 /* Initialize RCV_MSS value.
2738 * RCV_MSS is an our guess about MSS used by the peer.
2739 * We haven't any direct information about the MSS.
2740 * It's better to underestimate the RCV_MSS rather than overestimate.
2741 * Overestimations make us ACKing less frequently than needed.
2742 * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
2745 static __inline
void tcp_initialize_rcv_mss(struct sock
*sk
)
2748 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
2749 unsigned int hint
= min(tp
->advmss
, tp
->mss_cache
);
2751 hint
= min(hint
, tp
->rcv_wnd
/2);
2752 hint
= min(hint
, TCP_MIN_RCVMSS
);
2753 hint
= max(hint
, TCP_MIN_MSS
);
2755 tp
->ack
.rcv_mss
= hint
;
2759 static __inline
void __tcp_fast_path_on(struct tcp_opt
*tp
, u32 snd_wnd
)
2762 tp
->pred_flags
= htonl((tp
->tcp_header_len
<< 26) |
2763 ntohl(TCP_FLAG_ACK
) |
2768 static __inline
void tcp_fast_path_on(struct tcp_opt
*tp
)
2771 __tcp_fast_path_on(tp
, tp
->snd_wnd
>>tp
->snd_wscale
);
2775 static __inline
void tcp_fast_path_check(struct sock
*sk
, struct tcp_opt
*tp
)
2778 if (skb_queue_len(&tp
->out_of_order_queue
) == 0 &&
2780 atomic_read(&sk
->rmem_alloc
) < sk
->rcvbuf
&&
2782 tcp_fast_path_on(tp
);
2786 /* Compute the actual receive window we are currently advertising.
2787 * Rcv_nxt can be after the window if our peer push more data
2788 * than the offered window.
2790 static __inline u32
tcp_receive_window(struct tcp_opt
*tp
)
2793 s32 win
= tp
->rcv_wup
+ tp
->rcv_wnd
- tp
->rcv_nxt
;
2803 /* Choose a new window, without checks for shrinking, and without
2804 * scaling applied to the result. The caller does these things
2805 * if necessary. This is a "raw" window selection.
2807 extern u32
__tcp_select_window(struct sock
*sk
);
2809 /* TCP timestamps are only 32-bits, this causes a slight
2810 * complication on 64-bit systems since we store a snapshot
2811 * of jiffies in the buffer control blocks below. We deliberately
2812 * use only the low 32 bits of jiffies and hide the ugly
2813 * casts with the following macro.
2815 #define tcp_time_stamp ((__u32)(jiffies))
2817 /* This is what the send packet queueing engine uses to pass
2818 * TCP per-packet control information to the transmission
2819 * code. We also store the host-order sequence numbers in
2820 * here too. This is 36 bytes on 32-bit architectures,
2821 * 40 bytes on 64-bit machines, if this grows please adjust
2822 * skbuff.h:skbuff->cb[xxx] size appropriately.
2827 struct inet_skb_parm h4
;
2829 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
2830 struct inet6_skb_parm h6
;
2832 } header
; /* For incoming frames */
2833 __u32 seq
; /* Starting sequence number */
2834 __u32 end_seq
; /* SEQ + FIN + SYN + datalen */
2835 __u32 when
; /* used to compute rtt's */
2836 __u8 flags
; /* TCP header flags. */
2838 /* NOTE: These must match up to the flags byte in a
2841 #define TCPCB_FLAG_FIN 0x01
2842 #define TCPCB_FLAG_SYN 0x02
2843 #define TCPCB_FLAG_RST 0x04
2844 #define TCPCB_FLAG_PSH 0x08
2845 #define TCPCB_FLAG_ACK 0x10
2846 #define TCPCB_FLAG_URG 0x20
2847 #define TCPCB_FLAG_ECE 0x40
2848 #define TCPCB_FLAG_CWR 0x80
2850 __u8 sacked
; /* State flags for SACK/FACK. */
2851 #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
2852 #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
2853 #define TCPCB_LOST 0x04 /* SKB is lost */
2854 #define TCPCB_TAGBITS 0x07 /* All tag bits */
2856 #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
2857 #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS)
2859 #define TCPCB_URG 0x20 /* Urgent pointer advanced here */
2861 #define TCPCB_AT_TAIL (TCPCB_URG)
2863 __u16 urg_ptr
; /* Valid w/URG flags is set. */
2864 __u32 ack_seq
; /* Sequence number ACK'd */
2867 #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
2869 #define for_retrans_queue(skb, sk, tp) \
2870 for (skb = (sk)->write_queue.next; \
2871 (skb != (tp)->send_head) && \
2872 (skb != (struct sk_buff *)&(sk)->write_queue); \
2876 //#include <net/tcp_ecn.h>
2880 * Compute minimal free write space needed to queue new packets.
2882 static __inline
int tcp_min_write_space(struct sock
*sk
)
2885 return sk
->wmem_queued
/2;
2891 static __inline
int tcp_wspace(struct sock
*sk
)
2894 return sk
->sndbuf
- sk
->wmem_queued
;
2901 /* This determines how many packets are "in the network" to the best
2902 * of our knowledge. In many cases it is conservative, but where
2903 * detailed information is available from the receiver (via SACK
2904 * blocks etc.) we can make more aggressive calculations.
2906 * Use this for decisions involving congestion control, use just
2907 * tp->packets_out to determine if the send queue is empty or not.
2909 * Read this equation as:
2911 * "Packets sent once on transmission queue" MINUS
2912 * "Packets left network, but not honestly ACKed yet" PLUS
2913 * "Packets fast retransmitted"
2915 static __inline
unsigned int tcp_packets_in_flight(struct tcp_opt
*tp
)
2918 return tp
->packets_out
- tp
->left_out
+ tp
->retrans_out
;
2924 /* Recalculate snd_ssthresh, we want to set it to:
2926 * one half the current congestion window, but no
2927 * less than two segments
2929 static __inline __u32
tcp_recalc_ssthresh(struct tcp_opt
*tp
)
2932 return max(tp
->snd_cwnd
>> 1U, 2U);
2938 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
2939 * The exception is rate halving phase, when cwnd is decreasing towards
2942 static __inline __u32
tcp_current_ssthresh(struct tcp_opt
*tp
)
2945 if ((1<<tp
->ca_state
)&(TCPF_CA_CWR
|TCPF_CA_Recovery
))
2946 return tp
->snd_ssthresh
;
2948 return max(tp
->snd_ssthresh
,
2949 ((tp
->snd_cwnd
>> 1) +
2950 (tp
->snd_cwnd
>> 2)));
2956 static __inline
void tcp_sync_left_out(struct tcp_opt
*tp
)
2959 if (tp
->sack_ok
&& tp
->sacked_out
>= tp
->packets_out
- tp
->lost_out
)
2960 tp
->sacked_out
= tp
->packets_out
- tp
->lost_out
;
2961 tp
->left_out
= tp
->sacked_out
+ tp
->lost_out
;
2965 extern void tcp_cwnd_application_limited(struct sock
*sk
);
2967 /* Congestion window validation. (RFC2861) */
2969 static __inline
void tcp_cwnd_validate(struct sock
*sk
, struct tcp_opt
*tp
)
2972 if (tp
->packets_out
>= tp
->snd_cwnd
) {
2973 /* Network is feed fully. */
2974 tp
->snd_cwnd_used
= 0;
2975 tp
->snd_cwnd_stamp
= tcp_time_stamp
;
2977 /* Network starves. */
2978 if (tp
->packets_out
> tp
->snd_cwnd_used
)
2979 tp
->snd_cwnd_used
= tp
->packets_out
;
2981 if ((s32
)(tcp_time_stamp
- tp
->snd_cwnd_stamp
) >= tp
->rto
)
2982 tcp_cwnd_application_limited(sk
);
2987 /* Set slow start threshold and cwnd not falling to slow start */
2988 static __inline
void __tcp_enter_cwr(struct tcp_opt
*tp
)
2991 tp
->undo_marker
= 0;
2992 tp
->snd_ssthresh
= tcp_recalc_ssthresh(tp
);
2993 tp
->snd_cwnd
= min(tp
->snd_cwnd
,
2994 tcp_packets_in_flight(tp
) + 1U);
2995 tp
->snd_cwnd_cnt
= 0;
2996 tp
->high_seq
= tp
->snd_nxt
;
2997 tp
->snd_cwnd_stamp
= tcp_time_stamp
;
2998 TCP_ECN_queue_cwr(tp
);
3002 static __inline
void tcp_enter_cwr(struct tcp_opt
*tp
)
3005 tp
->prior_ssthresh
= 0;
3006 if (tp
->ca_state
< TCP_CA_CWR
) {
3007 __tcp_enter_cwr(tp
);
3008 tp
->ca_state
= TCP_CA_CWR
;
3013 extern __u32
tcp_init_cwnd(struct tcp_opt
*tp
);
3015 /* Slow start with delack produces 3 packets of burst, so that
3016 * it is safe "de facto".
3018 static __inline __u32
tcp_max_burst(struct tcp_opt
*tp
)
3023 static __inline__
int tcp_minshall_check(struct tcp_opt
*tp
)
3026 return after(tp
->snd_sml
,tp
->snd_una
) &&
3027 !after(tp
->snd_sml
, tp
->snd_nxt
);
3033 static __inline
void tcp_minshall_update(struct tcp_opt
*tp
, int mss
, struct sk_buff
*skb
)
/* Records the end sequence of the last transmitted small segment for
 * tcp_minshall_check().
 * NOTE(review): upstream linux-2.4 tcp.h guards this assignment with
 * "if (skb->len < mss)"; that condition line appears to have been lost
 * in this copy — confirm against the original before relying on the
 * unconditional form shown here. */
3037 tp
->snd_sml
= TCP_SKB_CB(skb
)->end_seq
;
3041 /* Return 0, if packet can be sent now without violation Nagle's rules:
3042 1. It is full sized.
3043 2. Or it contains FIN.
3044 3. Or TCP_NODELAY was set.
3045 4. Or TCP_CORK is not set, and all sent packets are ACKed.
3046 With Minshall's modification: all sent small packets are ACKed.
3050 tcp_nagle_check(struct tcp_opt
*tp
, struct sk_buff
*skb
, unsigned mss_now
, int nonagle
)
3053 return (skb
->len
< mss_now
&&
3054 !(TCP_SKB_CB(skb
)->flags
& TCPCB_FLAG_FIN
) &&
3058 tcp_minshall_check(tp
))));
3064 /* This checks if the data bearing packet SKB (usually tp->send_head)
3065 * should be put on the wire right now.
3067 static __inline
int tcp_snd_test(struct tcp_opt
*tp
, struct sk_buff
*skb
,
3068 unsigned cur_mss
, int nonagle
)
3071 /* RFC 1122 - section 4.2.3.4
3075 * a) The right edge of this frame exceeds the window
3076 * b) There are packets in flight and we have a small segment
3077 * [SWS avoidance and Nagle algorithm]
3078 * (part of SWS is done on packetization)
3079 * Minshall version sounds: there are no _small_
3080 * segments in flight. (tcp_nagle_check)
3081 * c) We have too many packets 'in flight'
3083 * Don't use the nagle rule for urgent data (or
3084 * for the final FIN -DaveM).
3086 * Also, Nagle rule does not apply to frames, which
3087 * sit in the middle of queue (they have no chances
3088 * to get new data) and if room at tail of skb is
3089 * not enough to save something seriously (<32 for now).
3092 /* Don't be strict about the congestion window for the
3093 * final FIN frame. -DaveM
3095 return ((nonagle
==1 || tp
->urg_mode
3096 || !tcp_nagle_check(tp
, skb
, cur_mss
, nonagle
)) &&
3097 ((tcp_packets_in_flight(tp
) < tp
->snd_cwnd
) ||
3098 (TCP_SKB_CB(skb
)->flags
& TCPCB_FLAG_FIN
)) &&
3099 !after(TCP_SKB_CB(skb
)->end_seq
, tp
->snd_una
+ tp
->snd_wnd
));
3105 static __inline
void tcp_check_probe_timer(struct sock
*sk
, struct tcp_opt
*tp
)
3108 if (!tp
->packets_out
&& !tp
->pending
)
3109 tcp_reset_xmit_timer(sk
, TCP_TIME_PROBE0
, tp
->rto
);
3113 static __inline
int tcp_skb_is_last(struct sock
*sk
, struct sk_buff
*skb
)
3116 return (skb
->next
== (struct sk_buff
*)&sk
->write_queue
);
3122 /* Push out any pending frames which were held back due to
3123 * TCP_CORK or attempt at coalescing tiny packets.
3124 * The socket must be locked by the caller.
3126 static __inline
void __tcp_push_pending_frames(struct sock
*sk
,
3132 struct sk_buff
*skb
= tp
->send_head
;
3135 if (!tcp_skb_is_last(sk
, skb
))
3137 if (!tcp_snd_test(tp
, skb
, cur_mss
, nonagle
) ||
3138 tcp_write_xmit(sk
, nonagle
))
3139 tcp_check_probe_timer(sk
, tp
);
3141 tcp_cwnd_validate(sk
, tp
);
3145 static __inline
void tcp_push_pending_frames(struct sock
*sk
,
3149 __tcp_push_pending_frames(sk
, tp
, tcp_current_mss(sk
), tp
->nonagle
);
3153 static __inline
int tcp_may_send_now(struct sock
*sk
, struct tcp_opt
*tp
)
3156 struct sk_buff
*skb
= tp
->send_head
;
3159 tcp_snd_test(tp
, skb
, tcp_current_mss(sk
),
3160 tcp_skb_is_last(sk
, skb
) ? 1 : tp
->nonagle
));
3166 static __inline
void tcp_init_wl(struct tcp_opt
*tp
, u32 ack
, u32 seq
)
3173 static __inline
void tcp_update_wl(struct tcp_opt
*tp
, u32 ack
, u32 seq
)
3180 extern void tcp_destroy_sock(struct sock
*sk
);
3184 * Calculate(/check) TCP checksum
3186 static __inline u16
tcp_v4_check(struct tcphdr
*th
, int len
,
3187 unsigned long saddr
, unsigned long daddr
,
3191 return csum_tcpudp_magic(saddr
,daddr
,len
,IPPROTO_TCP
,base
);
3197 static __inline
int __tcp_checksum_complete(struct sk_buff
*skb
)
3200 return (unsigned short)csum_fold(skb_checksum(skb
, 0, skb
->len
, skb
->csum
));
3206 static __inline
int tcp_checksum_complete(struct sk_buff
*skb
)
3209 return skb
->ip_summed
!= CHECKSUM_UNNECESSARY
&&
3210 __tcp_checksum_complete(skb
);
3216 /* Prequeue for VJ style copy to user, combined with checksumming. */
3218 static __inline
void tcp_prequeue_init(struct tcp_opt
*tp
)
3221 tp
->ucopy
.task
= NULL
;
3223 tp
->ucopy
.memory
= 0;
3224 skb_queue_head_init(&tp
->ucopy
.prequeue
);
3228 /* Packet is added to VJ-style prequeue for processing in process
3229 * context, if a reader task is waiting. Apparently, this exciting
3230 * idea (VJ's mail "Re: query about TCP header on tcp-ip" of 07 Sep 93)
3231 * failed somewhere. Latency? Burstiness? Well, at least now we will
3232 * see, why it failed. 8)8) --ANK
3234 * NOTE: is this not too big to inline?
3236 static __inline
int tcp_prequeue(struct sock
*sk
, struct sk_buff
*skb
)
3239 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
3241 if (tp
->ucopy
.task
) {
3242 __skb_queue_tail(&tp
->ucopy
.prequeue
, skb
);
3243 tp
->ucopy
.memory
+= skb
->truesize
;
3244 if (tp
->ucopy
.memory
> sk
->rcvbuf
) {
3245 struct sk_buff
*skb1
;
3250 while ((skb1
= __skb_dequeue(&tp
->ucopy
.prequeue
)) != NULL
) {
3251 sk
->backlog_rcv(sk
, skb1
);
3252 NET_INC_STATS_BH(TCPPrequeueDropped
);
3255 tp
->ucopy
.memory
= 0;
3256 } else if (skb_queue_len(&tp
->ucopy
.prequeue
) == 1) {
3257 wake_up_interruptible(sk
->sleep
);
3258 if (!tcp_ack_scheduled(tp
))
3259 tcp_reset_xmit_timer(sk
, TCP_TIME_DACK
, (3*TCP_RTO_MIN
)/4);
3273 static char *statename
[]={
3274 "Unused","Established","Syn Sent","Syn Recv",
3275 "Fin Wait 1","Fin Wait 2","Time Wait", "Close",
3276 "Close Wait","Last ACK","Listen","Closing"
3280 static __inline
void tcp_set_state(struct sock
*sk
, int state
)
3283 int oldstate
= sk
->state
;
3286 case TCP_ESTABLISHED
:
3287 if (oldstate
!= TCP_ESTABLISHED
)
3288 TCP_INC_STATS(TcpCurrEstab
);
3292 sk
->prot
->unhash(sk
);
3293 if (sk
->prev
&& !(sk
->userlocks
&SOCK_BINDPORT_LOCK
))
3297 if (oldstate
==TCP_ESTABLISHED
)
3298 tcp_statistics
[smp_processor_id()*2+!in_softirq()].TcpCurrEstab
--;
3301 /* Change state AFTER socket is unhashed to avoid closed
3302 * socket sitting in hash tables.
3307 SOCK_DEBUG(sk
, "TCP sk=%p, State %s -> %s\n",sk
, statename
[oldstate
],statename
[state
]);
3312 static __inline
void tcp_done(struct sock
*sk
)
3315 tcp_set_state(sk
, TCP_CLOSE
);
3316 tcp_clear_xmit_timers(sk
);
3318 sk
->shutdown
= SHUTDOWN_MASK
;
3321 sk
->state_change(sk
);
3323 tcp_destroy_sock(sk
);
3327 static __inline
void tcp_sack_reset(struct tcp_opt
*tp
)
3336 static __inline
void tcp_build_and_update_options(__u32
*ptr
, struct tcp_opt
*tp
, __u32 tstamp
)
3339 if (tp
->tstamp_ok
) {
3340 *ptr
++ = __constant_htonl((TCPOPT_NOP
<< 24) |
3341 (TCPOPT_NOP
<< 16) |
3342 (TCPOPT_TIMESTAMP
<< 8) |
3344 *ptr
++ = htonl(tstamp
);
3345 *ptr
++ = htonl(tp
->ts_recent
);
3347 if (tp
->eff_sacks
) {
3348 struct tcp_sack_block
*sp
= tp
->dsack
? tp
->duplicate_sack
: tp
->selective_acks
;
3351 *ptr
++ = __constant_htonl((TCPOPT_NOP
<< 24) |
3352 (TCPOPT_NOP
<< 16) |
3353 (TCPOPT_SACK
<< 8) |
3354 (TCPOLEN_SACK_BASE
+
3355 (tp
->eff_sacks
* TCPOLEN_SACK_PERBLOCK
)));
3356 for(this_sack
= 0; this_sack
< tp
->eff_sacks
; this_sack
++) {
3357 *ptr
++ = htonl(sp
[this_sack
].start_seq
);
3358 *ptr
++ = htonl(sp
[this_sack
].end_seq
);
3368 /* Construct a tcp options header for a SYN or SYN_ACK packet.
3369 * If this is every changed make sure to change the definition of
3370 * MAX_SYN_SIZE to match the new maximum number of options that you
3373 static __inline
void tcp_syn_build_options(__u32
*ptr
, int mss
, int ts
, int sack
,
3374 int offer_wscale
, int wscale
, __u32 tstamp
, __u32 ts_recent
)
3377 /* We always get an MSS option.
3378 * The option bytes which will be seen in normal data
3379 * packets should timestamps be used, must be in the MSS
3380 * advertised. But we subtract them from tp->mss_cache so
3381 * that calculations in tcp_sendmsg are simpler etc.
3382 * So account for this fact here if necessary. If we
3383 * don't do this correctly, as a receiver we won't
3384 * recognize data packets as being full sized when we
3385 * should, and thus we won't abide by the delayed ACK
3387 * SACKs don't matter, we never delay an ACK when we
3388 * have any of those going out.
3390 *ptr
++ = htonl((TCPOPT_MSS
<< 24) | (TCPOLEN_MSS
<< 16) | mss
);
3393 *ptr
++ = __constant_htonl((TCPOPT_SACK_PERM
<< 24) | (TCPOLEN_SACK_PERM
<< 16) |
3394 (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
);
3396 *ptr
++ = __constant_htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16) |
3397 (TCPOPT_TIMESTAMP
<< 8) | TCPOLEN_TIMESTAMP
);
3398 *ptr
++ = htonl(tstamp
); /* TSVAL */
3399 *ptr
++ = htonl(ts_recent
); /* TSECR */
3401 *ptr
++ = __constant_htonl((TCPOPT_NOP
<< 24) | (TCPOPT_NOP
<< 16) |
3402 (TCPOPT_SACK_PERM
<< 8) | TCPOLEN_SACK_PERM
);
3404 *ptr
++ = htonl((TCPOPT_NOP
<< 24) | (TCPOPT_WINDOW
<< 16) | (TCPOLEN_WINDOW
<< 8) | (wscale
));
3408 /* Determine a window scaling and initial window to offer.
3409 * Based on the assumption that the given amount of space
3410 * will be offered. Store the results in the tp structure.
3411 * NOTE: for smooth operation initial space offering should
3412 * be a multiple of mss if possible. We assume here that mss >= 1.
3413 * This MUST be enforced by all callers.
3415 static __inline
void tcp_select_initial_window(int __space
, __u32 mss
,
3417 __u32
*window_clamp
,
3422 unsigned int space
= (__space
< 0 ? 0 : __space
);
3424 /* If no clamp set the clamp to the max possible scaled window */
3425 if (*window_clamp
== 0)
3426 (*window_clamp
) = (65535 << 14);
3427 space
= min(*window_clamp
, space
);
3429 /* Quantize space offering to a multiple of mss if possible. */
3431 space
= (space
/ mss
) * mss
;
3433 /* NOTE: offering an initial window larger than 32767
3434 * will break some buggy TCP stacks. We try to be nice.
3435 * If we are not window scaling, then this truncates
3436 * our initial window offering to 32k. There should also
3437 * be a sysctl option to stop being nice.
3439 (*rcv_wnd
) = min(space
, MAX_TCP_WINDOW
);
3442 /* See RFC1323 for an explanation of the limit to 14 */
3443 while (space
> 65535 && (*rcv_wscale
) < 14) {
3447 if (*rcv_wscale
&& sysctl_tcp_app_win
&& space
>=mss
&&
3448 space
- max((space
>>sysctl_tcp_app_win
), mss
>>*rcv_wscale
) < 65536/2)
3452 /* Set initial window to value enough for senders,
3453 * following RFC1414. Senders, not following this RFC,
3454 * will be satisfied with 2.
3456 if (mss
> (1<<*rcv_wscale
)) {
3460 else if (mss
> 1460)
3462 if (*rcv_wnd
> init_cwnd
*mss
)
3463 *rcv_wnd
= init_cwnd
*mss
;
3465 /* Set the clamp no higher than max representable value */
3466 (*window_clamp
) = min(65535U << (*rcv_wscale
), *window_clamp
);
3470 static __inline
int tcp_win_from_space(int space
)
3473 return sysctl_tcp_adv_win_scale
<=0 ?
3474 (space
>>(-sysctl_tcp_adv_win_scale
)) :
3475 space
- (space
>>sysctl_tcp_adv_win_scale
);
3481 /* Note: caller must be prepared to deal with negative returns */
3482 static __inline
int tcp_space(struct sock
*sk
)
3485 return tcp_win_from_space(sk
->rcvbuf
- atomic_read(&sk
->rmem_alloc
));
3491 static __inline
int tcp_full_space( struct sock
*sk
)
3494 return tcp_win_from_space(sk
->rcvbuf
);
3500 static __inline
void tcp_acceptq_removed(struct sock
*sk
)
3507 static __inline
void tcp_acceptq_added(struct sock
*sk
)
3514 static __inline
int tcp_acceptq_is_full(struct sock
*sk
)
3517 return sk
->ack_backlog
> sk
->max_ack_backlog
;
3523 static __inline
void tcp_acceptq_queue(struct sock
*sk
, struct open_request
*req
,
3527 struct tcp_opt
*tp
= &sk
->tp_pinfo
.af_tcp
;
3530 tcp_acceptq_added(sk
);
3532 if (!tp
->accept_queue_tail
) {
3533 tp
->accept_queue
= req
;
3535 tp
->accept_queue_tail
->dl_next
= req
;
3537 tp
->accept_queue_tail
= req
;
3538 req
->dl_next
= NULL
;
3542 struct tcp_listen_opt
3544 u8 max_qlen_log
; /* log_2 of maximal queued SYNs */
3548 struct open_request
*syn_table
[TCP_SYNQ_HSIZE
];
3551 static __inline
void
3552 tcp_synq_removed(struct sock
*sk
, struct open_request
*req
)
3555 struct tcp_listen_opt
*lopt
= sk
->tp_pinfo
.af_tcp
.listen_opt
;
3557 if (--lopt
->qlen
== 0)
3558 tcp_delete_keepalive_timer(sk
);
3559 if (req
->retrans
== 0)
3564 static __inline
void tcp_synq_added(struct sock
*sk
)
3567 struct tcp_listen_opt
*lopt
= sk
->tp_pinfo
.af_tcp
.listen_opt
;
3569 if (lopt
->qlen
++ == 0)
3570 tcp_reset_keepalive_timer(sk
, TCP_TIMEOUT_INIT
);
3575 static __inline
int tcp_synq_len(struct sock
*sk
)
3578 return sk
->tp_pinfo
.af_tcp
.listen_opt
->qlen
;
3584 static __inline
int tcp_synq_young(struct sock
*sk
)
3587 return sk
->tp_pinfo
.af_tcp
.listen_opt
->qlen_young
;
3593 static __inline
int tcp_synq_is_full(struct sock
*sk
)
3596 return tcp_synq_len(sk
)>>sk
->tp_pinfo
.af_tcp
.listen_opt
->max_qlen_log
;
3602 static __inline
void tcp_synq_unlink(struct tcp_opt
*tp
, struct open_request
*req
,
3603 struct open_request
**prev
)
3606 write_lock(&tp
->syn_wait_lock
);
3607 *prev
= req
->dl_next
;
3608 write_unlock(&tp
->syn_wait_lock
);
3612 static __inline
void tcp_synq_drop(struct sock
*sk
, struct open_request
*req
,
3613 struct open_request
**prev
)
3616 tcp_synq_unlink(&sk
->tp_pinfo
.af_tcp
, req
, prev
);
3617 tcp_synq_removed(sk
, req
);
3618 tcp_openreq_free(req
);
3622 static __inline
void tcp_openreq_init(struct open_request
*req
,
3624 struct sk_buff
*skb
)
3627 req
->rcv_wnd
= 0; /* So that tcp_send_synack() knows! */
3628 req
->rcv_isn
= TCP_SKB_CB(skb
)->seq
;
3629 req
->mss
= tp
->mss_clamp
;
3630 req
->ts_recent
= tp
->saw_tstamp
? tp
->rcv_tsval
: 0;
3631 req
->tstamp_ok
= tp
->tstamp_ok
;
3632 req
->sack_ok
= tp
->sack_ok
;
3633 req
->snd_wscale
= tp
->snd_wscale
;
3634 req
->wscale_ok
= tp
->wscale_ok
;
3637 req
->rmt_port
= skb
->h
.th
->source
;
3641 #define TCP_MEM_QUANTUM ((int)PAGE_SIZE)
3643 static __inline
void tcp_free_skb(struct sock
*sk
, struct sk_buff
*skb
)
3646 sk
->tp_pinfo
.af_tcp
.queue_shrunk
= 1;
3647 sk
->wmem_queued
-= skb
->truesize
;
3648 sk
->forward_alloc
+= skb
->truesize
;
3653 static __inline
void tcp_charge_skb(struct sock
*sk
, struct sk_buff
*skb
)
3656 sk
->wmem_queued
+= skb
->truesize
;
3657 sk
->forward_alloc
-= skb
->truesize
;
3661 extern void __tcp_mem_reclaim(struct sock
*sk
);
3662 extern int tcp_mem_schedule(struct sock
*sk
, int size
, int kind
);
3664 static __inline
void tcp_mem_reclaim(struct sock
*sk
)
3667 if (sk
->forward_alloc
>= TCP_MEM_QUANTUM
)
3668 __tcp_mem_reclaim(sk
);
3672 static __inline
void tcp_enter_memory_pressure(void)
3675 if (!tcp_memory_pressure
) {
3676 NET_INC_STATS(TCPMemoryPressures
);
3677 tcp_memory_pressure
= 1;
3682 static __inline
void tcp_moderate_sndbuf(struct sock
*sk
)
3685 if (!(sk
->userlocks
&SOCK_SNDBUF_LOCK
)) {
3686 sk
->sndbuf
= min(sk
->sndbuf
, sk
->wmem_queued
/2);
3687 sk
->sndbuf
= max(sk
->sndbuf
, SOCK_MIN_SNDBUF
);
3692 static __inline
struct sk_buff
*tcp_alloc_pskb(struct sock
*sk
, int size
, int mem
, int gfp
)
3695 struct sk_buff
*skb
= alloc_skb(size
+MAX_TCP_HEADER
, gfp
);
3698 skb
->truesize
+= mem
;
3699 if (sk
->forward_alloc
>= (int)skb
->truesize
||
3700 tcp_mem_schedule(sk
, skb
->truesize
, 0)) {
3701 skb_reserve(skb
, MAX_TCP_HEADER
);
3706 tcp_enter_memory_pressure();
3707 tcp_moderate_sndbuf(sk
);
/* tcp_alloc_skb: convenience wrapper around tcp_alloc_pskb() charging
 * no extra memory.
 */
static __inline struct sk_buff *tcp_alloc_skb(struct sock *sk, int size, int gfp)
{
	return tcp_alloc_pskb(sk, size, 0, gfp);
}
3724 static __inline
struct page
* tcp_alloc_page(struct sock
*sk
)
3727 if (sk
->forward_alloc
>= (int)PAGE_SIZE
||
3728 tcp_mem_schedule(sk
, PAGE_SIZE
, 0)) {
3729 struct page
*page
= alloc_pages(sk
->allocation
, 0);
3733 tcp_enter_memory_pressure();
3734 tcp_moderate_sndbuf(sk
);
3741 static __inline
void tcp_writequeue_purge(struct sock
*sk
)
3744 struct sk_buff
*skb
;
3746 while ((skb
= __skb_dequeue(&sk
->write_queue
)) != NULL
)
3747 tcp_free_skb(sk
, skb
);
3748 tcp_mem_reclaim(sk
);
3752 extern void tcp_rfree(struct sk_buff
*skb
);
3754 static __inline
void tcp_set_owner_r(struct sk_buff
*skb
, struct sock
*sk
)
3758 skb
->destructor
= tcp_rfree
;
3759 atomic_add(skb
->truesize
, &sk
->rmem_alloc
);
3760 sk
->forward_alloc
-= skb
->truesize
;
3764 extern void tcp_listen_wlock(void);
3766 /* - We may sleep inside this lock.
3767 * - If sleeping is not required (or called from BH),
3768 * use plain read_(un)lock(&tcp_lhash_lock).
3771 static __inline
void tcp_listen_lock(void)
3774 /* read_lock synchronizes to candidates to writers */
3775 read_lock(&tcp_lhash_lock
);
3776 atomic_inc(&tcp_lhash_users
);
3777 read_unlock(&tcp_lhash_lock
);
3781 static __inline
void tcp_listen_unlock(void)
3784 if (atomic_dec_and_test(&tcp_lhash_users
))
3785 wake_up(&tcp_lhash_wait
);
3789 static __inline
int keepalive_intvl_when(struct tcp_opt
*tp
)
3792 return tp
->keepalive_intvl
? : sysctl_tcp_keepalive_intvl
;
3798 static __inline
int keepalive_time_when(struct tcp_opt
*tp
)
3801 return tp
->keepalive_time
? : sysctl_tcp_keepalive_time
;
3807 static __inline
int tcp_fin_time(struct tcp_opt
*tp
)
3810 int fin_timeout
= tp
->linger2
? : sysctl_tcp_fin_timeout
;
3812 if (fin_timeout
< (tp
->rto
<<2) - (tp
->rto
>>1))
3813 fin_timeout
= (tp
->rto
<<2) - (tp
->rto
>>1);
3821 static __inline
int tcp_paws_check(struct tcp_opt
*tp
, int rst
)
3824 if ((s32
)(tp
->rcv_tsval
- tp
->ts_recent
) >= 0)
3826 if (xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_24DAYS
)
3829 /* RST segments are not recommended to carry timestamp,
3830 and, if they do, it is recommended to ignore PAWS because
3831 "their cleanup function should take precedence over timestamps."
3832 Certainly, it is mistake. It is necessary to understand the reasons
3833 of this constraint to relax it: if peer reboots, clock may go
3834 out-of-sync and half-open connections will not be reset.
3835 Actually, the problem would be not existing if all
3836 the implementations followed draft about maintaining clock
3837 via reboots. Linux-2.2 DOES NOT!
3839 However, we can relax time bounds for RST segments to MSL.
3841 if (rst
&& xtime
.tv_sec
>= tp
->ts_recent_stamp
+ TCP_PAWS_MSL
)
3849 #define TCP_CHECK_TIMER(sk) do { } while (0)
3851 #endif /* __TCPCORE_H */