e3557eea13e4a76e9f3849c33bfe4bc541d76aba
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93
36 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/protosw.h>
45 #include <sys/errno.h>
46 #include <sys/queue.h>
48 #include <net/route.h>
51 #include <netinet/in.h>
52 #include <netinet/in_systm.h>
53 #include <netinet/ip.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/in_var.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/ip_icmp.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
65 #include <netinet/tcp_debug.h>
69 /* patchable/settable parameters for tcp */
/*
 * Default maximum segment size, initialised from TCP_MSS.  Elsewhere in
 * this file it is the value assigned to t_maxseg and t_maxopd.
 */
70 int tcp_mssdflt
= TCP_MSS
;
/*
 * Default round-trip time: TCPTV_SRTTDFLT divided by PR_SLOWHZ.  It is
 * multiplied back by PR_SLOWHZ (then shifted left by 2) when t_rttvar is
 * seeded for a new control block.
 * NOTE(review): if both macros are integer constants this division
 * truncates -- confirm against their definitions.
 */
71 int tcp_rttdflt
= TCPTV_SRTTDFLT
/ PR_SLOWHZ
;
/*
 * Set to 1 by default.  Presumably gates the RFC 1323 options
 * (TF_REQ_SCALE / TF_REQ_TSTMP); the test that reads it is not visible in
 * this listing -- confirm.
 */
72 int tcp_do_rfc1323
= 1;
/*
 * Set to 1 by default.  Presumably gates the RFC 1644 option (TF_REQ_CC);
 * the test that reads it is not visible in this listing -- confirm.
 */
73 int tcp_do_rfc1644
= 1;
/* Forward declaration of the file-local routine defined at the end of the file. */
74 static void tcp_cleartaocache(void);
77 * Target size of TCP PCB hash table. Will be rounded down to a prime number.
81 #define TCBHASHSIZE 128
91 tcp_iss
= boottime
.tv_sec
; /* wrong */
95 tcbinfo
.listhead
= &tcb
;
96 tcbinfo
.hashbase
= phashinit(TCBHASHSIZE
, M_PCB
, &tcbinfo
.hashsize
);
97 if (max_protohdr
< sizeof(struct tcpiphdr
))
98 max_protohdr
= sizeof(struct tcpiphdr
);
99 if (max_linkhdr
+ sizeof(struct tcpiphdr
) > MHLEN
)
104 * Create template to be used to send tcp packets on a connection.
105 * Call after host entry created, allocates an mbuf and fills
106 * in a skeletal tcp/ip header, minimizing the amount of work
107 * necessary when the connection is used.
113 register struct inpcb
*inp
= tp
->t_inpcb
;
114 register struct mbuf
*m
;
115 register struct tcpiphdr
*n
;
117 if ((n
= tp
->t_template
) == 0) {
118 m
= m_get(M_DONTWAIT
, MT_HEADER
);
119 OS_DbgPrint(OSK_MID_TRACE
,("tp->t_template = %x\n", m
));
122 m
->m_len
= sizeof (struct tcpiphdr
);
123 n
= mtod(m
, struct tcpiphdr
*);
125 n
->ti_next
= n
->ti_prev
= 0;
127 n
->ti_pr
= IPPROTO_TCP
;
128 n
->ti_len
= htons(sizeof (struct tcpiphdr
) - sizeof (struct ip
));
129 n
->ti_src
= inp
->inp_laddr
;
130 OS_DbgPrint(OSK_MID_TRACE
,("INP_LADDR = %x\n", n
->ti_src
));
131 n
->ti_dst
= inp
->inp_faddr
;
132 n
->ti_sport
= inp
->inp_lport
;
133 n
->ti_dport
= inp
->inp_fport
;
146 * Send a single message to the TCP at address specified by
147 * the given TCP/IP header. If m == 0, then we make a copy
148 * of the tcpiphdr at ti and send directly to the addressed host.
149 * This is used to force keep alive messages out using the TCP
150 * template for a connection tp->t_template. If flags are given
151 * then we send a message back to the TCP which originated the
152 * segment ti, and discard the mbuf containing it and any other
155 * In any case the ack and sequence number of the transmitted
156 * segment are as specified by the parameters.
159 tcp_respond(tp
, ti
, m
, ack
, seq
, flags
)
161 register struct tcpiphdr
*ti
;
162 register struct mbuf
*m
;
169 struct route
*ro
= 0;
172 win
= sbspace(&tp
->t_inpcb
->inp_socket
->so_rcv
);
173 ro
= &tp
->t_inpcb
->inp_route
;
176 m
= m_gethdr(M_DONTWAIT
, MT_HEADER
);
184 m
->m_data
+= max_linkhdr
;
185 *mtod(m
, struct tcpiphdr
*) = *ti
;
186 ti
= mtod(m
, struct tcpiphdr
*);
191 m
->m_data
= (caddr_t
)ti
;
192 m
->m_len
= sizeof (struct tcpiphdr
);
/*
 * Swap the two lvalues `a' and `b', both of type `type'.
 * The body is wrapped in do { } while (0) so that `xchg(...);' expands to
 * exactly one statement and stays valid inside an unbraced if/else.
 * Arguments are parenthesised on expansion, and the temporary has a name
 * unlikely to clash with anything in the caller's arguments.
 * `a' and `b' are each evaluated twice, so they must be free of side effects.
 */
#define xchg(a,b,type) do { type xchg_tmp_ = (a); (a) = (b); (b) = xchg_tmp_; } while (0)
195 xchg(ti
->ti_dst
.s_addr
, ti
->ti_src
.s_addr
, u_long
);
196 xchg(ti
->ti_dport
, ti
->ti_sport
, u_short
);
199 ti
->ti_len
= htons((u_short
)(sizeof (struct tcphdr
) + tlen
));
200 tlen
+= sizeof (struct tcpiphdr
);
202 m
->m_pkthdr
.len
= tlen
;
203 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
204 ti
->ti_next
= ti
->ti_prev
= 0;
206 ti
->ti_seq
= htonl(seq
);
207 ti
->ti_ack
= htonl(ack
);
209 ti
->ti_off
= sizeof (struct tcphdr
) >> 2;
210 ti
->ti_flags
= flags
;
212 ti
->ti_win
= htons((u_short
) (win
>> tp
->rcv_scale
));
214 ti
->ti_win
= htons((u_short
)win
);
217 ti
->ti_sum
= in_cksum(m
, tlen
);
218 ((struct ip
*)ti
)->ip_len
= tlen
;
219 ((struct ip
*)ti
)->ip_ttl
= ip_defttl
;
221 if (tp
== NULL
|| (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
))
222 tcp_trace(TA_OUTPUT
, 0, tp
, ti
, 0);
224 (void) ip_output(m
, NULL
, ro
, 0, NULL
);
226 /* We allocated m, so we are responsible for freeing it. If the mbuf
227 contains a pointer to an external datablock, we (or rather, m_copy)
228 didn't allocate it but pointed it to the data to send. So we have
229 to cheat a little bit and keep M_FREE from freeing the external
232 m
->m_flags
&= ~M_EXT
;
240 * Create a new TCP control block, making an
241 * empty reassembly queue and hooking it to the argument
242 * protocol control block.
248 register struct tcpcb
*tp
;
250 tp
= malloc(sizeof(*tp
), M_PCB
, M_NOWAIT
);
252 return ((struct tcpcb
*)0);
253 bzero((char *) tp
, sizeof(struct tcpcb
));
254 tp
->seg_next
= tp
->seg_prev
= (struct tcpiphdr
*)tp
;
255 tp
->t_maxseg
= tp
->t_maxopd
= tcp_mssdflt
;
258 tp
->t_flags
= (TF_REQ_SCALE
|TF_REQ_TSTMP
);
260 tp
->t_flags
|= TF_REQ_CC
;
263 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
264 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
265 * reasonable initial retransmit time.
267 tp
->t_srtt
= TCPTV_SRTTBASE
;
268 tp
->t_rttvar
= tcp_rttdflt
* PR_SLOWHZ
<< 2;
269 tp
->t_rttmin
= TCPTV_MIN
;
270 TCPT_RANGESET(tp
->t_rxtcur
,
271 ((TCPTV_SRTTBASE
>> 2) + (TCPTV_SRTTDFLT
<< 2)) >> 1,
272 TCPTV_MIN
, TCPTV_REXMTMAX
);
273 tp
->snd_cwnd
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
274 tp
->snd_ssthresh
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
275 inp
->inp_ip
.ip_ttl
= ip_defttl
;
276 inp
->inp_ppcb
= (caddr_t
)tp
;
281 * Drop a TCP connection, reporting
282 * the specified error. If connection is synchronized,
283 * then send a RST to peer.
287 register struct tcpcb
*tp
;
290 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
292 if (TCPS_HAVERCVDSYN(tp
->t_state
)) {
293 tp
->t_state
= TCPS_CLOSED
;
294 (void) tcp_output(tp
);
295 tcpstat
.tcps_drops
++;
297 tcpstat
.tcps_conndrops
++;
298 if (errno
== ETIMEDOUT
&& tp
->t_softerror
)
299 errno
= tp
->t_softerror
;
300 so
->so_error
= errno
;
301 return (tcp_close(tp
));
305 * Close a TCP control block:
306 * discard all space held by the tcp
307 * discard internet protocol block
308 * wake up any sleepers
312 register struct tcpcb
*tp
;
314 register struct tcpiphdr
*t
;
315 struct inpcb
*inp
= tp
->t_inpcb
;
316 struct socket
*so
= inp
->inp_socket
;
317 register struct mbuf
*m
;
319 register struct rtentry
*rt
;
322 * If we got enough samples through the srtt filter,
323 * save the rtt and rttvar in the routing entry.
324 * 'Enough' is arbitrarily defined as 16 samples.
325 * 16 samples is enough for the srtt filter to converge
326 * to within 5% of the correct value; fewer samples and
327 * we could save a very bogus rtt.
329 * Don't update the default route's characteristics and don't
330 * update anything that the user "locked".
332 if (tp
->t_rttupdated
>= 16 &&
333 (rt
= inp
->inp_route
.ro_rt
) &&
334 ((struct sockaddr_in
*)rt_key(rt
))->sin_addr
.s_addr
!= INADDR_ANY
) {
335 register u_long i
= 0;
337 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTT
) == 0) {
339 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTT_SCALE
));
340 if (rt
->rt_rmx
.rmx_rtt
&& i
)
342 * filter this update to half the old & half
343 * the new values, converting scale.
344 * See route.h and tcp_var.h for a
345 * description of the scaling constants.
348 (rt
->rt_rmx
.rmx_rtt
+ i
) / 2;
350 rt
->rt_rmx
.rmx_rtt
= i
;
351 tcpstat
.tcps_cachedrtt
++;
353 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTTVAR
) == 0) {
355 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTTVAR_SCALE
));
356 if (rt
->rt_rmx
.rmx_rttvar
&& i
)
357 rt
->rt_rmx
.rmx_rttvar
=
358 (rt
->rt_rmx
.rmx_rttvar
+ i
) / 2;
360 rt
->rt_rmx
.rmx_rttvar
= i
;
361 tcpstat
.tcps_cachedrttvar
++;
364 * update the pipelimit (ssthresh) if it has been updated
365 * already or if a pipesize was specified & the threshold
366 * got below half the pipesize. I.e., wait for bad news
367 * before we start updating, then update on both good and bad news.
370 if (((rt
->rt_rmx
.rmx_locks
& RTV_SSTHRESH
) == 0 &&
371 ((i
= tp
->snd_ssthresh
) != 0) && rt
->rt_rmx
.rmx_ssthresh
) ||
372 i
< (rt
->rt_rmx
.rmx_sendpipe
/ 2)) {
374 * convert the limit from user data bytes to
375 * packets then to packet data bytes.
377 i
= (i
+ tp
->t_maxseg
/ 2) / tp
->t_maxseg
;
380 i
*= (u_long
)(tp
->t_maxseg
+ sizeof (struct tcpiphdr
));
381 if (rt
->rt_rmx
.rmx_ssthresh
)
382 rt
->rt_rmx
.rmx_ssthresh
=
383 (rt
->rt_rmx
.rmx_ssthresh
+ i
) / 2;
385 rt
->rt_rmx
.rmx_ssthresh
= i
;
386 tcpstat
.tcps_cachedssthresh
++;
390 /* free the reassembly queue, if any */
392 while (t
!= (struct tcpiphdr
*)tp
) {
393 t
= (struct tcpiphdr
*)t
->ti_next
;
394 m
= REASS_MBUF((struct tcpiphdr
*)t
->ti_prev
);
399 (void) m_free(dtom(tp
->t_template
));
402 soisdisconnected(so
);
404 tcpstat
.tcps_closed
++;
405 return ((struct tcpcb
*)0);
415 * Notify a tcp user of an asynchronous error;
416 * store error as soft error, but wake up user
417 * (for now, won't do anything until can select for soft error).
420 tcp_notify(inp
, error
)
424 register struct tcpcb
*tp
= (struct tcpcb
*)inp
->inp_ppcb
;
425 register struct socket
*so
= inp
->inp_socket
;
428 * Ignore some errors if we are hooked up.
429 * If connection hasn't completed, has retransmitted several times,
430 * and receives a second error, give up now. This is better
431 * than waiting a long time to establish a connection that
432 * can never complete.
434 if (tp
->t_state
== TCPS_ESTABLISHED
&&
435 (error
== EHOSTUNREACH
|| error
== ENETUNREACH
||
436 error
== EHOSTDOWN
)) {
438 } else if (tp
->t_state
< TCPS_ESTABLISHED
&& tp
->t_rxtshift
> 3 &&
440 so
->so_error
= error
;
442 tp
->t_softerror
= error
;
443 wakeup( so
, (caddr_t
) &so
->so_timeo
);
449 tcp_ctlinput(cmd
, sa
, ip
)
452 register struct ip
*ip
;
454 register struct tcphdr
*th
;
455 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
457 if (cmd
== PRC_QUENCH
)
460 else if (cmd
== PRC_MSGSIZE
)
461 notify
= tcp_mtudisc
;
463 else if (!PRC_IS_REDIRECT(cmd
) &&
464 ((unsigned)cmd
> PRC_NCMDS
|| inetctlerrmap
[cmd
] == 0))
467 th
= (struct tcphdr
*)((caddr_t
)ip
+ (ip
->ip_hl
<< 2));
468 in_pcbnotify(&tcb
, sa
, th
->th_dport
, ip
->ip_src
, th
->th_sport
,
471 in_pcbnotify(&tcb
, sa
, 0, zeroin_addr
, 0, cmd
, notify
);
475 * When a source quench is received, close congestion window
476 * to one segment. We will gradually open it again as we proceed.
479 tcp_quench(inp
, errno
)
483 struct tcpcb
*tp
= intotcpcb(inp
);
486 tp
->snd_cwnd
= tp
->t_maxseg
;
491 * When `need fragmentation' ICMP is received, update our idea of the MSS
492 * based on the new value in the route. Also nudge TCP to send something,
493 * since we know the packet we just sent was dropped.
494 * This duplicates some code in the tcp_mss() function in tcp_input.c.
497 tcp_mtudisc(inp
, errno
)
501 struct tcpcb
*tp
= intotcpcb(inp
);
503 struct rmxp_tao
*taop
;
504 struct socket
*so
= inp
->inp_socket
;
509 rt
= tcp_rtlookup(inp
);
510 if (!rt
|| !rt
->rt_rmx
.rmx_mtu
) {
511 tp
->t_maxopd
= tp
->t_maxseg
= tcp_mssdflt
;
514 taop
= rmx_taop(rt
->rt_rmx
);
515 offered
= taop
->tao_mssopt
;
516 mss
= rt
->rt_rmx
.rmx_mtu
- sizeof(struct tcpiphdr
);
518 mss
= min(mss
, offered
);
520 * XXX - The above conditional probably violates the TCP
521 * spec. The problem is that, since we don't know the
522 * other end's MSS, we are supposed to use a conservative
523 * default. But, if we do that, then MTU discovery will
524 * never actually take place, because the conservative
525 * default is much less than the MTUs typically seen
526 * on the Internet today. For the moment, we'll sweep
527 * this under the carpet.
529 * The conservative default might not actually be a problem
530 * if the only case this occurs is when sending an initial
531 * SYN with options and data to a host we've never talked
532 * to before. Then, they will reply with an MSS value which
533 * will get recorded and the new parameters should get
534 * recomputed. For Further Study.
536 if (tp
->t_maxopd
<= mss
)
540 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_NOOPT
)) == TF_REQ_TSTMP
&&
541 (tp
->t_flags
& TF_RCVD_TSTMP
) == TF_RCVD_TSTMP
)
542 mss
-= TCPOLEN_TSTAMP_APPA
;
543 if ((tp
->t_flags
& (TF_REQ_CC
|TF_NOOPT
)) == TF_REQ_CC
&&
544 (tp
->t_flags
& TF_RCVD_CC
) == TF_RCVD_CC
)
545 mss
-= TCPOLEN_CC_APPA
;
546 #if (MCLBYTES & (MCLBYTES - 1)) == 0
548 mss
&= ~(MCLBYTES
-1);
551 mss
= mss
/ MCLBYTES
* MCLBYTES
;
553 if (so
->so_snd
.sb_hiwat
< mss
)
554 mss
= so
->so_snd
.sb_hiwat
;
558 tcpstat
.tcps_mturesent
++;
560 tp
->snd_nxt
= tp
->snd_una
;
567 * Look-up the routing entry to the peer of this inpcb. If no route
568 * is found and it cannot be allocated then return NULL. This routine
569 * is called by TCP routines that access the rmx structure and by tcp_mss
570 * to get the interface MTU.
579 ro
= &inp
->inp_route
;
581 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
582 /* No route yet, so try to acquire one */
583 if (inp
->inp_faddr
.s_addr
!= INADDR_ANY
) {
584 ro
->ro_dst
.sa_family
= AF_INET
;
585 ro
->ro_dst
.sa_len
= sizeof(ro
->ro_dst
);
586 ((struct sockaddr_in
*) &ro
->ro_dst
)->sin_addr
=
596 * Return a pointer to the cached information about the remote host.
597 * The cached information is stored in the protocol specific part of
604 struct rtentry
*rt
= tcp_rtlookup(inp
);
606 /* Make sure this is a host route and is up. */
608 (rt
->rt_flags
& (RTF_UP
|RTF_HOST
)) != (RTF_UP
|RTF_HOST
))
611 return rmx_taop(rt
->rt_rmx
);
615 * Clear all the TAO cache entries, called from tcp_init.
618 * This routine is just an empty one, because we assume that the
619 * routing tables are initialized at the same time as TCP, so there is
620 * nothing in the cache left over.
623 tcp_cleartaocache(void)