2 * Copyright (c) 1982, 1986, 1988, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * @(#)tcp_subr.c 8.1 (Berkeley) 6/10/93
36 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/malloc.h>
42 #include <sys/socket.h>
43 #include <sys/socketvar.h>
44 #include <sys/protosw.h>
45 #include <sys/errno.h>
46 #include <sys/queue.h>
48 #include <net/route.h>
51 #include <netinet/in.h>
52 #include <netinet/in_systm.h>
53 #include <netinet/ip.h>
54 #include <netinet/in_pcb.h>
55 #include <netinet/in_var.h>
56 #include <netinet/ip_var.h>
57 #include <netinet/ip_icmp.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
65 #include <netinet/tcp_debug.h>
69 /* patchable/settable parameters for tcp */
70 int tcp_mssdflt
= TCP_MSS
;
71 int tcp_rttdflt
= TCPTV_SRTTDFLT
/ PR_SLOWHZ
;
72 int tcp_do_rfc1323
= 1;
73 int tcp_do_rfc1644
= 1;
74 static void tcp_cleartaocache(void);
77 * Target size of TCP PCB hash table. Will be rounded down to a prime
81 #define TCBHASHSIZE 128
91 tcp_iss
= boottime
.tv_sec
; /* wrong */
95 tcbinfo
.listhead
= &tcb
;
96 tcbinfo
.hashbase
= phashinit(TCBHASHSIZE
, M_PCB
, &tcbinfo
.hashsize
);
97 if (max_protohdr
< sizeof(struct tcpiphdr
))
98 max_protohdr
= sizeof(struct tcpiphdr
);
99 if (max_linkhdr
+ sizeof(struct tcpiphdr
) > MHLEN
)
104 * Create template to be used to send tcp packets on a connection.
105 * Call after host entry created, allocates an mbuf and fills
106 * in a skeletal tcp/ip header, minimizing the amount of work
107 * necessary when the connection is used.
113 register struct inpcb
*inp
= tp
->t_inpcb
;
114 register struct mbuf
*m
;
115 register struct tcpiphdr
*n
;
117 if ((n
= tp
->t_template
) == 0) {
118 m
= m_get(M_DONTWAIT
, MT_HEADER
);
119 OS_DbgPrint(OSK_MID_TRACE
,("tp->t_template = %x\n", m
));
122 m
->m_len
= sizeof (struct tcpiphdr
);
123 n
= mtod(m
, struct tcpiphdr
*);
125 n
->ti_next
= n
->ti_prev
= 0;
127 n
->ti_pr
= IPPROTO_TCP
;
128 n
->ti_len
= htons(sizeof (struct tcpiphdr
) - sizeof (struct ip
));
129 n
->ti_src
= inp
->inp_laddr
;
130 OS_DbgPrint(OSK_MID_TRACE
,("INP_LADDR = %x\n", n
->ti_src
));
131 n
->ti_dst
= inp
->inp_faddr
;
132 n
->ti_sport
= inp
->inp_lport
;
133 n
->ti_dport
= inp
->inp_fport
;
146 * Send a single message to the TCP at address specified by
147 * the given TCP/IP header. If m == 0, then we make a copy
148 * of the tcpiphdr at ti and send directly to the addressed host.
149 * This is used to force keep alive messages out using the TCP
150 * template for a connection tp->t_template. If flags are given
151 * then we send a message back to the TCP which originated the
152 * segment ti, and discard the mbuf containing it and any other
155 * In any case the ack and sequence number of the transmitted
156 * segment are as specified by the parameters.
159 tcp_respond(tp
, ti
, m
, ack
, seq
, flags
)
161 register struct tcpiphdr
*ti
;
162 register struct mbuf
*m
;
168 struct route
*ro
= 0;
171 win
= sbspace(&tp
->t_inpcb
->inp_socket
->so_rcv
);
172 ro
= &tp
->t_inpcb
->inp_route
;
175 m
= m_gethdr(M_DONTWAIT
, MT_HEADER
);
183 m
->m_data
+= max_linkhdr
;
184 *mtod(m
, struct tcpiphdr
*) = *ti
;
185 ti
= mtod(m
, struct tcpiphdr
*);
190 m
->m_data
= (caddr_t
)ti
;
191 m
->m_len
= sizeof (struct tcpiphdr
);
193 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
194 xchg(ti
->ti_dst
.s_addr
, ti
->ti_src
.s_addr
, u_long
);
195 xchg(ti
->ti_dport
, ti
->ti_sport
, u_short
);
198 ti
->ti_len
= htons((u_short
)(sizeof (struct tcphdr
) + tlen
));
199 tlen
+= sizeof (struct tcpiphdr
);
201 m
->m_pkthdr
.len
= tlen
;
202 m
->m_pkthdr
.rcvif
= (struct ifnet
*) 0;
203 ti
->ti_next
= ti
->ti_prev
= 0;
205 ti
->ti_seq
= htonl(seq
);
206 ti
->ti_ack
= htonl(ack
);
208 ti
->ti_off
= sizeof (struct tcphdr
) >> 2;
209 ti
->ti_flags
= flags
;
211 ti
->ti_win
= htons((u_short
) (win
>> tp
->rcv_scale
));
213 ti
->ti_win
= htons((u_short
)win
);
216 ti
->ti_sum
= in_cksum(m
, tlen
);
217 ((struct ip
*)ti
)->ip_len
= tlen
;
218 ((struct ip
*)ti
)->ip_ttl
= ip_defttl
;
220 if (tp
== NULL
|| (tp
->t_inpcb
->inp_socket
->so_options
& SO_DEBUG
))
221 tcp_trace(TA_OUTPUT
, 0, tp
, ti
, 0);
223 (void) ip_output(m
, NULL
, ro
, 0, NULL
);
227 * Create a new TCP control block, making an
228 * empty reassembly queue and hooking it to the argument
229 * protocol control block.
235 register struct tcpcb
*tp
;
237 tp
= malloc(sizeof(*tp
), M_PCB
, M_NOWAIT
);
239 return ((struct tcpcb
*)0);
240 bzero((char *) tp
, sizeof(struct tcpcb
));
241 tp
->seg_next
= tp
->seg_prev
= (struct tcpiphdr
*)tp
;
242 tp
->t_maxseg
= tp
->t_maxopd
= tcp_mssdflt
;
245 tp
->t_flags
= (TF_REQ_SCALE
|TF_REQ_TSTMP
);
247 tp
->t_flags
|= TF_REQ_CC
;
250 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
251 * rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
252 * reasonable initial retransmit time.
254 tp
->t_srtt
= TCPTV_SRTTBASE
;
255 tp
->t_rttvar
= tcp_rttdflt
* PR_SLOWHZ
<< 2;
256 tp
->t_rttmin
= TCPTV_MIN
;
257 TCPT_RANGESET(tp
->t_rxtcur
,
258 ((TCPTV_SRTTBASE
>> 2) + (TCPTV_SRTTDFLT
<< 2)) >> 1,
259 TCPTV_MIN
, TCPTV_REXMTMAX
);
260 tp
->snd_cwnd
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
261 tp
->snd_ssthresh
= TCP_MAXWIN
<< TCP_MAX_WINSHIFT
;
262 inp
->inp_ip
.ip_ttl
= ip_defttl
;
263 inp
->inp_ppcb
= (caddr_t
)tp
;
268 * Drop a TCP connection, reporting
269 * the specified error. If connection is synchronized,
270 * then send a RST to peer.
274 register struct tcpcb
*tp
;
277 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
279 if (TCPS_HAVERCVDSYN(tp
->t_state
)) {
280 tp
->t_state
= TCPS_CLOSED
;
281 (void) tcp_output(tp
);
282 tcpstat
.tcps_drops
++;
284 tcpstat
.tcps_conndrops
++;
285 if (errno
== ETIMEDOUT
&& tp
->t_softerror
)
286 errno
= tp
->t_softerror
;
287 so
->so_error
= errno
;
288 return (tcp_close(tp
));
292 * Close a TCP control block:
293 * discard all space held by the tcp
294 * discard internet protocol block
295 * wake up any sleepers
299 register struct tcpcb
*tp
;
301 register struct tcpiphdr
*t
;
302 struct inpcb
*inp
= tp
->t_inpcb
;
303 struct socket
*so
= inp
->inp_socket
;
304 register struct mbuf
*m
;
306 register struct rtentry
*rt
;
309 * If we got enough samples through the srtt filter,
310 * save the rtt and rttvar in the routing entry.
311 * 'Enough' is arbitrarily defined as 16 samples.
312 * 16 samples is enough for the srtt filter to converge
313 * to within 5% of the correct value; fewer samples and
314 * we could save a very bogus rtt.
316 * Don't update the default route's characteristics and don't
317 * update anything that the user "locked".
319 if (tp
->t_rttupdated
>= 16 &&
320 (rt
= inp
->inp_route
.ro_rt
) &&
321 ((struct sockaddr_in
*)rt_key(rt
))->sin_addr
.s_addr
!= INADDR_ANY
) {
322 register u_long i
= 0;
324 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTT
) == 0) {
326 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTT_SCALE
));
327 if (rt
->rt_rmx
.rmx_rtt
&& i
)
329 * filter this update to half the old & half
330 * the new values, converting scale.
331 * See route.h and tcp_var.h for a
332 * description of the scaling constants.
335 (rt
->rt_rmx
.rmx_rtt
+ i
) / 2;
337 rt
->rt_rmx
.rmx_rtt
= i
;
338 tcpstat
.tcps_cachedrtt
++;
340 if ((rt
->rt_rmx
.rmx_locks
& RTV_RTTVAR
) == 0) {
342 (RTM_RTTUNIT
/ (PR_SLOWHZ
* TCP_RTTVAR_SCALE
));
343 if (rt
->rt_rmx
.rmx_rttvar
&& i
)
344 rt
->rt_rmx
.rmx_rttvar
=
345 (rt
->rt_rmx
.rmx_rttvar
+ i
) / 2;
347 rt
->rt_rmx
.rmx_rttvar
= i
;
348 tcpstat
.tcps_cachedrttvar
++;
351 * update the pipelimit (ssthresh) if it has been updated
352 * already or if a pipesize was specified & the threshold
353 * got below half the pipesize. I.e., wait for bad news
354 * before we start updating, then update on both good
357 if (((rt
->rt_rmx
.rmx_locks
& RTV_SSTHRESH
) == 0 &&
358 ((i
= tp
->snd_ssthresh
) != 0) && rt
->rt_rmx
.rmx_ssthresh
) ||
359 i
< (rt
->rt_rmx
.rmx_sendpipe
/ 2)) {
361 * convert the limit from user data bytes to
362 * packets then to packet data bytes.
364 i
= (i
+ tp
->t_maxseg
/ 2) / tp
->t_maxseg
;
367 i
*= (u_long
)(tp
->t_maxseg
+ sizeof (struct tcpiphdr
));
368 if (rt
->rt_rmx
.rmx_ssthresh
)
369 rt
->rt_rmx
.rmx_ssthresh
=
370 (rt
->rt_rmx
.rmx_ssthresh
+ i
) / 2;
372 rt
->rt_rmx
.rmx_ssthresh
= i
;
373 tcpstat
.tcps_cachedssthresh
++;
377 /* free the reassembly queue, if any */
379 while (t
!= (struct tcpiphdr
*)tp
) {
380 t
= (struct tcpiphdr
*)t
->ti_next
;
381 m
= REASS_MBUF((struct tcpiphdr
*)t
->ti_prev
);
386 (void) m_free(dtom(tp
->t_template
));
389 soisdisconnected(so
);
391 tcpstat
.tcps_closed
++;
392 return ((struct tcpcb
*)0);
402 * Notify a tcp user of an asynchronous error;
403 * store error as soft error, but wake up user
404 * (for now, won't do anything until can select for soft error).
407 tcp_notify(inp
, error
)
411 register struct tcpcb
*tp
= (struct tcpcb
*)inp
->inp_ppcb
;
412 register struct socket
*so
= inp
->inp_socket
;
415 * Ignore some errors if we are hooked up.
416 * If connection hasn't completed, has retransmitted several times,
417 * and receives a second error, give up now. This is better
418 * than waiting a long time to establish a connection that
419 * can never complete.
421 if (tp
->t_state
== TCPS_ESTABLISHED
&&
422 (error
== EHOSTUNREACH
|| error
== ENETUNREACH
||
423 error
== EHOSTDOWN
)) {
425 } else if (tp
->t_state
< TCPS_ESTABLISHED
&& tp
->t_rxtshift
> 3 &&
427 so
->so_error
= error
;
429 tp
->t_softerror
= error
;
430 wakeup( so
, (caddr_t
) &so
->so_timeo
);
436 tcp_ctlinput(cmd
, sa
, ip
)
439 register struct ip
*ip
;
441 register struct tcphdr
*th
;
442 void (*notify
) __P((struct inpcb
*, int)) = tcp_notify
;
444 if (cmd
== PRC_QUENCH
)
447 else if (cmd
== PRC_MSGSIZE
)
448 notify
= tcp_mtudisc
;
450 else if (!PRC_IS_REDIRECT(cmd
) &&
451 ((unsigned)cmd
> PRC_NCMDS
|| inetctlerrmap
[cmd
] == 0))
454 th
= (struct tcphdr
*)((caddr_t
)ip
+ (ip
->ip_hl
<< 2));
455 in_pcbnotify(&tcb
, sa
, th
->th_dport
, ip
->ip_src
, th
->th_sport
,
458 in_pcbnotify(&tcb
, sa
, 0, zeroin_addr
, 0, cmd
, notify
);
462 * When a source quench is received, close congestion window
463 * to one segment. We will gradually open it again as we proceed.
466 tcp_quench(inp
, errno
)
470 struct tcpcb
*tp
= intotcpcb(inp
);
473 tp
->snd_cwnd
= tp
->t_maxseg
;
478 * When `need fragmentation' ICMP is received, update our idea of the MSS
479 * based on the new value in the route. Also nudge TCP to send something,
480 * since we know the packet we just sent was dropped.
481 * This duplicates some code in the tcp_mss() function in tcp_input.c.
484 tcp_mtudisc(inp
, errno
)
488 struct tcpcb
*tp
= intotcpcb(inp
);
490 struct rmxp_tao
*taop
;
491 struct socket
*so
= inp
->inp_socket
;
496 rt
= tcp_rtlookup(inp
);
497 if (!rt
|| !rt
->rt_rmx
.rmx_mtu
) {
498 tp
->t_maxopd
= tp
->t_maxseg
= tcp_mssdflt
;
501 taop
= rmx_taop(rt
->rt_rmx
);
502 offered
= taop
->tao_mssopt
;
503 mss
= rt
->rt_rmx
.rmx_mtu
- sizeof(struct tcpiphdr
);
505 mss
= min(mss
, offered
);
507 * XXX - The above conditional probably violates the TCP
508 * spec. The problem is that, since we don't know the
509 * other end's MSS, we are supposed to use a conservative
510 * default. But, if we do that, then MTU discovery will
511 * never actually take place, because the conservative
512 * default is much less than the MTUs typically seen
513 * on the Internet today. For the moment, we'll sweep
514 * this under the carpet.
516 * The conservative default might not actually be a problem
517 * if the only case this occurs is when sending an initial
518 * SYN with options and data to a host we've never talked
519 * to before. Then, they will reply with an MSS value which
520 * will get recorded and the new parameters should get
521 * recomputed. For Further Study.
523 if (tp
->t_maxopd
<= mss
)
527 if ((tp
->t_flags
& (TF_REQ_TSTMP
|TF_NOOPT
)) == TF_REQ_TSTMP
&&
528 (tp
->t_flags
& TF_RCVD_TSTMP
) == TF_RCVD_TSTMP
)
529 mss
-= TCPOLEN_TSTAMP_APPA
;
530 if ((tp
->t_flags
& (TF_REQ_CC
|TF_NOOPT
)) == TF_REQ_CC
&&
531 (tp
->t_flags
& TF_RCVD_CC
) == TF_RCVD_CC
)
532 mss
-= TCPOLEN_CC_APPA
;
533 #if (MCLBYTES & (MCLBYTES - 1)) == 0
535 mss
&= ~(MCLBYTES
-1);
538 mss
= mss
/ MCLBYTES
* MCLBYTES
;
540 if (so
->so_snd
.sb_hiwat
< mss
)
541 mss
= so
->so_snd
.sb_hiwat
;
545 tcpstat
.tcps_mturesent
++;
547 tp
->snd_nxt
= tp
->snd_una
;
554 * Look-up the routing entry to the peer of this inpcb. If no route
555 * is found and it cannot be allocated then return NULL. This routine
556 * is called by TCP routines that access the rmx structure and by tcp_mss
557 * to get the interface MTU.
566 ro
= &inp
->inp_route
;
568 if (rt
== NULL
|| !(rt
->rt_flags
& RTF_UP
)) {
569 /* No route yet, so try to acquire one */
570 if (inp
->inp_faddr
.s_addr
!= INADDR_ANY
) {
571 ro
->ro_dst
.sa_family
= AF_INET
;
572 ro
->ro_dst
.sa_len
= sizeof(ro
->ro_dst
);
573 ((struct sockaddr_in
*) &ro
->ro_dst
)->sin_addr
=
583 * Return a pointer to the cached information about the remote host.
584 * The cached information is stored in the protocol specific part of
591 struct rtentry
*rt
= tcp_rtlookup(inp
);
593 /* Make sure this is a host route and is up. */
595 (rt
->rt_flags
& (RTF_UP
|RTF_HOST
)) != (RTF_UP
|RTF_HOST
))
598 return rmx_taop(rt
->rt_rmx
);
602 * Clear all the TAO cache entries, called from tcp_init.
605 * This routine is just an empty one, because we assume that the routing
606 * tables are initialized at the same time as TCP, so there is
607 * nothing in the cache left over.
610 tcp_cleartaocache(void)