2 * Copyright (c) 1982, 1986, 1988, 1993
3 * The Regents of the University of California. All rights reserved.
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/kernel.h>
39 #include <sys/malloc.h>
41 #include <sys/socket.h>
42 #include <sys/socketvar.h>
43 #include <sys/protosw.h>
44 #include <sys/errno.h>
47 #include <sys/sysctl.h>
50 #include <net/route.h>
52 #include <netinet/in.h>
53 #include <netinet/in_systm.h>
54 #include <netinet/ip.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_var.h>
57 #include <netinet/ip_var.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_fsm.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_timer.h>
62 #include <netinet/tcp_var.h>
63 #include <netinet/tcpip.h>
65 #include <netinet/tcp_debug.h>
69 * TCP protocol interface to socket abstraction.
71 extern char *tcpstates
[];
74 * Process a TCP user request for TCP tb. If this is a send request
75 * then m is the mbuf chain of send data. If this is a timer expiration
76 * (called from the software clock routine), then timertype tells which timer.
80 tcp_usrreq(so
, req
, m
, nam
, control
)
83 struct mbuf
*m
, *nam
, *control
;
85 register struct inpcb
*inp
;
86 register struct tcpcb
*tp
= 0;
87 struct sockaddr_in
*sinp
;
94 if (req
== PRU_CONTROL
)
95 return (in_control(so
, (u_long
)m
, (caddr_t
)nam
,
96 (struct ifnet
*)control
));
97 if (control
&& control
->m_len
) {
107 * When a TCP is attached to a socket, then there will be
108 * a (struct inpcb) pointed at by the socket, and this
109 * structure will point at a subsidiary (struct tcpcb).
111 if (inp
== 0 && req
!= PRU_ATTACH
) {
115 * The following corrects an mbuf leak under rare
116 * circumstances, but has not been fully tested.
118 if (m
&& req
!= PRU_SENSE
)
121 /* safer version of fix for mbuf leak */
122 if (m
&& (req
== PRU_SEND
|| req
== PRU_SENDOOB
))
125 return (EINVAL
); /* XXX */
129 /* WHAT IF TP IS 0? */
131 tcp_acounts
[tp
->t_state
][req
]++;
134 ostate
= tp
->t_state
;
139 #endif /* TCPDEBUG */
144 * TCP attaches to socket via PRU_ATTACH, reserving space,
145 * and an internet control block.
152 error
= tcp_attach(so
);
155 if ((so
->so_options
& SO_LINGER
) && so
->so_linger
== 0)
156 so
->so_linger
= TCP_LINGERTIME
* hz
;
161 * PRU_DETACH detaches the TCP protocol from the socket.
162 * If the protocol state is non-embryonic, then can't
163 * do this directly: have to initiate a PRU_DISCONNECT,
164 * which may finish later; embryonic TCB's can just
168 if (tp
->t_state
> TCPS_LISTEN
)
169 tp
= tcp_disconnect(tp
);
175 * Give the socket an address.
179 * Must check for multicast addresses and disallow binding
182 sinp
= mtod(nam
, struct sockaddr_in
*);
183 if (sinp
->sin_family
== AF_INET
&&
184 IN_MULTICAST(ntohl(sinp
->sin_addr
.s_addr
))) {
185 error
= EAFNOSUPPORT
;
188 error
= in_pcbbind(inp
, nam
);
194 * Prepare to accept connections.
197 if (inp
->inp_lport
== 0)
198 error
= in_pcbbind(inp
, NULL
);
200 tp
->t_state
= TCPS_LISTEN
;
204 * Initiate connection to peer.
205 * Create a template for use in transmissions on this connection.
206 * Enter SYN_SENT state, and mark socket as connecting.
207 * Start keep-alive timer, and seed output sequence space.
208 * Send initial segment on connection.
212 * Must disallow TCP ``connections'' to multicast addresses.
214 sinp
= mtod(nam
, struct sockaddr_in
*);
215 if (sinp
->sin_family
== AF_INET
216 && IN_MULTICAST(ntohl(sinp
->sin_addr
.s_addr
))) {
217 error
= EAFNOSUPPORT
;
221 if ((error
= tcp_connect(tp
, nam
)) != 0)
223 error
= tcp_output(tp
);
227 * Create a TCP connection between two sockets.
234 * Initiate disconnect from peer.
235 * If connection never passed embryonic stage, just drop;
236 * else if don't need to let data drain, then can just drop anyways,
237 * else have to begin TCP shutdown process: mark socket disconnecting,
238 * drain unread data, state switch to reflect user close, and
239 * send segment (e.g. FIN) to peer. Socket will be really disconnected
240 * when peer sends FIN and acks ours.
242 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
245 tp
= tcp_disconnect(tp
);
249 * Accept a connection. Essentially all the work is
250 * done at higher levels; just return the address
251 * of the peer, storing through addr.
254 in_setpeeraddr(inp
, nam
);
258 * Mark the connection as being incapable of further output.
262 tp
= tcp_usrclosed(tp
);
264 error
= tcp_output(tp
);
268 * After a receive, possibly send window update to peer.
271 (void) tcp_output(tp
);
275 * Do a send by putting data in output queue and updating urgent
276 * marker if URG set. Possibly send more data.
280 sbappend(&so
->so_snd
, m
);
281 if (nam
&& tp
->t_state
< TCPS_SYN_SENT
) {
283 * Do implied connect if not yet connected,
284 * initialize window to default value, and
285 * initialize maxseg/maxopd using peer's cached
288 error
= tcp_connect(tp
, nam
);
291 tp
->snd_wnd
= TTCP_CLIENT_SND_WND
;
295 if (req
== PRU_SEND_EOF
) {
297 * Close the send side of the connection after
301 tp
= tcp_usrclosed(tp
);
304 error
= tcp_output(tp
);
311 tp
= tcp_drop(tp
, ECONNABORTED
);
315 ((struct stat
*) m
)->st_blksize
= so
->so_snd
.sb_hiwat
;
320 if ((so
->so_oobmark
== 0 &&
321 (so
->so_state
& SS_RCVATMARK
) == 0) ||
322 so
->so_options
& SO_OOBINLINE
||
323 tp
->t_oobflags
& TCPOOB_HADDATA
) {
327 if ((tp
->t_oobflags
& TCPOOB_HAVEDATA
) == 0) {
332 *mtod(m
, caddr_t
) = tp
->t_iobc
;
333 if (((int)nam
& MSG_PEEK
) == 0)
334 tp
->t_oobflags
^= (TCPOOB_HAVEDATA
| TCPOOB_HADDATA
);
338 if (sbspace(&so
->so_snd
) < -512) {
344 * According to RFC961 (Assigned Protocols),
345 * the urgent pointer points to the last octet
346 * of urgent data. We continue, however,
347 * to consider it to indicate the first octet
348 * of data past the urgent section.
349 * Otherwise, snd_up should be one lower.
351 sbappend(&so
->so_snd
, m
);
352 if (nam
&& tp
->t_state
< TCPS_SYN_SENT
) {
354 * Do implied connect if not yet connected,
355 * initialize window to default value, and
356 * initialize maxseg/maxopd using peer's cached
359 error
= tcp_connect(tp
, nam
);
362 tp
->snd_wnd
= TTCP_CLIENT_SND_WND
;
365 tp
->snd_up
= tp
->snd_una
+ so
->so_snd
.sb_cc
;
367 error
= tcp_output(tp
);
372 in_setsockaddr(inp
, nam
);
376 in_setpeeraddr(inp
, nam
);
380 * TCP slow timer went off; going through this
381 * routine for tracing's sake.
384 tp
= tcp_timers(tp
, (int)nam
);
386 req
|= (int)nam
<< 8; /* for debug's sake */
394 if (tp
&& (so
->so_options
& SO_DEBUG
))
395 tcp_trace(TA_USER
, ostate
, tp
, (struct tcpiphdr
*)0, req
);
402 * Common subroutine to open a TCP connection to remote host specified
403 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
404 * port number if needed. Call in_pcbladdr to do the routing and to choose
405 * a local host address (interface). If there is an existing incarnation
406 * of the same connection in TIME-WAIT state and if the remote host was
407 * sending CC options and if the connection duration was < MSL, then
408 * truncate the previous TIME-WAIT state and proceed.
409 * Initialize connection parameters and enter SYN-SENT state.
413 register struct tcpcb
*tp
;
416 struct inpcb
*inp
= tp
->t_inpcb
, *oinp
;
417 struct socket
*so
= inp
->inp_socket
;
419 struct sockaddr_in
*sin
= mtod(nam
, struct sockaddr_in
*);
420 struct sockaddr_in
*ifaddr
;
422 struct rmxp_tao
*taop
;
423 struct rmxp_tao tao_noncached
;
425 OS_DbgPrint(OSK_MID_TRACE
,("Called\n"));
427 if (inp
->inp_lport
== 0) {
428 error
= in_pcbbind(inp
, NULL
);
434 * Cannot simply call in_pcbconnect, because there might be an
435 * earlier incarnation of this same connection still in
436 * TIME_WAIT state, creating an ADDRINUSE error.
438 error
= in_pcbladdr(inp
, nam
, &ifaddr
);
440 OS_DbgPrint(OSK_MID_TRACE
,("leaving %d\n", error
));
443 oinp
= in_pcblookup(inp
->inp_pcbinfo
->listhead
,
444 sin
->sin_addr
, sin
->sin_port
,
445 inp
->inp_laddr
.s_addr
!= INADDR_ANY
? inp
->inp_laddr
449 if (oinp
!= inp
&& (otp
= intotcpcb(oinp
)) != NULL
&&
450 otp
->t_state
== TCPS_TIME_WAIT
&&
451 otp
->t_duration
< TCPTV_MSL
&&
452 (otp
->t_flags
& TF_RCVD_CC
))
453 otp
= tcp_close(otp
);
455 OS_DbgPrint(OSK_MID_TRACE
,("leaving EADDRINUSE\n"));
459 if (inp
->inp_laddr
.s_addr
== INADDR_ANY
)
460 inp
->inp_laddr
= ifaddr
->sin_addr
;
461 inp
->inp_faddr
= sin
->sin_addr
;
462 inp
->inp_fport
= sin
->sin_port
;
465 tp
->t_template
= tcp_template(tp
);
466 if (tp
->t_template
== 0) {
467 in_pcbdisconnect(inp
);
468 OS_DbgPrint(OSK_MID_TRACE
,("Leaving ENOBUFS\n"));
472 /* Compute window scaling to request. */
473 while (tp
->request_r_scale
< TCP_MAX_WINSHIFT
&&
474 (TCP_MAXWIN
<< tp
->request_r_scale
) < so
->so_rcv
.sb_hiwat
)
475 tp
->request_r_scale
++;
478 tcpstat
.tcps_connattempt
++;
479 tp
->t_state
= TCPS_SYN_SENT
;
480 tp
->t_timer
[TCPT_KEEP
] = tcp_keepinit
;
481 tp
->iss
= tcp_iss
; tcp_iss
+= TCP_ISSINCR
/2;
485 * Generate a CC value for this connection and
486 * check whether CC or CCnew should be used.
488 if ((taop
= tcp_gettaocache(tp
->t_inpcb
)) == NULL
) {
489 taop
= &tao_noncached
;
490 bzero(taop
, sizeof(*taop
));
493 tp
->cc_send
= CC_INC(tcp_ccgen
);
494 if (taop
->tao_ccsent
!= 0 &&
495 CC_GEQ(tp
->cc_send
, taop
->tao_ccsent
)) {
496 taop
->tao_ccsent
= tp
->cc_send
;
498 taop
->tao_ccsent
= 0;
499 tp
->t_flags
|= TF_SENDCCNEW
;
502 OS_DbgPrint(OSK_MID_TRACE
,("Leaving 0\n"));
507 tcp_ctloutput(op
, so
, level
, optname
, mp
)
515 register struct tcpcb
*tp
;
516 register struct mbuf
*m
;
523 if (op
== PRCO_SETOPT
&& *mp
)
527 if (level
!= IPPROTO_TCP
) {
528 error
= ip_ctloutput(op
, so
, level
, optname
, mp
);
541 if (m
== NULL
|| m
->m_len
< sizeof (int))
543 else if (*mtod(m
, int *))
544 tp
->t_flags
|= TF_NODELAY
;
546 tp
->t_flags
&= ~TF_NODELAY
;
550 if (m
&& (i
= *mtod(m
, int *)) > 0 && i
<= tp
->t_maxseg
)
557 if (m
== NULL
|| m
->m_len
< sizeof (int))
559 else if (*mtod(m
, int *))
560 tp
->t_flags
|= TF_NOOPT
;
562 tp
->t_flags
&= ~TF_NOOPT
;
566 if (m
== NULL
|| m
->m_len
< sizeof (int))
568 else if (*mtod(m
, int *))
569 tp
->t_flags
|= TF_NOPUSH
;
571 tp
->t_flags
&= ~TF_NOPUSH
;
583 *mp
= m
= m_get(M_WAIT
, MT_SOOPTS
);
584 m
->m_len
= sizeof(int);
588 *mtod(m
, int *) = tp
->t_flags
& TF_NODELAY
;
591 *mtod(m
, int *) = tp
->t_maxseg
;
594 *mtod(m
, int *) = tp
->t_flags
& TF_NOOPT
;
597 *mtod(m
, int *) = tp
->t_flags
& TF_NOPUSH
;
610 * tcp_sendspace and tcp_recvspace are the default send and receive window
611 * sizes, respectively. These are obsolescent (this information should
612 * be set by the route).
614 u_long tcp_sendspace
= 1024*16;
615 u_long tcp_recvspace
= 1024*16;
618 * Attach TCP protocol to socket, allocating
619 * internet protocol control block, tcp control block,
620 * buffer space, and entering LISTEN state if to accept connections.
626 register struct tcpcb
*tp
;
630 if (so
->so_snd
.sb_hiwat
== 0 || so
->so_rcv
.sb_hiwat
== 0) {
631 error
= soreserve(so
, tcp_sendspace
, tcp_recvspace
);
635 error
= in_pcballoc(so
, &tcbinfo
);
639 tp
= tcp_newtcpcb(inp
);
641 int nofd
= so
->so_state
& SS_NOFDREF
; /* XXX */
643 so
->so_state
&= ~SS_NOFDREF
; /* don't free the socket yet */
645 so
->so_state
|= nofd
;
648 tp
->t_state
= TCPS_CLOSED
;
653 * Initiate (or continue) disconnect.
654 * If embryonic state, just send reset (once).
655 * If in ``let data drain'' option and linger null, just drop.
656 * Otherwise (hard), mark socket disconnecting and drop
657 * current input data; switch states based on user close, and
658 * send segment to peer (with FIN).
662 register struct tcpcb
*tp
;
664 struct socket
*so
= tp
->t_inpcb
->inp_socket
;
666 if (tp
->t_state
< TCPS_ESTABLISHED
)
668 else if ((so
->so_options
& SO_LINGER
) && so
->so_linger
== 0)
669 tp
= tcp_drop(tp
, 0);
671 soisdisconnecting(so
);
672 sbflush(&so
->so_rcv
);
673 tp
= tcp_usrclosed(tp
);
675 (void) tcp_output(tp
);
681 * User issued close, and wish to trail through shutdown states:
682 * if never received SYN, just forget it. If got a SYN from peer,
683 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
684 * If already got a FIN from peer, then almost done; go to LAST_ACK
685 * state. In all other cases, have already sent FIN to peer (e.g.
686 * after PRU_SHUTDOWN), and just have to play tedious game waiting
687 * for peer to send FIN or not respond to keep-alives, etc.
688 * We can let the user exit from the close as soon as the FIN is acked.
692 register struct tcpcb
*tp
;
695 switch (tp
->t_state
) {
699 tp
->t_state
= TCPS_CLOSED
;
704 case TCPS_SYN_RECEIVED
:
705 tp
->t_flags
|= TF_NEEDFIN
;
708 case TCPS_ESTABLISHED
:
709 tp
->t_state
= TCPS_FIN_WAIT_1
;
712 case TCPS_CLOSE_WAIT
:
713 tp
->t_state
= TCPS_LAST_ACK
;
716 if (tp
&& tp
->t_state
>= TCPS_FIN_WAIT_2
) {
717 soisdisconnected(tp
->t_inpcb
->inp_socket
);
718 /* To prevent the connection hanging in FIN_WAIT_2 forever. */
719 if (tp
->t_state
== TCPS_FIN_WAIT_2
)
720 tp
->t_timer
[TCPT_2MSL
] = tcp_maxidle
;
726 * Sysctl for tcp variables.
729 tcp_sysctl(name
, namelen
, oldp
, oldlenp
, newp
, newlen
)
737 /* All sysctl names at this level are terminal. */
742 case TCPCTL_DO_RFC1323
:
743 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
745 case TCPCTL_DO_RFC1644
:
746 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
749 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
752 return (sysctl_rdstruct(oldp
, oldlenp
, newp
, &tcpstat
,
755 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
, &tcp_rttdflt
));
756 case TCPCTL_KEEPIDLE
:
757 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
759 case TCPCTL_KEEPINTVL
:
760 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
762 case TCPCTL_SENDSPACE
:
763 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
764 (int *)&tcp_sendspace
)); /* XXX */
765 case TCPCTL_RECVSPACE
:
766 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
767 (int *)&tcp_recvspace
)); /* XXX */
768 case TCPCTL_KEEPINIT
:
769 return (sysctl_int(oldp
, oldlenp
, newp
, newlen
,
772 return (ENOPROTOOPT
);