Index: sys/netinet/tcp_output.c =================================================================== RCS file: /home/kmacy/devel/ncvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.141 diff -d -u -r1.141 tcp_output.c --- sys/netinet/tcp_output.c 7 Oct 2007 20:44:24 -0000 1.141 +++ sys/netinet/tcp_output.c 21 Oct 2007 21:01:33 -0000 @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -118,540 +119,345 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); - -/* - * Tcp output routine: figure out what should be sent and send it. - */ -int -tcp_output(struct tcpcb *tp) -{ - struct socket *so = tp->t_inpcb->inp_socket; - long len, recwin, sendwin; - int off, flags, error; +struct tcp_output_state { + int sack_rxmit; + int sack_bytes_rxmt; + int sendwin; + int recwin; + int off; + int idle; + int tso; + int sendalot; + int isipv6; + unsigned int hdrlen; + unsigned int optlen; + struct sackhole *p; #ifdef TCP_SIGNATURE - int sigoff = 0; + int sigoff; #endif - struct mbuf *m; - struct ip *ip = NULL; - struct ipovly *ipov = NULL; - struct tcphdr *th; u_char opt[TCP_MAXOLEN]; - unsigned ipoptlen, optlen, hdrlen; - int idle, sendalot; - int sack_rxmit, sack_bytes_rxmt; - struct sackhole *p; - int tso = 0; - struct tcpopt to; -#if 0 - int maxburst = TCP_MAXBURST; -#endif -#ifdef INET6 - struct ip6_hdr *ip6 = NULL; - int isipv6; - - isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; -#endif - - INP_LOCK_ASSERT(tp->t_inpcb); +}; - /* - * Determine length of data that should be transmitted, - * and flags that will be used. - * If there is some data or critical controls (SYN, RST) - * to send, then transmit; otherwise, investigate further. 
- */ - idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); - if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { +static int +tcp_output_sack_fast_recovery(struct tcpcb *tp, struct tcp_output_state *tos) +{ + long cwin; + struct sackhole *p = tos->p; + int len = 0; + + cwin = min(tp->snd_wnd, tp->snd_cwnd) - tos->sack_bytes_rxmt; + if (cwin < 0) + cwin = 0; + /* Do not retransmit SACK segments beyond snd_recover */ + if (SEQ_GT(p->end, tp->snd_recover)) { /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * - * Set the slow-start flight size depending on whether - * this is a local network or not. + * (At least) part of sack hole extends beyond + * snd_recover. Check to see if we can rexmit data + * for this hole. */ - int ss = ss_fltsz; -#ifdef INET6 - if (isipv6) { - if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = ss_fltsz_local; - } else -#endif /* INET6 */ - if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = ss_fltsz_local; - tp->snd_cwnd = tp->t_maxseg * ss; - } - tp->t_flags &= ~TF_LASTIDLE; - if (idle) { - if (tp->t_flags & TF_MORETOCOME) { - tp->t_flags |= TF_LASTIDLE; - idle = 0; - } - } -again: - /* - * If we've recently taken a timeout, snd_max will be greater than - * snd_nxt. There may be SACK information that allows us to avoid - * resending already delivered data. Adjust snd_nxt accordingly. - */ - if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_LT(tp->snd_nxt, tp->snd_max)) - tcp_sack_adjust(tp); - sendalot = 0; - off = tp->snd_nxt - tp->snd_una; - sendwin = min(tp->snd_wnd, tp->snd_cwnd); - sendwin = min(sendwin, tp->snd_bwnd); - - flags = tcp_outflags[tp->t_state]; - /* - * Send any SACK-generated retransmissions. If we're explicitly trying - * to send out new data (when sendalot is 1), bypass this function. 
- * If we retransmit in fast recovery mode, decrement snd_cwnd, since - * we're replacing a (future) new transmission with a retransmission - * now, and we previously incremented snd_cwnd in tcp_input(). - */ - /* - * Still in sack recovery , reset rxmit flag to zero. - */ - sack_rxmit = 0; - sack_bytes_rxmt = 0; - len = 0; - p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && - (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { - long cwin; - - cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - /* Do not retransmit SACK segments beyond snd_recover */ - if (SEQ_GT(p->end, tp->snd_recover)) { + if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { /* - * (At least) part of sack hole extends beyond - * snd_recover. Check to see if we can rexmit data - * for this hole. + * Can't rexmit any more data for this hole. + * That data will be rexmitted in the next + * sack recovery episode, when snd_recover + * moves past p->rxmit. */ - if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { - /* - * Can't rexmit any more data for this hole. - * That data will be rexmitted in the next - * sack recovery episode, when snd_recover - * moves past p->rxmit. - */ - p = NULL; - goto after_sack_rexmit; - } else - /* Can rexmit part of the current hole */ - len = ((long)ulmin(cwin, - tp->snd_recover - p->rxmit)); + tos->p = NULL; + return (len); } else - len = ((long)ulmin(cwin, p->end - p->rxmit)); - off = p->rxmit - tp->snd_una; - KASSERT(off >= 0,("%s: sack block to the left of una : %d", - __func__, off)); - if (len > 0) { - sack_rxmit = 1; - sendalot = 1; - tcpstat.tcps_sack_rexmits++; - tcpstat.tcps_sack_rexmit_bytes += - min(len, tp->t_maxseg); - } - } -after_sack_rexmit: - /* - * Get standard flags, and add SYN or FIN if requested by 'hidden' - * state flags. 
- */ - if (tp->t_flags & TF_NEEDFIN) - flags |= TH_FIN; - if (tp->t_flags & TF_NEEDSYN) - flags |= TH_SYN; - - SOCKBUF_LOCK(&so->so_snd); - /* - * If in persist timeout with window of 0, send 1 byte. - * Otherwise, if window is small but nonzero - * and timer expired, we will send what we can - * and go to transmit state. - */ - if (tp->t_flags & TF_FORCEDATA) { - if (sendwin == 0) { - /* - * If we still have some data to send, then - * clear the FIN bit. Usually this would - * happen below when it realizes that we - * aren't sending all the data. However, - * if we have exactly 1 byte of unsent data, - * then it won't clear the FIN bit below, - * and if we are in persist state, we wind - * up sending the packet without recording - * that we sent the FIN bit. - * - * We can't just blindly clear the FIN bit, - * because if we don't have any more data - * to send then the probe will be the FIN - * itself. - */ - if (off < so->so_snd.sb_cc) - flags &= ~TH_FIN; - sendwin = 1; - } else { - tcp_timer_activate(tp, TT_PERSIST, 0); - tp->t_rxtshift = 0; - } - } - - /* - * If snd_nxt == snd_max and we have transmitted a FIN, the - * offset will be > 0 even if so_snd.sb_cc is 0, resulting in - * a negative length. This can also occur when TCP opens up - * its congestion window while receiving additional duplicate - * acks after fast-retransmit because TCP will reset snd_nxt - * to snd_max after the fast-retransmit. - * - * In the normal retransmit-FIN-only case, however, snd_nxt will - * be set to snd_una, the offset will be 0, and the length may - * wind up 0. - * - * If sack_rxmit is true we are retransmitting from the scoreboard - * in which case len is already set. - */ - if (sack_rxmit == 0) { - if (sack_bytes_rxmt == 0) - len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); - else { - long cwin; - - /* - * We are inside of a SACK recovery episode and are - * sending new data, having retransmitted all the - * data possible in the scoreboard. 
- */ - len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - - off); - /* - * Don't remove this (len > 0) check ! - * We explicitly check for len > 0 here (although it - * isn't really necessary), to work around a gcc - * optimization issue - to force gcc to compute - * len above. Without this check, the computation - * of len is bungled by the optimizer. - */ - if (len > 0) { - cwin = tp->snd_cwnd - - (tp->snd_nxt - tp->sack_newdata) - - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - len = lmin(len, cwin); - } - } - } - - /* - * Lop off SYN bit if it has already been sent. However, if this - * is SYN-SENT state and if segment contains data and if we don't - * know that foreign host supports TAO, suppress sending segment. - */ - if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { - if (tp->t_state != TCPS_SYN_RECEIVED) - flags &= ~TH_SYN; - off--, len++; + /* Can rexmit part of the current hole */ + len = ((long)ulmin(cwin, + tp->snd_recover - p->rxmit)); + } else + len = ((long)ulmin(cwin, p->end - p->rxmit)); + tos->off = p->rxmit - tp->snd_una; + KASSERT(tos->off >= 0,("%s: sack block to the left of una : %d", + __func__, tos->off)); + if (len > 0) { + tos->sack_rxmit = 1; + tos->sendalot = 1; + tcpstat.tcps_sack_rexmits++; + tcpstat.tcps_sack_rexmit_bytes += + min(len, tp->t_maxseg); } + return (len); +} - /* - * Be careful not to send data and/or FIN on SYN segments. - * This measure is needed to prevent interoperability problems - * with not fully conformant TCP implementations. - */ - if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { - len = 0; - flags &= ~TH_FIN; +static void +tcp_output_check_force_byte(struct socket *so, int *flags, struct tcp_output_state *tos) +{ + struct tcpcb *tp = sototcpcb(so); + + if (tos->sendwin == 0) { + /* + * If we still have some data to send, then + * clear the FIN bit. Usually this would + * happen below when it realizes that we + * aren't sending all the data. 
However, + * if we have exactly 1 byte of unsent data, + * then it won't clear the FIN bit below, + * and if we are in persist state, we wind + * up sending the packet without recording + * that we sent the FIN bit. + * + * We can't just blindly clear the FIN bit, + * because if we don't have any more data + * to send then the probe will be the FIN + * itself. + */ + if (tos->off < so->so_snd.sb_cc) + *flags &= ~TH_FIN; + tos->sendwin = 1; + } else { + tcp_timer_activate(tp, TT_PERSIST, 0); + tp->t_rxtshift = 0; } +} +/* + * If snd_nxt == snd_max and we have transmitted a FIN, the + * offset will be > 0 even if so_snd.sb_cc is 0, resulting in + * a negative length. This can also occur when TCP opens up + * its congestion window while receiving additional duplicate + * acks after fast-retransmit because TCP will reset snd_nxt + * to snd_max after the fast-retransmit. + * + * In the normal retransmit-FIN-only case, however, snd_nxt will + * be set to snd_una, the offset will be 0, and the length may + * wind up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. + */ +static int +tcp_output_sack_rxmit_bytes(struct socket *so, struct tcp_output_state *tos) +{ + int len; + struct tcpcb *tp = sototcpcb(so); + + if (tos->sack_bytes_rxmt == 0) + len = ((long)ulmin(so->so_snd.sb_cc, tos->sendwin) - tos->off); + else { + long cwin; + + /* + * We are inside of a SACK recovery episode and are + * sending new data, having retransmitted all the + * data possible in the scoreboard. + */ + len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - tos->off); - if (len < 0) { /* - * If FIN has been sent but not acked, - * but we haven't been called to retransmit, - * len will be < 0. Otherwise, window shrank - * after we sent into it. If window shrank to 0, - * cancel pending retransmit, pull snd_nxt back - * to (closed) window, and set the persist timer - * if it isn't already going. 
If the window didn't - * close completely, just wait for an ACK. + * Don't remove this (len > 0) check ! + * We explicitly check for len > 0 here (although it + * isn't really necessary), to work around a gcc + * optimization issue - to force gcc to compute + * len above. Without this check, the computation + * of len is bungled by the optimizer. */ - len = 0; - if (sendwin == 0) { - tcp_timer_activate(tp, TT_REXMT, 0); - tp->t_rxtshift = 0; - tp->snd_nxt = tp->snd_una; - if (!tcp_timer_active(tp, TT_PERSIST)) - tcp_setpersist(tp); + if (len > 0) { + cwin = tp->snd_cwnd - + (tp->snd_nxt - tp->sack_newdata) - + tos->sack_bytes_rxmt; + if (cwin < 0) + cwin = 0; + len = lmin(len, cwin); } } + return (len); +} - /* len will be >= 0 after this point. */ - KASSERT(len >= 0, ("%s: len < 0", __func__)); +/* + * Automatic sizing of send socket buffer. Often the send buffer + * size is not optimally adjusted to the actual network conditions + * at hand (delay bandwidth product). Setting the buffer size too + * small limits throughput on links with high bandwidth and high + * delay (eg. trans-continental/oceanic links). Setting the + * buffer size too big consumes too much real kernel memory, + * especially with many connections on busy servers. + * + * The criteria to step up the send buffer one notch are: + * 1. receive window of remote host is larger than send buffer + * (with a fudge factor of 5/4th); + * 2. send buffer is filled to 7/8th with data (so we actually + * have data to make use of it); + * 3. send buffer fill has not hit maximal automatic size; + * 4. our send window (slow start and congestion controlled) is + * larger than sent but unacknowledged data in send buffer. + * + * The remote host receive window scaling factor may limit the + * growing of the send buffer before it reaches its allowed + * maximum. + * + * It scales directly with slow start or congestion window + * and does at most one step per received ACK.
This fast + * scaling has the drawback of growing the send buffer beyond + * what is strictly necessary to make full use of a given + * delay*bandwidth product. However testing has shown this not + * to be much of a problem. At worst we are trading wasting + * of available bandwidth (the non-use of it) for wasting some + * socket buffer memory. + * + * TODO: Shrink send buffer during idle periods together + * with congestion window. Requires another timer. Has to + * wait for upcoming tcp timer rewrite. + */ - /* - * Automatic sizing of send socket buffer. Often the send buffer - * size is not optimally adjusted to the actual network conditions - * at hand (delay bandwidth product). Setting the buffer size too - * small limits throughput on links with high bandwidth and high - * delay (eg. trans-continental/oceanic links). Setting the - * buffer size too big consumes too much real kernel memory, - * especially with many connections on busy servers. - * - * The criteria to step up the send buffer one notch are: - * 1. receive window of remote host is larger than send buffer - * (with a fudge factor of 5/4th); - * 2. send buffer is filled to 7/8th with data (so we actually - * have data to make use of it); - * 3. send buffer fill has not hit maximal automatic size; - * 4. our send window (slow start and cogestion controlled) is - * larger than sent but unacknowledged data in send buffer. - * - * The remote host receive window scaling factor may limit the - * growing of the send buffer before it reaches its allowed - * maximum. - * - * It scales directly with slow start or congestion window - * and does at most one step per received ACK. This fast - * scaling has the drawback of growing the send buffer beyond - * what is strictly necessary to make full use of a given - * delay*bandwith product. However testing has shown this not - * to be much of an problem.
At worst we are trading wasting - * of available bandwith (the non-use of it) for wasting some - * socket buffer memory. - * - * TODO: Shrink send buffer during idle periods together - * with congestion window. Requires another timer. Has to - * wait for upcoming tcp timer rewrite. - */ - if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { - if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && - so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && - so->so_snd.sb_cc < tcp_autosndbuf_max && - sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { - if (!sbreserve_locked(&so->so_snd, - min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, - tcp_autosndbuf_max), so, curthread)) - so->so_snd.sb_flags &= ~SB_AUTOSIZE; - } - } +static __inline void +tcp_output_socket_autosize(struct socket *so, struct tcp_output_state *tos) +{ + struct tcpcb *tp = sototcpcb(so); + + if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && + so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && + so->so_snd.sb_cc < tcp_autosndbuf_max && + tos->sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + if (!sbreserve_locked(&so->so_snd, + min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, + tcp_autosndbuf_max), so, curthread)) + so->so_snd.sb_flags &= ~SB_AUTOSIZE; + } +} - /* - * Truncate to the maximum segment length or enable TCP Segmentation - * Offloading (if supported by hardware) and ensure that FIN is removed - * if the length no longer contains the last data byte. - * - * TSO may only be used if we are in a pure bulk sending state. The - * presence of TCP-MD5, SACK retransmits, SACK advertizements and - * IP options prevent using TSO. With TSO the TCP header is the same - * (except for the sequence number) for all generated packets. This - * makes it impossible to transmit any options which vary per generated - * segment or packet. - * - * The length of TSO bursts is limited to TCP_MAXWIN. 
That limit and - * removal of FIN (if not already catched here) are handled later after - * the exact length of the TCP options are known. - */ - if (len > tp->t_maxseg) { - if ((tp->t_flags & TF_TSO) && tcp_do_tso && - ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && - tp->t_inpcb->inp_options == NULL && - tp->t_inpcb->in6p_options == NULL && - tp->t_inpcb->inp_sp == NULL) { - tso = 1; - } else { - len = tp->t_maxseg; - sendalot = 1; - tso = 0; - } - } - if (sack_rxmit) { - if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) - flags &= ~TH_FIN; +/* + * Truncate to the maximum segment length or enable TCP Segmentation + * Offloading (if supported by hardware) and ensure that FIN is removed + * if the length no longer contains the last data byte. + * + * TSO may only be used if we are in a pure bulk sending state. The + * presence of TCP-MD5, SACK retransmits, SACK advertisements and + * IP options prevent using TSO. With TSO the TCP header is the same + * (except for the sequence number) for all generated packets. This + * makes it impossible to transmit any options which vary per generated + * segment or packet. + * + * The length of TSO bursts is limited to TCP_MAXWIN. That limit and + * removal of FIN (if not already caught here) are handled later after + * the exact length of the TCP options is known.
+ */ +static __inline int +tcp_output_check_tso(struct tcpcb *tp, int curlen, struct tcp_output_state *tos) +{ + int len; + + if ((tp->t_flags & TF_TSO) && tcp_do_tso && + ((tp->t_flags & TF_SIGNATURE) == 0) && + tp->rcv_numsacks == 0 && tos->sack_rxmit == 0 && + tp->t_inpcb->inp_options == NULL && + tp->t_inpcb->in6p_options == NULL && + tp->t_inpcb->inp_sp == NULL) { + tos->tso = 1; + len = curlen; } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) - flags &= ~TH_FIN; + len = tp->t_maxseg; + tos->sendalot = 1; + tos->tso = 0; } + return (len); +} - recwin = sbspace(&so->so_rcv); +/* + * Sender silly window avoidance. We transmit under the following + * conditions when len is non-zero: + * + * - We have a full segment (or more with TSO) + * - This is the last buffer in a write()/send() and we are + * either idle or running NODELAY + * - we've timed out (e.g. persist timer) + * - we have more then 1/2 the maximum send window's worth of + * data (receiver may be limited the window size) + * - we need to retransmit + */ +static __inline int +tcp_output_silly_window_check(struct socket *so, int len, struct tcp_output_state *tos) +{ + struct tcpcb *tp = sototcpcb(so); + if (len >= tp->t_maxseg) + return (1); /* - * Sender silly window avoidance. We transmit under the following - * conditions when len is non-zero: + * NOTE! on localhost connections an 'ack' from the remote + * end may occur synchronously with the output and cause + * us to flush a buffer queued with moretocome. XXX * - * - We have a full segment (or more with TSO) - * - This is the last buffer in a write()/send() and we are - * either idle or running NODELAY - * - we've timed out (e.g. persist timer) - * - we have more then 1/2 the maximum send window's worth of - * data (receiver may be limited the window size) - * - we need to retransmit + * note: the len + off check is almost certainly unnecessary. */ - if (len) { - if (len >= tp->t_maxseg) - goto send; - /* - * NOTE! 
on localhost connections an 'ack' from the remote - * end may occur synchronously with the output and cause - * us to flush a buffer queued with moretocome. XXX - * - * note: the len + off check is almost certainly unnecessary. - */ - if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ - (idle || (tp->t_flags & TF_NODELAY)) && - len + off >= so->so_snd.sb_cc && - (tp->t_flags & TF_NOPUSH) == 0) { - goto send; - } - if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ - goto send; - if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) - goto send; - if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ - goto send; - if (sack_rxmit) - goto send; - } + if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ + (tos->idle || (tp->t_flags & TF_NODELAY)) && + len + tos->off >= so->so_snd.sb_cc && + (tp->t_flags & TF_NOPUSH) == 0) + return (1); + + if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ + return (1); + if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) + return (1); + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ + return (1); + if (tos->sack_rxmit) + return (1); - /* - * Compare available window to amount of window - * known to peer (as advertised window less - * next expected input). If the difference is at least two - * max size segments, or at least 50% of the maximum possible - * window, then want to send a window update to peer. - * Skip this if the connection is in T/TCP half-open state. - * Don't send pure window updates when the peer has closed - * the connection and won't ever send more data. - */ - if (recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && - !TCPS_HAVERCVDFIN(tp->t_state)) { - /* - * "adv" is the amount we can increase the window, - * taking into account that we are limited by - * TCP_MAXWIN << tp->rcv_scale. 
- */ - long adv = min(recwin, (long)TCP_MAXWIN << tp->rcv_scale) - - (tp->rcv_adv - tp->rcv_nxt); + return (0); +} - if (adv >= (long) (2 * tp->t_maxseg)) - goto send; - if (2 * adv >= (long) so->so_rcv.sb_hiwat) - goto send; - } - /* - * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW - * is also a catch-all for the retransmit timer timeout case. - */ - if (tp->t_flags & TF_ACKNOW) - goto send; - if ((flags & TH_RST) || - ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) - goto send; - if (SEQ_GT(tp->snd_up, tp->snd_una)) - goto send; - /* - * If our state indicates that FIN should be sent - * and we have not yet done so, then we need to send. - */ - if (flags & TH_FIN && - ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) - goto send; - /* - * In SACK, it is possible for tcp_output to fail to send a segment - * after the retransmission timer has been turned off. Make sure - * that the retransmission timer is set. - */ - if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_GT(tp->snd_max, tp->snd_una) && - !tcp_timer_active(tp, TT_REXMT) && - !tcp_timer_active(tp, TT_PERSIST)) { - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); - goto just_return; - } - /* - * TCP window updates are not reliable, rather a polling protocol - * using ``persist'' packets is used to insure receipt of window - * updates. The three ``states'' for the output side are: - * idle not doing retransmits or persists - * persisting to move a small or zero window - * (re)transmitting and thereby not persisting - * - * tcp_timer_active(tp, TT_PERSIST) - * is true when we are in persist state. - * (tp->t_flags & TF_FORCEDATA) - * is set when we are called to send a persist packet. - * tcp_timer_active(tp, TT_REXMT) - * is set when we are retransmitting - * The output side is idle when both timers are zero. - * - * If send window is too small, there is data to transmit, and no - * retransmit or persist is pending, then go to persist state. 
- * If nothing happens soon, send when timer expires: - * if window is nonzero, transmit what we can, - * otherwise force out a byte. - */ +/* + * TCP window updates are not reliable, rather a polling protocol + * using ``persist'' packets is used to insure receipt of window + * updates. The three ``states'' for the output side are: + * idle not doing retransmits or persists + * persisting to move a small or zero window + * (re)transmitting and thereby not persisting + * + * tcp_timer_active(tp, TT_PERSIST) + * is true when we are in persist state. + * (tp->t_flags & TF_FORCEDATA) + * is set when we are called to send a persist packet. + * tcp_timer_active(tp, TT_REXMT) + * is set when we are retransmitting + * The output side is idle when both timers are zero. + * + * If send window is too small, there is data to transmit, and no + * retransmit or persist is pending, then go to persist state. + * If nothing happens soon, send when timer expires: + * if window is nonzero, transmit what we can, + * otherwise force out a byte. + */ +static __inline void +tcp_output_check_persist(struct socket *so) +{ + struct tcpcb *tp = sototcpcb(so); + if (so->so_snd.sb_cc && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { tp->t_rxtshift = 0; tcp_setpersist(tp); - } - - /* - * No reason to send a segment, just return. - */ -just_return: - SOCKBUF_UNLOCK(&so->so_snd); - return (0); - -send: - SOCKBUF_LOCK_ASSERT(&so->so_snd); - /* - * Before ESTABLISHED, force sending of initial options - * unless TCP set not to do any options. - * NOTE: we assume that the IP/TCP header plus TCP options - * always fit in a single mbuf, leaving room for a maximum - * link header, i.e. - * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES - */ - optlen = 0; -#ifdef INET6 - if (isipv6) - hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); - else -#endif - hdrlen = sizeof (struct tcpiphdr); + } +} - /* - * Compute options for segment. 
- * We only have to care about SYN and established connection - * segments. Options for SYN-ACK segments are handled in TCP - * syncache. - */ - if ((tp->t_flags & TF_NOOPT) == 0) { - to.to_flags = 0; +static __inline void +tcp_output_calculate_options(struct tcpcb *tp, int flags, struct tcpopt *to) +{ + to->to_flags = 0; /* Maximum segment size. */ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; - to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); - to.to_flags |= TOF_MSS; + to->to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc); + to->to_flags |= TOF_MSS; } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { - to.to_wscale = tp->request_r_scale; - to.to_flags |= TOF_SCALE; + to->to_wscale = tp->request_r_scale; + to->to_flags |= TOF_SCALE; } /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { - to.to_tsval = ticks + tp->ts_offset; - to.to_tsecr = tp->ts_recent; - to.to_flags |= TOF_TS; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + + to->to_tsval = ticks + tp->ts_offset; + to->to_tsecr = tp->ts_recent; + to->to_flags |= TOF_TS; /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) @@ -660,35 +466,36 @@ /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) - to.to_flags |= TOF_SACKPERM; + to->to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { - to.to_flags |= TOF_SACK; - to.to_nsacks = tp->rcv_numsacks; - to.to_sacks = (u_char *)tp->sackblks; + to->to_flags |= TOF_SACK; + to->to_nsacks = tp->rcv_numsacks; + to->to_sacks = (u_char *)tp->sackblks; } } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). 
*/ #ifdef INET6 - if (!isipv6 && (tp->t_flags & TF_SIGNATURE)) + if (!tos->isipv6 && (tp->t_flags & TF_SIGNATURE)) #else if (tp->t_flags & TF_SIGNATURE) #endif /* INET6 */ - to.to_flags |= TOF_SIGNATURE; + to->to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ - /* Processing the options. */ - hdrlen += optlen = tcp_addoptions(&to, (u_char *)&opt); +} -#ifdef TCP_SIGNATURE - sigoff = to.to_signature - (u_char *)&to; -#endif /* TCP_SIGNATURE */ - } +static __inline int +tcp_output_add_hdrlen(struct tcpcb *tp, int len, int *flags, struct tcp_output_state *tos) +{ + unsigned int ipoptlen; + unsigned int optlen = tos->optlen; + unsigned int hdrlen = tos->hdrlen; #ifdef INET6 - if (isipv6) + if (tos->isipv6) ipoptlen = ip6_optlen(tp->t_inpcb); else #endif @@ -707,7 +514,7 @@ * Clear the FIN bit because we cut off the tail of * the segment. * - * When doing TSO limit a burst to TCP_MAXWIN minus the + * When doing TSO limit a burst to TCP_MAXWIN minus the * IP, TCP and Options length to keep ip->ip_len from * overflowing. Prevent the last segment from being * fractional thus making them all equal sized and set @@ -715,17 +522,17 @@ * IP options or IPSEC are present. */ if (len + optlen + ipoptlen > tp->t_maxopd) { - flags &= ~TH_FIN; - if (tso) { + *flags &= ~TH_FIN; + if (tos->tso) { if (len > TCP_MAXWIN - hdrlen - optlen) { len = TCP_MAXWIN - hdrlen - optlen; len = len - (len % (tp->t_maxopd - optlen)); - sendalot = 1; + tos->sendalot = 1; } else if (tp->t_flags & TF_NEEDFIN) - sendalot = 1; + tos->sendalot = 1; } else { len = tp->t_maxopd - optlen - ipoptlen; - sendalot = 1; + tos->sendalot = 1; } } @@ -737,127 +544,32 @@ #endif panic("tcphdr too big"); /*#endif*/ + return (len); +} - /* - * Grab a header mbuf, attaching a copy of data to - * be transmitted, and initialize the header from - * the template for sends on this connection. 
- */ - if (len) { - struct mbuf *mb; - u_int moff; - - if ((tp->t_flags & TF_FORCEDATA) && len == 1) - tcpstat.tcps_sndprobe++; - else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { - tcpstat.tcps_sndrexmitpack++; - tcpstat.tcps_sndrexmitbyte += len; - } else { - tcpstat.tcps_sndpack++; - tcpstat.tcps_sndbyte += len; - } -#ifdef notyet - if ((m = m_copypack(so->so_snd.sb_mb, off, - (int)len, max_linkhdr + hdrlen)) == 0) { - SOCKBUF_UNLOCK(&so->so_snd); - error = ENOBUFS; - goto out; - } - /* - * m_copypack left space for our hdr; use it. - */ - m->m_len += hdrlen; - m->m_data -= hdrlen; -#else - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) { - SOCKBUF_UNLOCK(&so->so_snd); - error = ENOBUFS; - goto out; - } -#ifdef INET6 - if (MHLEN < hdrlen + max_linkhdr) { - MCLGET(m, M_DONTWAIT); - if ((m->m_flags & M_EXT) == 0) { - SOCKBUF_UNLOCK(&so->so_snd); - m_freem(m); - error = ENOBUFS; - goto out; - } - } -#endif - m->m_data += max_linkhdr; - m->m_len = hdrlen; - - /* - * Start the m_copy functions from the closest mbuf - * to the offset in the socket buffer chain. - */ - mb = sbsndptr(&so->so_snd, off, len, &moff); - - if (len <= MHLEN - hdrlen - max_linkhdr) { - m_copydata(mb, moff, (int)len, - mtod(m, caddr_t) + hdrlen); - m->m_len += len; - } else { - m->m_next = m_copy(mb, moff, (int)len); - if (m->m_next == NULL) { - SOCKBUF_UNLOCK(&so->so_snd); - (void) m_free(m); - error = ENOBUFS; - goto out; - } - } -#endif - /* - * If we're sending everything we've got, set PUSH. - * (This will keep happy those implementations which only - * give data to the user when a buffer fills or - * a PUSH comes in.) 
- */ - if (off + len == so->so_snd.sb_cc) - flags |= TH_PUSH; - SOCKBUF_UNLOCK(&so->so_snd); - } else { - SOCKBUF_UNLOCK(&so->so_snd); - if (tp->t_flags & TF_ACKNOW) - tcpstat.tcps_sndacks++; - else if (flags & (TH_SYN|TH_FIN|TH_RST)) - tcpstat.tcps_sndctrl++; - else if (SEQ_GT(tp->snd_up, tp->snd_una)) - tcpstat.tcps_sndurg++; - else - tcpstat.tcps_sndwinup++; +static __inline void +tcp_output_setup_headers(struct socket *so, struct mbuf *m, int len, int flags, struct tcp_output_state *tos) +{ + int isipv6 = tos->isipv6; + int tso = tos->tso; + int optlen = tos->optlen; + int hdrlen = tos->hdrlen; + struct tcphdr *th; + struct sackhole *p = tos->p; + struct tcpcb *tp = sototcpcb(so); + struct ip *ip = NULL; - MGETHDR(m, M_DONTWAIT, MT_DATA); - if (m == NULL) { - error = ENOBUFS; - goto out; - } -#ifdef INET6 - if (isipv6 && (MHLEN < hdrlen + max_linkhdr) && - MHLEN >= hdrlen) { - MH_ALIGN(m, hdrlen); - } else -#endif - m->m_data += max_linkhdr; - m->m_len = hdrlen; - } - SOCKBUF_UNLOCK_ASSERT(&so->so_snd); - m->m_pkthdr.rcvif = (struct ifnet *)0; -#ifdef MAC - mac_create_mbuf_from_inpcb(tp->t_inpcb, m); -#endif -#ifdef INET6 +#ifdef INET6 if (isipv6) { - ip6 = mtod(m, struct ip6_hdr *); + struct ip6_hdr *ip6; + + ip6 = mtod(m, struct ip6_hdr *); th = (struct tcphdr *)(ip6 + 1); tcpip_fillheaders(tp->t_inpcb, ip6, th); } else #endif /* INET6 */ - { + { ip = mtod(m, struct ip *); - ipov = (struct ipovly *)ip; th = (struct tcphdr *)(ip + 1); tcpip_fillheaders(tp->t_inpcb, ip, th); } @@ -883,7 +595,7 @@ * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) 
*/ - if (sack_rxmit == 0) { + if (tos->sack_rxmit == 0) { if (len || (flags & (TH_SYN|TH_FIN)) || tcp_timer_active(tp, TT_PERSIST)) th->th_seq = htonl(tp->snd_nxt); @@ -891,12 +603,12 @@ th->th_seq = htonl(tp->snd_max); } else { th->th_seq = htonl(p->rxmit); - p->rxmit += len; + tos->p->rxmit += len; tp->sackhint.sack_bytes_rexmit += len; } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { - bcopy(opt, th + 1, optlen); + bcopy(tos->opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } th->th_flags = flags; @@ -904,13 +616,13 @@ * Calculate receive window. Don't shrink window, * but avoid silly window syndrome. */ - if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)tp->t_maxseg) - recwin = 0; - if (recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) - recwin = (long)(tp->rcv_adv - tp->rcv_nxt); - if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) - recwin = (long)TCP_MAXWIN << tp->rcv_scale; + if (tos->recwin < (long)(so->so_rcv.sb_hiwat / 4) && + tos->recwin < (long)tp->t_maxseg) + tos->recwin = 0; + if (tos->recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) + tos->recwin = (long)(tp->rcv_adv - tp->rcv_nxt); + if (tos->recwin > (long)TCP_MAXWIN << tp->rcv_scale) + tos->recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a @@ -921,7 +633,7 @@ th->th_win = htons((u_short) (min(sbspace(&so->so_rcv), TCP_MAXWIN))); else - th->th_win = htons((u_short)(recwin >> tp->rcv_scale)); + th->th_win = htons((u_short)(tos->recwin >> tp->rcv_scale)); /* * Adjust the RXWIN0SENT flag - indicate that we have advertised @@ -931,7 +643,7 @@ * to read more data then can be buffered prior to transmitting on * the connection. 
*/ - if (recwin == 0) + if (tos->recwin == 0) tp->t_flags |= TF_RXWIN0SENT; else tp->t_flags &= ~TF_RXWIN0SENT; @@ -953,7 +665,7 @@ #endif if (tp->t_flags & TF_SIGNATURE) tcp_signature_compute(m, sizeof(struct ip), len, optlen, - (u_char *)(th + 1) + sigoff, IPSEC_DIR_OUTBOUND); + (u_char *)(th + 1) + tos->sigoff, IPSEC_DIR_OUTBOUND); #endif /* @@ -992,108 +704,244 @@ m->m_pkthdr.tso_segsz = tp->t_maxopd - optlen; } - /* - * In transmit state, time the transmission and arrange for - * the retransmit. In persist state, just set snd_max. - */ - if ((tp->t_flags & TF_FORCEDATA) == 0 || - !tcp_timer_active(tp, TT_PERSIST)) { - tcp_seq startseq = tp->snd_nxt; +} +static int +tcp_output_handle_error(struct tcpcb *tp, int error, int flags, struct tcp_output_state *tos) +{ + + switch (error) { + case EPERM: + tp->t_softerror = error; + return (error); + case ENOBUFS: + if (!tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tp->snd_cwnd = tp->t_maxseg; + return (0); + case EMSGSIZE: /* - * Advance snd_nxt over sequence space of this segment. + * For some reason the interface we used initially + * to send segments changed to another or lowered + * its MTU. + * + * tcp_mtudisc() will find out the new MTU and as + * its last action, initiate retransmission, so it + * is important to not do so here. + * + * If TSO was active we either got an interface + * without TSO capabilits or TSO was turned off. + * Disable it for this connection as too and + * immediatly retry with MSS sized segments generated + * by this function. */ - if (flags & (TH_SYN|TH_FIN)) { - if (flags & TH_SYN) - tp->snd_nxt++; - if (flags & TH_FIN) { - tp->snd_nxt++; - tp->t_flags |= TF_SENTFIN; - } - } - if (sack_rxmit) - goto timer; - tp->snd_nxt += len; - if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { - tp->snd_max = tp->snd_nxt; - /* - * Time this transmission if not a retransmission and - * not currently timing anything. 
- */ - if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; - tp->t_rtseq = startseq; - tcpstat.tcps_segstimed++; - } + if (tos->tso) + tp->t_flags &= ~TF_TSO; + tcp_mtudisc(tp->t_inpcb, 0); + return (0); + case EHOSTDOWN: + case EHOSTUNREACH: + case ENETDOWN: + case ENETUNREACH: + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_softerror = error; + return (0); } + /* FALLTHROUGH */ + default: + return (error); + } +} - /* - * Set retransmit timer if not currently set, - * and not doing a pure ack or a keep-alive probe. - * Initial value for retransmit timer is smoothed - * round-trip time + 2 * round-trip time variance. - * Initialize shift counter which is used for backoff - * of retransmit time. - */ -timer: - if (!tcp_timer_active(tp, TT_REXMT) && - ((sack_rxmit && tp->snd_nxt != tp->snd_max) || - (tp->snd_nxt != tp->snd_una))) { - if (tcp_timer_active(tp, TT_PERSIST)) { - tcp_timer_activate(tp, TT_PERSIST, 0); - tp->t_rxtshift = 0; - } - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); +/* + * Grab a header mbuf, attaching a copy of data to + * be transmitted, and initialize the header from + * the template for sends on this connection. + */ +static __inline int +tcp_output_xfer_send(struct socket *so, int len, int *flags, struct mbuf **mp, struct tcp_output_state *tos) +{ + struct mbuf *mb, *m; + u_int moff; + struct tcpcb *tp; + unsigned int hdrlen; + + hdrlen = tos->hdrlen; + tp = sototcpcb(so); + *mp = NULL; + + if ((tp->t_flags & TF_FORCEDATA) && len == 1) + tcpstat.tcps_sndprobe++; + else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || tos->sack_rxmit) { + tcpstat.tcps_sndrexmitpack++; + tcpstat.tcps_sndrexmitbyte += len; + } else { + tcpstat.tcps_sndpack++; + tcpstat.tcps_sndbyte += len; + } +#ifdef notyet + if ((m = m_copypack(so->so_snd.sb_mb, tos->off, + (int)len, max_linkhdr + hdrlen)) == 0) { + return (ENOBUFS); + } + /* + * m_copypack left space for our hdr; use it. 
+ */ + m->m_len += hdrlen; + m->m_data -= hdrlen; +#else + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) + return (ENOBUFS); +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) { + MCLGET(m, M_DONTWAIT); + if ((m->m_flags & M_EXT) == 0) { + m_freem(m); + return (ENOBUFS); } + } +#endif + m->m_data += max_linkhdr; + m->m_len = hdrlen; + + /* + * Start the m_copy functions from the closest mbuf + * to the offset in the socket buffer chain. + */ + mb = sbsndptr(&so->so_snd, tos->off, len, &moff); + + if (len <= MHLEN - hdrlen - max_linkhdr) { + m_copydata(mb, moff, (int)len, + mtod(m, caddr_t) + hdrlen); + m->m_len += len; } else { - /* - * Persist case, update snd_max but since we are in - * persist mode (no window) we do not update snd_nxt. - */ - int xlen = len; + m->m_next = m_copy(mb, moff, (int)len); + if (m->m_next == NULL) { + (void) m_free(m); + return (ENOBUFS); + } + } +#endif + /* + * If we're sending everything we've got, set PUSH. + * (This will keep happy those implementations which only + * give data to the user when a buffer fills or + * a PUSH comes in.) + */ + if (tos->off + len == so->so_snd.sb_cc) + *flags |= TH_PUSH; + + *mp = m; + return (0); +} + +static __inline void +tcp_output_setup_xmit_timer(struct tcpcb *tp, int len, int flags, struct tcp_output_state *tos) +{ + tcp_seq startseq = tp->snd_nxt; + + /* + * Advance snd_nxt over sequence space of this segment. + */ + if (flags & (TH_SYN|TH_FIN)) { if (flags & TH_SYN) - ++xlen; + tp->snd_nxt++; if (flags & TH_FIN) { - ++xlen; + tp->snd_nxt++; tp->t_flags |= TF_SENTFIN; } - if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) - tp->snd_max = tp->snd_nxt + len; } - -#ifdef TCPDEBUG + if (tos->sack_rxmit) + goto timer; + tp->snd_nxt += len; + if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + tp->snd_max = tp->snd_nxt; + /* + * Time this transmission if not a retransmission and + * not currently timing anything. 
+ */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + tcpstat.tcps_segstimed++; + } + } + /* - * Trace. + * Set retransmit timer if not currently set, + * and not doing a pure ack or a keep-alive probe. + * Initial value for retransmit timer is smoothed + * round-trip time + 2 * round-trip time variance. + * Initialize shift counter which is used for backoff + * of retransmit time. */ - if (so->so_options & SO_DEBUG) { - u_short save = 0; +timer: + if (!tcp_timer_active(tp, TT_REXMT) && + ((tos->sack_rxmit && tp->snd_nxt != tp->snd_max) || + (tp->snd_nxt != tp->snd_una))) { + if (tcp_timer_active(tp, TT_PERSIST)) { + tcp_timer_activate(tp, TT_PERSIST, 0); + tp->t_rxtshift = 0; + } + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + } +} + +#ifdef TCPDEBUG +static void +tcp_output_trace_pkt(struct tcpcb *tp, struct mbuf *m, struct tcp_output_state *tos) +{ + u_short save = 0; + struct ip *ip; + struct ipovly *ipov; + struct tcphdr *th; #ifdef INET6 - if (!isipv6) + struct ip6_hdr *ip6; + + if (!tos->isipv6) #endif - { - save = ipov->ih_len; - ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); - } - tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); + { + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + ipov = (struct ipovly *)ip; + save = ipov->ih_len; + ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + (th->th_off << 2) */); + } else { + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); /* TCP header starts right after struct ip6_hdr */ + } + tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); #ifdef INET6 - if (!isipv6) + if (!tos->isipv6) #endif ipov->ih_len = save; - } +} #endif - /* - * Fill in IP length and desired time to live and - * send to IP level. There should be a better way - * to handle ttl and tos; we could keep them in - * the template, but need a way to checksum without them. - */ - /* - * m->m_pkthdr.len should have been set before cksum calcuration, - * because in6_cksum() need it.
- */ +/* + * Fill in IP length and desired time to live and + * send to IP level. There should be a better way + * to handle ttl and tos; we could keep them in + * the template, but need a way to checksum without them. + */ +/* + * m->m_pkthdr.len should have been set before cksum calculation, + * because in6_cksum() need it. + */ + +static __inline int +tcp_output_ip_xmit(struct socket *so, struct mbuf *m, struct tcp_output_state *tos) +{ + struct ip *ip = NULL; + struct tcpcb *tp = sototcpcb(so); + int error; #ifdef INET6 - if (isipv6) { + struct ip6_hdr *ip6 = NULL; + + if (tos->isipv6) { + ip6 = mtod(m, struct ip6_hdr *); /* * we separately set hoplimit for every segment, since the * user might want to change the value via setsockopt. @@ -1109,96 +957,419 @@ IP_ROUTETOIF : 0), NULL, NULL, tp->t_inpcb); } else #endif /* INET6 */ - { - ip->ip_len = m->m_pkthdr.len; + { + ip = mtod(m, struct ip *); + + ip->ip_len = m->m_pkthdr.len; #ifdef INET6 - if (INP_CHECK_SOCKAF(so, AF_INET6)) - ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); + if (INP_CHECK_SOCKAF(so, AF_INET6)) + ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ + /* + * If we do path MTU discovery, then we set DF on every packet. + * This might not be the best thing to do according to RFC3390 + * Section 2. However the tcp hostcache migitates the problem + * so it affects only the first tcp connection with a host. + */ + if (path_mtu_discovery) + ip->ip_off |= IP_DF; + + error = ip_output(m, tp->t_inpcb->inp_options, NULL, + ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, + tp->t_inpcb); + } + + return (error); +} +static void +tcp_output_handle_ip_xmit_fail(struct tcpcb *tp, int len, int error, int flags, + struct tcp_output_state *tos) +{ + struct sackhole *p = tos->p; + /* - * If we do path MTU discovery, then we set DF on every packet. - * This might not be the best thing to do according to RFC3390 - * Section 2. 
However the tcp hostcache migitates the problem - * so it affects only the first tcp connection with a host. + * We know that the packet was lost, so back out the + * sequence number advance, if any. + * + * If the error is EPERM the packet got blocked by the + * local firewall. Normally we should terminate the + * connection but the blocking may have been spurious + * due to a firewall reconfiguration cycle. So we treat + * it like a packet loss and let the retransmit timer and + * timeouts do their work over time. + * XXX: It is a POLA question whether calling tcp_drop right + * away would be the really correct behavior instead. */ - if (path_mtu_discovery) - ip->ip_off |= IP_DF; + if (((tp->t_flags & TF_FORCEDATA) == 0 || + !tcp_timer_active(tp, TT_PERSIST)) && + ((flags & TH_SYN) == 0) && + (error != EPERM)) { + if (tos->sack_rxmit) { + p->rxmit -= len; + tp->sackhint.sack_bytes_rexmit -= len; + KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, + ("sackhint bytes rtx >= 0")); + } else + tp->snd_nxt -= len; + } +} - error = ip_output(m, tp->t_inpcb->inp_options, NULL, - ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, - tp->t_inpcb); - } - if (error) { +/* + * Tcp output routine: figure out what should be sent and send it. + */ +int +tcp_output(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + long len; + int flags, error; + struct mbuf *m; + struct tcpopt to; + struct tcp_output_state tos; + +#ifdef INET6 + tos.isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; +#endif + INP_LOCK_ASSERT(tp->t_inpcb); + tos.tso = 0; + /* + * Determine length of data that should be transmitted, + * and flags that will be used. + * If there is some data or critical controls (SYN, RST) + * to send, then transmit; otherwise, investigate further. 
+ */ + tos.idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); + if (tos.idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { /* - * We know that the packet was lost, so back out the - * sequence number advance, if any. + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. * - * If the error is EPERM the packet got blocked by the - * local firewall. Normally we should terminate the - * connection but the blocking may have been spurious - * due to a firewall reconfiguration cycle. So we treat - * it like a packet loss and let the retransmit timer and - * timeouts do their work over time. - * XXX: It is a POLA question whether calling tcp_drop right - * away would be the really correct behavior instead. + * Set the slow-start flight size depending on whether + * this is a local network or not. */ - if (((tp->t_flags & TF_FORCEDATA) == 0 || - !tcp_timer_active(tp, TT_PERSIST)) && - ((flags & TH_SYN) == 0) && - (error != EPERM)) { - if (sack_rxmit) { - p->rxmit -= len; - tp->sackhint.sack_bytes_rexmit -= len; - KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, - ("sackhint bytes rtx >= 0")); - } else - tp->snd_nxt -= len; + int ss = ss_fltsz; +#ifdef INET6 + if (tos.isipv6) { + if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) + ss = ss_fltsz_local; + } else +#endif /* INET6 */ + if (in_localaddr(tp->t_inpcb->inp_faddr)) + ss = ss_fltsz_local; + tp->snd_cwnd = tp->t_maxseg * ss; + } + tp->t_flags &= ~TF_LASTIDLE; + if (tos.idle) { + if (tp->t_flags & TF_MORETOCOME) { + tp->t_flags |= TF_LASTIDLE; + tos.idle = 0; + } + } +again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. 
+ */ + if ((tp->t_flags & TF_SACK_PERMIT) && + SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); + tos.sendalot = 0; + tos.off = tp->snd_nxt - tp->snd_una; + tos.sendwin = min(tp->snd_wnd, tp->snd_cwnd); + tos.sendwin = min(tos.sendwin, tp->snd_bwnd); + + flags = tcp_outflags[tp->t_state]; + /* + * Send any SACK-generated retransmissions. If we're explicitly trying + * to send out new data (when sendalot is 1), bypass this function. + * If we retransmit in fast recovery mode, decrement snd_cwnd, since + * we're replacing a (future) new transmission with a retransmission + * now, and we previously incremented snd_cwnd in tcp_input(). + */ + /* + * Still in sack recovery , reset rxmit flag to zero. + */ + tos.sack_rxmit = 0; + tos.sack_bytes_rxmt = 0; + len = 0; + tos.p = NULL; + if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && + (tos.p = tcp_sack_output(tp, &tos.sack_bytes_rxmt))) + len = tcp_output_sack_fast_recovery(tp, &tos); + + /* + * Get standard flags, and add SYN or FIN if requested by 'hidden' + * state flags. + */ + if (tp->t_flags & TF_NEEDFIN) + flags |= TH_FIN; + if (tp->t_flags & TF_NEEDSYN) + flags |= TH_SYN; + + SOCKBUF_LOCK(&so->so_snd); + /* + * If in persist timeout with window of 0, send 1 byte. + * Otherwise, if window is small but nonzero + * and timer expired, we will send what we can + * and go to transmit state. + */ + if (tp->t_flags & TF_FORCEDATA) + tcp_output_check_force_byte(so, &flags, &tos); + + if (tos.sack_rxmit == 0) + len = tcp_output_sack_rxmit_bytes(so, &tos); + + /* + * Lop off SYN bit if it has already been sent. However, if this + * is SYN-SENT state and if segment contains data and if we don't + * know that foreign host supports TAO, suppress sending segment. + */ + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + if (tp->t_state != TCPS_SYN_RECEIVED) + flags &= ~TH_SYN; + tos.off--, len++; + } + + /* + * Be careful not to send data and/or FIN on SYN segments. 
+ * This measure is needed to prevent interoperability problems + * with not fully conformant TCP implementations. + */ + if ((flags & TH_SYN) && (tp->t_flags & TF_NOOPT)) { + len = 0; + flags &= ~TH_FIN; + } + + if (len < 0) { + /* + * If FIN has been sent but not acked, + * but we haven't been called to retransmit, + * len will be < 0. Otherwise, window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + len = 0; + if (tos.sendwin == 0) { + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rxtshift = 0; + tp->snd_nxt = tp->snd_una; + if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_setpersist(tp); } + } + + /* len will be >= 0 after this point. */ + KASSERT(len >= 0, ("%s: len < 0", __func__)); + + if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) + tcp_output_socket_autosize(so, &tos); + + if (len > tp->t_maxseg) + len = tcp_output_check_tso(tp, len, &tos); + + if (tos.sack_rxmit) { + if (SEQ_LT(tos.p->rxmit + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + } else { + if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + flags &= ~TH_FIN; + } + + tos.recwin = sbspace(&so->so_rcv); + + if (len && tcp_output_silly_window_check(so, len, &tos)) + goto send; + + /* + * Compare available window to amount of window + * known to peer (as advertised window less + * next expected input). If the difference is at least two + * max size segments, or at least 50% of the maximum possible + * window, then want to send a window update to peer. + * Skip this if the connection is in T/TCP half-open state. + * Don't send pure window updates when the peer has closed + * the connection and won't ever send more data. 
+ */ + if (tos.recwin > 0 && !(tp->t_flags & TF_NEEDSYN) && + !TCPS_HAVERCVDFIN(tp->t_state)) { + /* + * "adv" is the amount we can increase the window, + * taking into account that we are limited by + * TCP_MAXWIN << tp->rcv_scale. + */ + long adv = min(tos.recwin, (long)TCP_MAXWIN << tp->rcv_scale) - + (tp->rcv_adv - tp->rcv_nxt); + + if (adv >= (long) (2 * tp->t_maxseg)) + goto send; + if (2 * adv >= (long) so->so_rcv.sb_hiwat) + goto send; + } + + /* + * Send if we owe the peer an ACK, RST, SYN, or urgent data. ACKNOW + * is also a catch-all for the retransmit timer timeout case. + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if ((flags & TH_RST) || + ((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) + goto send; + if (SEQ_GT(tp->snd_up, tp->snd_una)) + goto send; + /* + * If our state indicates that FIN should be sent + * and we have not yet done so, then we need to send. + */ + if (flags & TH_FIN && + ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) + goto send; + /* + * In SACK, it is possible for tcp_output to fail to send a segment + * after the retransmission timer has been turned off. Make sure + * that the retransmission timer is set. + */ + if ((tp->t_flags & TF_SACK_PERMIT) && + SEQ_GT(tp->snd_max, tp->snd_una) && + !tcp_timer_active(tp, TT_REXMT) && + !tcp_timer_active(tp, TT_PERSIST)) { + tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + goto just_return; + } + + tcp_output_check_persist(so); + + /* + * No reason to send a segment, just return. + */ +just_return: + SOCKBUF_UNLOCK(&so->so_snd); + return (0); + +send: + SOCKBUF_LOCK_ASSERT(&so->so_snd); + /* + * Before ESTABLISHED, force sending of initial options + * unless TCP set not to do any options. + * NOTE: we assume that the IP/TCP header plus TCP options + * always fit in a single mbuf, leaving room for a maximum + * link header, i.e. 
+ * max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES + */ + tos.optlen = 0; + +#ifdef INET6 + if (tos.isipv6) + tos.hdrlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); + else +#endif + tos.hdrlen = sizeof (struct tcpiphdr); + + /* + * Compute options for segment. + * We only have to care about SYN and established connection + * segments. Options for SYN-ACK segments are handled in TCP + * syncache. + */ + if ((tp->t_flags & TF_NOOPT) == 0) { + tcp_output_calculate_options(tp, flags, &to); + + /* Processing the options. */ + tos.hdrlen += tos.optlen = tcp_addoptions(&to, (u_char *)&tos.opt); + +#ifdef TCP_SIGNATURE + tos.sigoff = to.to_signature - tos.opt; /* signature offset relative to the start of tos.opt */ +#endif /* TCP_SIGNATURE */ + } +#ifdef TCP_SIGNATURE + else + tos.sigoff = 0; +#endif /* TCP_SIGNATURE */ + + len = tcp_output_add_hdrlen(tp, len, &flags, &tos); + + if (len) { + error = tcp_output_xfer_send(so, len, &flags, &m, &tos); + SOCKBUF_UNLOCK(&so->so_snd); + if (error) + goto out; + } else { + SOCKBUF_UNLOCK(&so->so_snd); + if (tp->t_flags & TF_ACKNOW) + tcpstat.tcps_sndacks++; + else if (flags & (TH_SYN|TH_FIN|TH_RST)) + tcpstat.tcps_sndctrl++; + else if (SEQ_GT(tp->snd_up, tp->snd_una)) + tcpstat.tcps_sndurg++; + else + tcpstat.tcps_sndwinup++; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == NULL) { + error = ENOBUFS; + goto out; + } +#ifdef INET6 + if (tos.isipv6 && (MHLEN < tos.hdrlen + max_linkhdr) && + MHLEN >= tos.hdrlen) { + MH_ALIGN(m, tos.hdrlen); + } else +#endif + m->m_data += max_linkhdr; + m->m_len = tos.hdrlen; + } + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); + m->m_pkthdr.rcvif = (struct ifnet *)0; +#ifdef MAC + mac_create_mbuf_from_inpcb(tp->t_inpcb, m); +#endif + tcp_output_setup_headers(so, m, len, flags, &tos); + + /* + * In transmit state, time the transmission and arrange for + * the retransmit. In persist state, just set snd_max.
+ */ + if ((tp->t_flags & TF_FORCEDATA) == 0 || + !tcp_timer_active(tp, TT_PERSIST)) { + tcp_output_setup_xmit_timer(tp, len, flags, &tos); + } else { + /* + * Persist case, update snd_max but since we are in + * persist mode (no window) we do not update snd_nxt. + */ + int xlen = len; + if (flags & TH_SYN) + ++xlen; + if (flags & TH_FIN) { + ++xlen; + tp->t_flags |= TF_SENTFIN; + } + if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) + tp->snd_max = tp->snd_nxt + len; + } + +#ifdef TCPDEBUG + /* + * Trace. + */ + if (so->so_options & SO_DEBUG) + tcp_output_trace_pkt(tp, m, &tos); +#endif + + error = tcp_output_ip_xmit(so, m, &tos); + + if (error) { + tcp_output_handle_ip_xmit_fail(tp, len, error, flags, &tos); out: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ - switch (error) { - case EPERM: - tp->t_softerror = error; - return (error); - case ENOBUFS: - if (!tcp_timer_active(tp, TT_REXMT) && - !tcp_timer_active(tp, TT_PERSIST)) - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); - tp->snd_cwnd = tp->t_maxseg; - return (0); - case EMSGSIZE: - /* - * For some reason the interface we used initially - * to send segments changed to another or lowered - * its MTU. - * - * tcp_mtudisc() will find out the new MTU and as - * its last action, initiate retransmission, so it - * is important to not do so here. - * - * If TSO was active we either got an interface - * without TSO capabilits or TSO was turned off. - * Disable it for this connection as too and - * immediatly retry with MSS sized segments generated - * by this function.
- */ - if (tso) - tp->t_flags &= ~TF_TSO; - tcp_mtudisc(tp->t_inpcb, 0); - return (0); - case EHOSTDOWN: - case EHOSTUNREACH: - case ENETDOWN: - case ENETUNREACH: - if (TCPS_HAVERCVDSYN(tp->t_state)) { - tp->t_softerror = error; - return (0); - } - /* FALLTHROUGH */ - default: - return (error); - } + return tcp_output_handle_error(tp, error, flags, &tos); } tcpstat.tcps_sndtotal++; @@ -1208,23 +1379,14 @@ * then remember the size of the advertised window. * Any pending ACK has now been sent. */ - if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, tp->rcv_adv)) - tp->rcv_adv = tp->rcv_nxt + recwin; + if (tos.recwin > 0 && SEQ_GT(tp->rcv_nxt + tos.recwin, tp->rcv_adv)) + tp->rcv_adv = tp->rcv_nxt + tos.recwin; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); if (tcp_timer_active(tp, TT_DELACK)) tcp_timer_activate(tp, TT_DELACK, 0); -#if 0 - /* - * This completely breaks TCP if newreno is turned on. What happens - * is that if delayed-acks are turned on on the receiver, this code - * on the transmitter effectively destroys the TCP window, forcing - * it to four packets (1.5Kx4 = 6K window). - */ - if (sendalot && (!tcp_do_newreno || --maxburst)) - goto again; -#endif - if (sendalot) + + if (tos.sendalot) goto again; return (0); }