Only in netinet.new: RCS Only in netinet.new: drop Only in netinet.new: drop.c Only in netinet.new: in_rmx.c.21R Only in netinet.orig: in_rmx.c.orig Only in netinet.new: ip_fw.c.21R Only in netinet.orig: ip_fw.c.orig Only in netinet.new: ip_fw.c.stable Only in netinet.new: ip_fw.h.21R Only in netinet.new: ip_fw.h.stable Only in netinet.new: ip_fwdef.c.orig diff -cbwr netinet.orig/tcp.h netinet.new/tcp.h *** netinet.orig/tcp.h Mon May 29 19:09:53 1995 --- netinet.new/tcp.h Sat Aug 3 12:49:06 1996 *************** *** 80,90 **** --- 80,95 ---- #define TCPOPT_SACK_PERMITTED 4 /* Experimental */ #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 /* Experimental */ + #define TCPOLEN_SACK 8 /*2*sizeof(tcp_seq):len of sack blk */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_TSTAMP_HDR \ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) + + #define TCPOPT_SACK_PERMIT_HDR \ + (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED) + #define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8) #define TCPOPT_CC 11 /* CC options: RFC-1644 */ #define TCPOPT_CCNEW 12 diff -cbwr netinet.orig/tcp_input.c netinet.new/tcp_input.c *** netinet.orig/tcp_input.c Wed Aug 23 16:52:06 1995 --- netinet.new/tcp_input.c Tue Aug 27 15:39:38 1996 *************** *** 64,70 **** struct tcpiphdr tcp_saveti; #endif ! int tcprexmtthresh = 3; tcp_seq tcp_iss; tcp_cc tcp_ccgen; struct tcpstat tcpstat; --- 64,71 ---- struct tcpiphdr tcp_saveti; #endif ! #define TCP_REXMTTHRESH 3 ! static int tcprexmtthresh = TCP_REXMTTHRESH; tcp_seq tcp_iss; tcp_cc tcp_ccgen; struct tcpstat tcpstat; *************** *** 85,115 **** * when segments are out of order (so fast retransmit can work). */ #ifdef TCP_ACK_HACK ! #define TCP_REASS(tp, ti, m, so, flags) { \ ! if ((ti)->ti_seq == (tp)->rcv_nxt && \ ! (tp)->seg_next == (struct tcpiphdr *)(tp) && \ ! 
(tp)->t_state == TCPS_ESTABLISHED) { \ ! if (ti->ti_flags & TH_PUSH) \ ! tp->t_flags |= TF_ACKNOW; \ ! else \ ! tp->t_flags |= TF_DELACK; \ ! (tp)->rcv_nxt += (ti)->ti_len; \ ! flags = (ti)->ti_flags & TH_FIN; \ ! tcpstat.tcps_rcvpack++;\ ! tcpstat.tcps_rcvbyte += (ti)->ti_len;\ ! sbappend(&(so)->so_rcv, (m)); \ ! sorwakeup(so); \ ! } else { \ ! (flags) = tcp_reass((tp), (ti), (m)); \ ! tp->t_flags |= TF_ACKNOW; \ ! } \ ! } #else #define TCP_REASS(tp, ti, m, so, flags) { \ if ((ti)->ti_seq == (tp)->rcv_nxt && \ (tp)->seg_next == (struct tcpiphdr *)(tp) && \ (tp)->t_state == TCPS_ESTABLISHED) { \ ! tp->t_flags |= TF_DELACK; \ (tp)->rcv_nxt += (ti)->ti_len; \ flags = (ti)->ti_flags & TH_FIN; \ tcpstat.tcps_rcvpack++;\ --- 86,101 ---- * when segments are out of order (so fast retransmit can work). */ #ifdef TCP_ACK_HACK ! #define TCP_ACK_FLAG ((ti->ti_flags & TH_PUSH) ? TF_ACKNOW : TF_DELACK) #else + #define TCP_ACK_FLAG TF_DELACK + #endif + #define TCP_REASS(tp, ti, m, so, flags) { \ if ((ti)->ti_seq == (tp)->rcv_nxt && \ (tp)->seg_next == (struct tcpiphdr *)(tp) && \ (tp)->t_state == TCPS_ESTABLISHED) { \ ! tp->t_flags |= TCP_ACK_FLAG; \ (tp)->rcv_nxt += (ti)->ti_len; \ flags = (ti)->ti_flags & TH_FIN; \ tcpstat.tcps_rcvpack++;\ *************** *** 121,127 **** tp->t_flags |= TF_ACKNOW; \ } \ } ! #endif #ifndef TUBA_INCLUDE int --- 107,113 ---- tp->t_flags |= TF_ACKNOW; \ } \ } ! #ifndef TUBA_INCLUDE int *************** *** 163,168 **** --- 149,155 ---- if (i >= ti->ti_len) { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += ti->ti_len; + tp->rx_dup += ti->ti_len; m_freem(m); /* * Try to present any queued data *************** *** 232,237 **** --- 219,311 ---- return (flags); } + #ifdef TSACK /* checked feb 96 -- ok */ + /* + * Look for a valid TSACK and possibly read the actual timestamp from the + * sent[] table. The TSACK is processed after ack processing. 
+ */ + static + void + tsack_check(tp, to) + struct tcpcb *tp; + struct tcpopt *to; + { + if (tp->se_ptr && (to->to_flag & TOF_TS)) { + int id=to->to_tsecr % tp->se_len; + if (tp->se_ptr[id].id == to->to_tsecr) { + to->to_tsecr=tp->se_ptr[id].tv; + /* tp->se_ptr[id].tv=0; *//* mark as TSACKed. debugging */ + tp->se_curr=id; + } else { + /* + * did not find a matching TSACK. Mark the entry + * in the table and the timestamp as invalid. + */ + tp->se_curr=tp->se_len; + to->to_flag &= ~TOF_TS; + } + } + } + #endif /* TSACK */ + + #ifdef TSACK /* checked feb 96 - ok */ + /* + * this implements the new alg. for "which timestamp to echo" + */ + void + tcp_out(struct tcpcb *tp, struct tcpopt* to) + { + u_long tmp=tp->ts_recent; + if ((tcp_do_sack & TCP_TSACK_ENABLE) && (to->to_flag & TOF_TS) && + !(tp->t_flags & TF_DELACK) && + TSTMP_LT(tmp, to->to_tsval)) + tp->ts_recent=to->to_tsval; + (void) tcp_output(tp); + tp->ts_recent=tmp; + } + #else + #define tcp_out(tp, to) tcp_output(tp) + #endif + + #ifdef SACK /* XXX debugging only */ + /* this causes some artificial losses */ + int + tcp_dropit(struct tcpcb *tp, struct tcpiphdr *ti) + { + static char * losspattern= + /* ".....+....1....+....2....+....3....+....4....+...." */ + "...........a...d.x......a...........d.....d.....a." + ".d........................a.......d..............d" + "..dd..d....d..d..................a...............d" + "......d........a.a...dd.a........................." + "d..d.......d.d...x.d...aa.d.x....................." + "..............................a.dd...............a" + ".....d....a..d........d.d.d................d.d...." + "...da...da......a...d...d.....d...d.....d........." + "d...........a..d....d.....d...x..a................" + "....d......d..d...........a.....d.ad.....d.a......" 
+ + ; + static lossplen=499; + int p, c; + p=tp->loss = (tp->loss+1) % lossplen; + c= losspattern[tp->loss]; + if ( (tcp_do_sack & TCP_LOSE) && + ( ( c=='x' /* unconditional drop */ || + (c=='d' && ti->ti_len !=0) /* drop data */ || + (c=='a' && ti->ti_len ==0) /* drop ack */) + ) ) { + #if 0 + printf("drop tp 0x%08x nxt 0x%08x (+%6d %d) ack %d\n", + tp, tp->rcv_nxt, ti->ti_seq - tp->rcv_nxt, + ti->ti_len, ti->ti_ack - tp->iss); + #endif + return 1; + } else + return 0; + } + #endif + /* * TCP input routine, follows pages 65-76 of the * protocol specification dated September, 1981 very closely. *************** *** 381,386 **** --- 455,466 ---- if (tp->t_state == TCPS_CLOSED) goto drop; + if (tcp_dropit(tp, ti)) + goto drop; + #ifdef TSACK /* checked feb 96 - ok */ + tsack_check(tp, &to); + #endif + tp->numpkt++; /* rx pkt since last timeout */ /* Unscale the window into a 32-bit value. */ if ((tiflags & TH_SYN) == 0) tiwin = ti->ti_win << tp->snd_scale; *************** *** 493,502 **** --- 573,585 ---- if (ti->ti_len == 0) { if (SEQ_GT(ti->ti_ack, tp->snd_una) && SEQ_LEQ(ti->ti_ack, tp->snd_max) && + /* next test is to make sure cwnd hasn't changed */ + !(tp->t_flags & TF_FAST_RXMT) && tp->snd_cwnd >= tp->snd_wnd) { /* * this is a pure ack for outstanding data. */ + tp->t_dupacks = 0; ++tcpstat.tcps_predack; if ((to.to_flag & TOF_TS) != 0) tcp_xmit_timer(tp, *************** *** 528,534 **** if (so->so_snd.sb_flags & SB_NOTIFY) sowwakeup(so); if (so->so_snd.sb_cc) ! (void) tcp_output(tp); return; } } else if (ti->ti_ack == tp->snd_una && --- 611,617 ---- if (so->so_snd.sb_flags & SB_NOTIFY) sowwakeup(so); if (so->so_snd.sb_cc) ! (void) tcp_out(tp, &to); return; } } else if (ti->ti_ack == tp->snd_una && *************** *** 548,569 **** */ sbappend(&so->so_rcv, m); sorwakeup(so); #ifdef TCP_ACK_HACK /* ! * If this is a short packet, then ACK now - with Nagel * congestion avoidance sender won't send more until * he gets an ACK. 
*/ if (tiflags & TH_PUSH) { tp->t_flags |= TF_ACKNOW; tcp_output(tp); ! } else { ! tp->t_flags |= TF_DELACK; ! } ! #else ! tp->t_flags |= TF_DELACK; #endif return; } } --- 631,655 ---- */ sbappend(&so->so_rcv, m); sorwakeup(so); + #if SACK && TSACK /* checked feb 96 - ok */ + tcp_do_tsack(tp); /* process TSACK */ + #endif #ifdef TCP_ACK_HACK /* ! * If this is a short packet, then ACK now - with Nagle * congestion avoidance sender won't send more until * he gets an ACK. */ if (tiflags & TH_PUSH) { tp->t_flags |= TF_ACKNOW; tcp_output(tp); ! } else #endif + tp->t_flags |= TF_DELACK; return; + /* With in-sequence packets we don't ack until + * a fasttimeout or some user event. + */ } } *************** *** 942,947 **** --- 1028,1034 ---- } else { tcpstat.tcps_rcvduppack++; tcpstat.tcps_rcvdupbyte += ti->ti_len; + tp->rx_dup += ti->ti_len; tcpstat.tcps_pawsdrop++; goto dropafterack; } *************** *** 993,998 **** --- 1080,1086 ---- tcpstat.tcps_rcvpartduppack++; tcpstat.tcps_rcvpartdupbyte += todrop; } + tp->rx_dup += todrop; m_adj(m, todrop); ti->ti_seq += todrop; ti->ti_len -= todrop; *************** *** 1193,1198 **** --- 1281,1317 ---- case TCPS_TIME_WAIT: if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) { + tcprexmtthresh = TCP_REXMTTHRESH; + #ifdef SACK /* checked feb 96 - ok */ + #ifdef TSACK + tcp_do_tsack(tp); /* do I have a tsack ? */ + #endif + /* + * This is a dup ack. Count how many outstanding + * segments I have, to decide when to start fast + * retransmit/recovery. Don't bother if already + * in fast recovery (TF_FAST_RXMT set) or using the + * standard algorithms. 
+ */ + if (!(tp->t_flags & TF_FAST_RXMT) && + tcp_do_sack & TCP_MOD_FASTRETRANS) { + tcp_seq pend; + pend= (tp->snd_nxt - tp->snd_una); + if (tp->sa_ptr && tp->sa_head) { + struct tcp_sacked_el *p; + for (p=tp->sa_head; + p && SEQ_LEQ(p->end, tp->snd_nxt); + p=p->next) + pend -= p->end -p->beg; + if (p) + pend -= tp->snd_nxt - p->beg; + } + pend=(pend+tp->t_maxseg-1)/ tp->t_maxseg -1; + if (pend<=0) pend=1; + if (pend < tcprexmtthresh) + tcprexmtthresh=pend; + } + #endif /* SACK */ if (ti->ti_len == 0 && tiwin == tp->snd_wnd) { tcpstat.tcps_rcvdupack++; /* *************** *** 1219,1263 **** * to keep a constant cwnd packets in the * network. */ ! if (tp->t_timer[TCPT_REXMT] == 0 || ! ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; ! else if (++tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; ! (void) tcp_output(tp); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; ! } else if (tp->t_dupacks > tcprexmtthresh) { tp->snd_cwnd += tp->t_maxseg; ! (void) tcp_output(tp); goto drop; } ! } else ! tp->t_dupacks = 0; ! break; ! } /* ! * If the congestion window was inflated to account ! * for the other side's cached packets, retract it. */ ! if (tp->t_dupacks > tcprexmtthresh && ! tp->snd_cwnd > tp->snd_ssthresh) ! tp->snd_cwnd = tp->snd_ssthresh; tp->t_dupacks = 0; if (SEQ_GT(ti->ti_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; --- 1338,1403 ---- * to keep a constant cwnd packets in the * network. */ ! if (tp->t_timer[TCPT_REXMT] == 0) { ! /* no outstanding data */ tp->t_dupacks = 0; ! tp->t_flags &= ~TF_FAST_RXMT; ! /* tcp_sa_clean(tp); */ ! } else if (ti->ti_ack != tp->snd_una) ! /* This an old ack. Ignore it */ ; ! 
else if (++tp->t_dupacks >= tcprexmtthresh && ! ! (tp->t_flags & TF_FAST_RXMT)) { tcp_seq onxt = tp->snd_nxt; u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + tcpstat.tcps_fastretransmit++; + if (tp->t_dupacks <= 3) + tcpstat.tcps_fastrexmt[tp->t_dupacks-1]++; + tp->snd_max_rxmt = tp->snd_max; if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_timer[TCPT_REXMT] = 0; tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; + tp->snd_rxmt = tp->snd_nxt; + printf("-- fastretrans tp 0x%08x una %6d\n", tp, tp->snd_nxt - tp->iss); + #if 0 && SACK + /* this is part of the FACK alg. */ + if (tp->sa_ptr && tp->sa_head) + tp->snd_cwnd= tp->sa_tail->end - + tp->snd_una; + else + #endif tp->snd_cwnd = tp->t_maxseg; ! tp->t_flags |= TF_FAST_RXMT; ! (void) tcp_out(tp, &to); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; ! } else if (tp->t_flags & TF_FAST_RXMT) { tp->snd_cwnd += tp->t_maxseg; ! tcpstat.tcps_fastrecovery++; ! (void) tcp_out(tp, &to); goto drop; } ! } else { /* ! * ack with data or window reopen. Should not ! * reset t_dupacks. However it's a rare event, ! * so it doesn't matter so much. */ ! if (ti->ti_len==0 && tp->t_dupacks !=0) ! tcpstat.tcps_zerodupw++; tp->t_dupacks = 0; + tp->t_flags &= ~TF_FAST_RXMT; + } + break; /* continues from step6 */ + } if (SEQ_GT(ti->ti_ack, tp->snd_max)) { tcpstat.tcps_rcvacktoomuch++; goto dropafterack; *************** *** 1266,1271 **** --- 1406,1423 ---- * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ + /* + * If the congestion window was inflated to account + * for the other side's cached packets, retract it. 
+ */ + if ( (tp->t_flags & TF_FAST_RXMT) && + tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + #if SACK + tcp_sa_trim(tp); + if (!tp->sa_ptr || tp->sa_head==NULL) + #endif + tp->t_dupacks = 0; /* XXX moved from before */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our *************** *** 1307,1312 **** --- 1459,1470 ---- needoutput = 1; } else if (tp->t_timer[TCPT_PERSIST] == 0) tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + if (tcp_do_sack & TCP_NEWRENO && + tp->t_flags & TF_FAST_RXMT && + SEQ_LT((tp->snd_una+acked),tp->snd_max_rxmt) ) { + tp->t_flags |= TF_NEWRENO; + } + tp->t_flags &= ~TF_FAST_RXMT; /* exit fast recovery */ /* * If no data (only SYN) was ACK'd, *************** *** 1507,1512 **** --- 1665,1675 ---- if ((ti->ti_len || (tiflags&TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { TCP_REASS(tp, ti, m, so, tiflags); + #ifdef SACK /* checked aug96 - reasonable */ + /* mark boundaries of last rx seg. */ + tp->rx_beg= ti->ti_seq; + tp->rx_end= ti->ti_seq + ti->ti_len; + #endif /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's *************** *** 1591,1601 **** tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); #endif /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) ! (void) tcp_output(tp); return; dropafterack: --- 1754,1786 ---- tcp_trace(TA_INPUT, ostate, tp, &tcp_saveti, 0); #endif + /* Newreno should not be done (not needed) if SACK are available + * and there is a proper retransmission policy. 
+ */ + if (tp->t_flags & TF_NEWRENO && + tp->sa_head == NULL ) { + tcp_seq onxt = tp->snd_nxt; + + tcpstat.tcps_newreno++; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_rxmt = tp->snd_nxt = tp->snd_una; + printf("-- newreno tp 0x%08x una %6d [%d] -->%d\n", + tp, tp->snd_nxt - tp->iss, tp->snd_max_rxmt - tp->iss, + tp->snd_max - tp->iss); + (void) tcp_out(tp, &to); + tp->t_dupacks = 1; + tp->t_flags |= TF_FAST_RXMT; + tp->t_flags &= ~TF_NEWRENO; + tp->snd_cwnd = tp->snd_ssthresh + + tp->t_maxseg * tp->t_dupacks; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + } /* * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) ! (void) tcp_out(tp, &to); return; dropafterack: *************** *** 1611,1617 **** #endif m_freem(m); tp->t_flags |= TF_ACKNOW; ! (void) tcp_output(tp); return; dropwithreset: --- 1796,1802 ---- #endif m_freem(m); tp->t_flags |= TF_ACKNOW; ! (void) tcp_out(tp, &to); return; dropwithreset: *************** *** 1711,1716 **** --- 1896,1904 ---- bcopy((char *)cp + 6, (char *)&to->to_tsecr, sizeof(to->to_tsecr)); NTOHL(to->to_tsecr); + #ifdef TSACK /* checked feb 96 - ok */ + tsack_check(tp, to); + #endif /* * A timestamp received in a SYN makes *************** *** 1722,1727 **** --- 1910,1991 ---- tp->ts_recent_age = tcp_now; } break; + #ifdef SACK /* XXX checked aug 96 - ok */ + case TCPOPT_SACK_PERMITTED: + if (optlen != TCPOLEN_SACK_PERMITTED) + continue; + if (ti->ti_flags & TH_SYN) { + /* MUST only be set on SYN */ + tp->t_flags |= TF_SACK_PERMIT; + } + { + u_long l_ip, r_ip; + u_short l_p, r_p; + + r_ip = *((u_long *)(&tp->t_inpcb->inp_faddr)); + l_ip = *((u_long *)(&tp->t_inpcb->inp_laddr)); + r_p = (u_long)(tp->t_inpcb->inp_fport); + l_p = (u_long)(tp->t_inpcb->inp_lport); + NTOHL(r_ip); + NTOHL(l_ip); + + printf("== OPT: TF_SACK_PERMIT %s tp 0x%08x " + "%d.%d.%d.%d:%d %d.%d.%d.%d:%d\n", + tp->sa_ptr ? 
"TF_REQ_SACK":"", tp, + (l_ip >>24) & 0xff, (l_ip >>16) & 0xff, + (l_ip >>8) & 0xff, (l_ip) & 0xff, + htons(l_p), + (r_ip >>24) & 0xff, (r_ip >>16) & 0xff, + (r_ip >>8) & 0xff, (r_ip) & 0xff, + htons(r_p) + ); + } + break; + case TCPOPT_SACK: + { + int i, tmp_olen, off, len; + u_char *tmp_cp; + + if (tp->sa_ptr == NULL) + continue; + + /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */ + if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) + continue; + tmp_cp = cp + 2; + tmp_olen = optlen - 2; + while (tmp_olen > 0) { + tcp_seq s_start, s_end; + + bcopy((char *) tmp_cp, (char *) &(s_start), + sizeof(tcp_seq)); + NTOHL(s_start); + bcopy((char *) tmp_cp + sizeof(tcp_seq), + (char *) &(s_end), sizeof(tcp_seq)); + NTOHL(s_end); + tmp_olen -= TCPOLEN_SACK; + tmp_cp += TCPOLEN_SACK; + /* + * Sanity check before processing. Note that + * here ack has not been processed yet. + * Thus, there might be a SACK which is + * behind ti->ti_ack. + */ + if (SEQ_GT(s_end, tp->snd_una) && + SEQ_GT(s_end, ti->ti_ack) && + SEQ_LEQ(s_end, tp->snd_max)) { + #if 0 + printf("SACK%d tp 0x%08x una 0x%08x (+%6d %6d --> %6d) dup %d\n", + (tmp_cp - cp - 2)/TCPOLEN_SACK, + tp, tp->snd_una, s_start - tp->snd_una, s_end - s_start, + tp->snd_nxt - tp->snd_una, tp->t_dupacks); + #endif + tcp_addseg(tp, s_start, s_end); + } + } + break; + } + #endif SACK case TCPOPT_CC: if (optlen != TCPOLEN_CC) continue; diff -cbwr netinet.orig/tcp_output.c netinet.new/tcp_output.c *** netinet.orig/tcp_output.c Thu Sep 14 19:58:07 1995 --- netinet.new/tcp_output.c Tue Aug 27 03:53:04 1996 *************** *** 66,71 **** --- 66,114 ---- extern struct mbuf *m_copypack(); #endif + #if SACK /* XXX check this, especially on SYN */ + /* + * this is used to skip SACKed blocks. + * + * returns the maximum amount of data to can be transmitted (0 means unknown) + */ + static u_long /* XXX maybe tcp_seq ? 
*/ + tcp_skip_sacked(tp) + struct tcpcb *tp; + { + struct tcp_sacked_el *p = tp->sa_head; + tcp_seq n= tp->snd_nxt; + + /* first handle common cases */ + if (p==NULL) + return 0; + if (SEQ_GEQ(n, tp->sa_tail->end)) { + if (tp->t_flags & TF_FAST_RXMT) { + /* XXX during fast retransmit, cannot resend blocks after + * the last SACKed one. + */ + tp->snd_nxt = tp->snd_max; + } + return 0; + } + if (SEQ_LT(n, p->beg)) + return p->beg - n; + + /* + * advance snd_nxt in case it points to a SACKed block + */ + for (; p ; p= p->next) { + if (SEQ_LT(n, p->beg)) + return p->beg - n; + else if (SEQ_LEQ(n, p->end)) { + tp->snd_nxt = p->end; + if (tp->t_flags & TF_FAST_RXMT) + tp->snd_rxmt = tp->snd_nxt; + } + } + return 0; + } + #endif SACK /* * Tcp output routine: figure out what should be sent and send it. *************** *** 85,90 **** --- 128,140 ---- struct rmxp_tao *taop; struct rmxp_tao tao_noncached; + #ifdef SACK + long len1; + static long cycle=0; /* XXX debugging only */ + #endif + #ifdef TSACK /* checked feb 96 - ok */ + u_long *tsptr=NULL; /* marks the position of the timestamp */ + #endif /* * Determine length of data that should be transmitted, * and flags that will be used. *************** *** 99,106 **** --- 149,163 ---- * slow start to get ack "clock" running again. 
*/ tp->snd_cwnd = tp->t_maxseg; + #if SACK + cycle++; /* XXX debugging only */ + tcp_sa_trim(tp); /* cleanup SACKed list */ + #endif again: sendalot = 0; + #if SACK + len1=tcp_skip_sacked(tp); + #endif off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); *************** *** 149,154 **** --- 206,221 ---- len = min(so->so_snd.sb_cc, win) - off; + #if SACK + if (len1) + len=min(len, len1); + #if 0 + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* XXX testing */ + printf("-- tx%6d tp 0x%08x una 0x%08x (+%6u %6u ->%6u) dups %d\n", + cycle, tp, tp->snd_una, tp->snd_nxt-tp->snd_una, + len, tp->snd_max- tp->snd_una, tp->t_dupacks); + #endif + #endif if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { taop = &tao_noncached; bzero(taop, sizeof(*taop)); *************** *** 290,295 **** --- 357,393 ---- /* * No reason to send a segment, just return. */ + #if SACK + /* + * A final attempt before giving up: if we are in fast recovery, + * and snd_rxmt is before the last SACK block (moved there when + * did the fast retransmit), try to send some unSACKed data. 
+ */ + if ( + tp->sa_head && /* have some SACKed data */ + SEQ_LT(tp->snd_rxmt, tp->sa_tail->beg) && /* not last hole */ + tp->t_flags & TF_FAST_RXMT && /* doing fast rxmt */ + tp->snd_nxt == tp->snd_max /* am at the end */ + ) { + printf("--- tp 0x%08x rolling back %u <-- %u --> %u\n", + tp, tp->snd_rxmt - tp->iss, + tp->snd_nxt - tp->iss, tp->snd_max - tp->iss); + tp->snd_nxt = tp->snd_rxmt; + goto again; + } + #endif + #if 0 /* XXX debugging only */ + if (tp->snd_max != tp->snd_una && + tp->t_timer[TCPT_REXMT]==0) + printf("xxx tcp_output tp 0x%08x NOTHING TO SEND len %d win %d cwnd %d " + "una 0x%08x out %6u nxt %6u REXMT %d PERS %d force %d\n", + tp, len, tp->snd_wnd, tp->snd_cwnd, + tp->snd_una, tp->snd_max -tp->snd_una, + tp->snd_nxt-tp->snd_una, + tp->t_timer[TCPT_REXMT], + tp->t_timer[TCPT_PERSIST], + tp->t_force); + #endif return (0); send: *************** *** 314,319 **** --- 412,427 ---- (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; + #ifdef SACK /* checked aug 96 - ok */ + /* generate a SACK_PERMIT on opening */ + if (tp->sa_ptr && + ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_SACK_PERMIT))) { + *((u_long *) (opt + optlen)) = + htonl(TCPOPT_SACK_PERMIT_HDR); + optlen += 4; + } + #endif if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { *************** *** 340,345 **** --- 448,456 ---- /* Form timestamp option as shown in appendix A of RFC 1323. */ *lp++ = htonl(TCPOPT_TSTAMP_HDR); + #ifdef TSACK /* checked feb 96 - ok */ + tsptr=lp; /* record the position of the timestamp */ + #endif *lp++ = htonl(tcp_now); *lp = htonl(tp->ts_recent); optlen += TCPOLEN_TSTAMP_APPA; *************** *** 423,428 **** --- 534,578 ---- } } + #ifdef SACK + /* + * Send SACKs if both sides want it and there are out-of-seq data. + * This should be the last option processed. 
+ */ + if ((tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && + tp->sa_ptr && + tp->t_state == TCPS_ESTABLISHED && tcp_generate_sacks(tp)) { + u_long *lp = (u_long *) (opt + optlen); + u_long *olp = lp++; + int i; + + int count = 0; /* actual number of SACKs inserted (<= 3) */ + int maxsack = (TCP_MAXOLEN - (optlen + 4))/TCPOLEN_SACK; + + maxsack = min(maxsack, TCP_MAX_SACK); + maxsack = min(maxsack, tp->sa_rx_count); + #if 0 + if (maxsack) + printf("++ gen_SACK_block tp 0x%08x -- " + "%d/%d blocks (rcv_nxt= 0x%08x)\n", + tp, tp->sa_rx_count, maxsack, tp->rcv_nxt); + #endif + for (i=0; i < tp->sa_rx_count /* maxsack */; i++) { + #if 0 + printf(" ++ 0x%08x (+%6u %6u)\n", + tp->sa_rx[i].beg, + tp->sa_rx[i].beg - tp->rcv_nxt, + tp->sa_rx[i].end - tp->sa_rx[i].beg); + #endif + if (i < maxsack) { + *lp++ = htonl(tp->sa_rx[i].beg); + *lp++ = htonl(tp->sa_rx[i].end); + } + } + *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*maxsack+2)); + optlen += TCPOLEN_SACK*maxsack + 4; /* including leading NOPs */ + } + #endif SACK hdrlen += optlen; /* *************** *** 457,462 **** --- 607,613 ---- else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { tcpstat.tcps_sndrexmitpack++; tcpstat.tcps_sndrexmitbyte += len; + tp->tx_dup += len; } else { tcpstat.tcps_sndpack++; tcpstat.tcps_sndbyte += len; *************** *** 551,556 **** --- 702,728 ---- else ti->ti_seq = htonl(tp->snd_max); ti->ti_ack = htonl(tp->rcv_nxt); + #ifdef TSACK /* checked feb 96 - ok */ + /*** TSACK handling. ***/ + if (tp->se_ptr && tsptr) { + /* TSACK generation. 
id points to a (possibly free) slot */ + + struct tcp_sent_el *p; + p= &(tp->se_ptr[tp->id % tp->se_len]); + + *tsptr = htonl(tp->id); /* send a valid timestamp anyways */ + tsptr=NULL; + if (SEQ_LEQ(p->end, tp->snd_una)) { + /* the slot is free, fill it */ + p->id=tp->id++; + p->tv= tcp_now; /* could also look in *tsptr */ + p->beg= ntohl(ti->ti_seq); + p->end=p->beg+ len; + } + else + p->id = 0; /* clean entry to avoid wrong estimates */ + } + #endif /* TSACK */ if (optlen) { (void)memcpy(ti + 1, opt, optlen); ti->ti_off = (sizeof (struct tcphdr) + optlen) >> 2; *************** *** 607,612 **** --- 779,789 ---- } } tp->snd_nxt += len; + #if SACK + if ( tp->snd_rxmt == startseq ) + /* mark segment as retransmitted */ + tp->snd_rxmt = tp->snd_nxt; + #endif if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; /* *************** *** 664,669 **** --- 841,851 ---- ((struct ip *)ti)->ip_len = m->m_pkthdr.len; ((struct ip *)ti)->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; /* XXX */ ((struct ip *)ti)->ip_tos = tp->t_inpcb->inp_ip.ip_tos; /* XXX */ + #if LOSSY /* code to provide artificial losses */ + /* determine if the packet should be discarded, in case + * set error=0 and do an m_freem(m); + */ + #endif #if BSD >= 43 error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, so->so_options & SO_DONTROUTE, 0); diff -cbwr netinet.orig/tcp_subr.c netinet.new/tcp_subr.c *** netinet.orig/tcp_subr.c Sat Jul 22 16:04:31 1995 --- netinet.new/tcp_subr.c Tue Aug 27 15:39:55 1996 *************** *** 69,74 **** --- 69,82 ---- int tcp_mssdflt = TCP_MSS; int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; int tcp_do_rfc1323 = 1; + int tcp_do_sack = + TCP_SACK_ENABLE | + TCP_TSACK_ENABLE | + (3 & TCP_SACK_MASK) | /* SACK lifetime */ + TCP_MOD_FASTRETRANS| + TCP_NEWRENO | + /* TCP_LOSE | /* loose packets */ + 0; int tcp_do_rfc1644 = 1; static void tcp_cleartaocache(void); *************** *** 220,225 **** --- 228,255 ---- (void) ip_output(m, NULL, ro, 0, NULL); } + #ifdef 
TSACK /* create a sent[] table to implement TSACK */ + void + tcp_newsentlist(tp) + struct tcpcb *tp; + { + #ifndef SENT_LEN + #define SENT_LEN 16 + #endif + int i; + + tp->se_ptr= malloc(SENT_LEN*sizeof(struct tcp_sent_el),M_PCB,M_NOWAIT); + if (tp->se_ptr==NULL) + return; + bzero((char *) tp->se_ptr, SENT_LEN*sizeof(struct tcp_sent_el)); + for (i=0; i< SENT_LEN; i++) + /* should really be initialized later */ + tp->se_ptr[i].beg=tp->se_ptr[i].end=tcp_iss; /* XXX */ + tp->se_len=SENT_LEN; + tp->id=0; + } + #endif /* TSACK */ + /* * Create a new TCP control block, making an * empty reassembly queue and hooking it to the argument *************** *** 258,266 **** --- 288,712 ---- tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; inp->inp_ip.ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; + #ifdef TSACK + if (tcp_do_sack & TCP_TSACK_ENABLE) + tcp_newsentlist(tp); + #endif /* TSACK */ + #ifdef SACK + if (tcp_do_sack & TCP_SACK_ENABLE) { + struct tcp_sacked_el *tcp_newsacklist(); + + tp->sa_ptr=tp->sa_free=tcp_newsacklist(); + } + #endif SACK return (tp); } + + #ifdef SACK + /* + * creates a list, makes a linked list of free items, the first + * being the head of the list. + */ + struct tcp_sacked_el * + tcp_newsacklist() + { + int i; + struct tcp_sacked_el *t; + + #ifndef SACKED_LEN + #define SACKED_LEN 10 + #endif + t=(struct tcp_sacked_el *)malloc(SACKED_LEN*sizeof(*t),M_PCB,M_NOWAIT); + if (t==NULL) + return t; + t->next=NULL; + for (i=0; i< SACKED_LEN - 1; i++) + t[i].next = &(t[i+1]); + t[i].next = NULL; + return t; + } + + /* + * The following procedures are used to build the list of SACKed + * blocks. The main entry is tcp_addseg, and it uses auxiliary procedures + * to trim the list, merge entries, add segments. + */ + + /* + * tcp_sa_timeout handles timeouts. Ages blocks, disposing them if + * too old (or already acked). The lifetime of SACK info is written + * in tcp_do_sack. 
Note that the draft standard specifies a lifetime + * of 0, but a longer lifetime is better for the net (reneging is very + * unlikely). + */ + void + tcp_sa_timeout(tp) + struct tcpcb *tp; + { + struct tcp_sacked_el *tmp, *p, *l; + + if (tp->sa_ptr==NULL || tp->sa_head==NULL) + return; /* either empty or unsupported sack */ + for (p=NULL,l=tp->sa_head; l ;) + if (++l->age >= (tcp_do_sack & TCP_SACK_MASK) + || SEQ_LEQ(l->end,tp->snd_una)) { + tmp=l->next; + l->next = tp->sa_free; + tp->sa_free = l; + if (p==NULL) { + l=tp->sa_head=tmp; + if (l==NULL) + tp->sa_tail=NULL; + } else { + if (l==tp->sa_tail) + tp->sa_tail=p; + p->next=l=tmp; + } + } else { + if (SEQ_LT(l->beg, tp->snd_una)) + /* can it happen ? */ + l->beg = tp->snd_una; + printf(" SACKq tp 0x%08x (+%6d %6d) age %d\n", + tp, l->beg - tp->iss, l->end -l->beg, l->age); + p=l; + l=l->next; + } + } + + /* + * tcp_sa_clean discards any segment in the sack list, by appending + * the sacked blocks to the free list. + */ + void + tcp_sa_clean(tp) + struct tcpcb *tp; + { + if (tp->sa_head==NULL) + return; /* either empty or unsupported sack */ + if (tp->sa_free) /* test is not necessary ! */ + tp->sa_tail->next = tp->sa_free; + tp->sa_free = tp->sa_head; + tp->sa_head = tp->sa_tail = NULL; + } + + /* + * this inserts a record in the sack queue after "aft". If "aft" is null, + * then inserts at the head of the list + */ + void + tcp_sa_add(tp, aft, beg, end) + struct tcpcb *tp; + struct tcp_sacked_el *aft; + tcp_seq beg, end; + { + struct tcp_sacked_el *p=tp->sa_free; + if (!p) + return; + tp->sa_free=p->next; + p->beg=beg; + p->end=end; + p->age=0; + if (aft == NULL) { /* head insertion */ + p->next = tp->sa_head; + tp->sa_head = p; + } else { /* regular insertion */ + p->next = aft->next; + aft->next = p; + } + if (p->next == NULL) + tp->sa_tail=p; + } + + /* + * merges two blocks, p and the next one. 
+ */ + inline + void + tcp_sa_merge(tp, p) + struct tcpcb *tp; + struct tcp_sacked_el *p; + { + struct tcp_sacked_el *tmp=p->next; + p->end=tmp->end; + p->age=0; + p->next=tmp->next; + tmp->next=tp->sa_free; + tp->sa_free=tmp; + } + + inline + void + tcp_sa_trim(tp,una) + struct tcpcb *tp; + tcp_seq una; + { + struct tcp_sacked_el *p, *l; + /* + * remove ACKed blocks + */ + if (!tp->sa_head) + return; + if (SEQ_LEQ(tp->sa_tail->end, una) ) { + /* + * quick path, we can invalidate the whole queue + */ + tp->sa_tail->next=tp->sa_free; + tp->sa_free=tp->sa_head; + tp->sa_head=tp->sa_tail=NULL; + return; + } + /* + * slow path: prune ACKed blocks, recovering storage + * NOTE: we do not need to test l!=NULL, as the previous test + * guarantees us that there is at least one valid block. + */ + for (p=NULL,l=tp->sa_head; l && SEQ_LEQ(l->end, una); p=l, l=l->next) ; + if (p) { /* at least one has gone */ + p->next=tp->sa_free; + tp->sa_free=tp->sa_head; + tp->sa_head=l; + } + if (SEQ_GT(una, l->beg)) { + l->beg=una; + l->age=0; + } + } + + /* + * tcp_addseg adds a record to the list of sacked blocks. + * Assume this is *not* called with SACKs already acknowledged. + */ + void + tcp_addseg(tp, beg, end) + struct tcpcb *tp; + tcp_seq beg, end; + { + struct tcp_sacked_el *p, *l; + tcp_seq una= tp->snd_una; + + if (SEQ_GT(una,beg)) /* trim new block if it contains acked data */ + beg=una; + /* + * now check that TSACK are supported (tp->sa_ptr !=NULL) + * and the rest of the segment is not already acked + */ + if (tp->sa_ptr == NULL || SEQ_GEQ(beg,end)) /* sanity check */ + return; + tcp_sa_trim(tp,una); /* remove acked blocks */ + #if 0 && SACK /* debugging code */ + for (p=tp->sa_head; p; p=p->next) { + printf("++ remote SACK 0x%08x - 0x%08x (%6d)\n", + p->beg, p->end, p->end - p->beg); + } + printf("++ UNA 0x%08x, insert SACK 0x%08x - 0x%08x (%6d)\n", + una, beg, end, end-beg); + #endif + /* + * At this point the queue, if non empty, contains only unacked data. 
+ * Insert the new sacked block in the queue. First check common cases. + */ + if (tp->sa_head == NULL) { + /* First common case: sacked queue is empty */ + tcp_sa_add(tp, NULL, beg, end); + return; + } + p=tp->sa_tail; /* shortcut */ + if (SEQ_GEQ(beg, p->beg)) { + /* second common case: new block is next to the last one */ + if (SEQ_LEQ(beg,p->end)) { + /* start within last one, simply merge */ + if (SEQ_GT(end,p->end)) { + p->end=end; + p->age=0; /* and then return */ + } + } else { + /* start after last one, append after */ + tcp_sa_add(tp, p, beg, end); /* and return */ + } + return; + } + /* + * Here, uncommon cases. + * l is the current block, p is the previous one. + */ + for (p=NULL, l=tp->sa_head; l ; p=l, l=l->next) { + if (SEQ_GT(beg, l->end)) + /* the new segment is completely after l */ + continue; + if (SEQ_LT(beg, l->beg)) { + /* the new segment starts before l (and cannot start + * in p because we scan blocks in order) + */ + if (SEQ_LT(end, l->beg)) { + /* it also ends before l, so add after p */ + tcp_sa_add(tp, p, beg, end); + return; + } else { + /* partially or totally overlapped to l, so... */ + l->beg = beg ; /* adjust left edge of l */ + l->age=0; + } + } + again: + /* + * the left edge of the new segment is within l (possibly + * at the beginning). Now fix overlapping blocks. + */ + if (SEQ_LEQ(end, l->end)) + /* the segment is completely inside l, just return */ + return; + if (l->next==NULL || SEQ_LT(end, l->next->beg)) { + /* + * there is no block after l, or the current segment ends + * before the next one: extend right edge of l and return + */ + l->end = end; + l->age = 0; + return; + } + /* + * If we reach this point, the new segment fills the hole between + * l and the next block. Merge the two blocks and continue + */ + tcp_sa_merge(tp,l); + goto again; + } + /* + * we shouldn't arrive here! 
In any case, if it happens, + * the block is fully after the queue + */ + tcp_sa_add(tp, p, beg, end); + } + + /* + * getrxseg returns the next segment in the reass. queue, and a pointer + * to continue the scan. First time is called with q=tp->seg_next, last time + * returns NULL + */ + static + struct tcpiphdr * + tcp_getrxseg(struct tcpcb *tp, tcp_seq *beg, tcp_seq *end, struct tcpiphdr *q) + { + *beg = *end = q->ti_seq; + for (; ; q = (struct tcpiphdr *)q->ti_next) { + if (q != (struct tcpiphdr *)tp && q->ti_seq == *end) { + *end += q->ti_len; /* enlarge previous block */ + } else { /* end of block */ + if (q == (struct tcpiphdr *)tp) + q=NULL; + return q; + } + } + } + + int + tcp_generate_sacks(struct tcpcb *tp) + { + struct tcpiphdr *q=tp->seg_next; + tcp_seq beg, end; + struct { + tcp_seq beg, end; + } filler[TCP_MAX_SACK]; + u_long bitmap=0; /* TCP_SA_LEN < 32 ... */ + int fillcount=0; + int dst, src; + + if (tp->seg_next == (struct tcpiphdr *)tp) { + /* reassembly queue is empty: just cleanup and return */ + tp->sa_rx_count=0; + return 0; + } + /* make room for the first segment and copy it in place, if valid */ + if (tp->rx_beg != tp->rx_end && SEQ_GT(tp->rx_end, tp->rcv_nxt) ) { + dst=min(TCP_RX_SACK_LEN-1, tp->sa_rx_count); + bcopy(&tp->sa_rx[0], &tp->sa_rx[1], + dst*sizeof(tp->sa_rx[0])); + tp->sa_rx_count = dst+1; + tp->sa_rx[0].beg = tp->rx_beg; + tp->sa_rx[0].end = tp->rx_end; + bitmap=1; /* have 1 new segment */ + tp->rx_end = tp->rx_beg; /* make it invalid */ + } + + /* + * Scan the reassembly queue. If a received segment overlaps + * a block, update boundaries. Duplicates become empty blocks + * (to be removed next). If there is no overlap, store the + * boundaries so that we can use them if there is room. 
+ */
+ do {
+ int taken= 0;
+ q=tcp_getrxseg(tp, &beg, &end, q);
+ for (dst=0; dst< tp->sa_rx_count; dst++) {
+ if ((SEQ_LEQ(beg, tp->sa_rx[dst].end)) &&
+ (SEQ_GEQ(end, tp->sa_rx[dst].beg)) ) {
+ /* have a partial overlap */
+ if (taken) { /* this is a duplicate */
+ /* probably this branch is not needed */
+ tp->sa_rx[dst].end = tp->sa_rx[dst].beg;
+ } else {
+ tp->sa_rx[dst].beg=beg;
+ tp->sa_rx[dst].end=end;
+ taken=1;
+ bitmap |= (1 << dst);
+ }
+ }
+ }
+ if (!taken && fillcount < TCP_MAX_SACK) {
+ /* might be a filler */
+ filler[fillcount].beg=beg;
+ filler[fillcount].end=end;
+ fillcount++;
+ }
+ } while (q);
+ for (src=dst=0; src < tp->sa_rx_count ; src++) {
+ /* clean up the list, discarding empty or acked blocks */
+ if (bitmap & (1<<src) && /* tp->sa_rx[src].beg != tp->sa_rx[src].end && */
+ SEQ_GT(tp->sa_rx[src].end, tp->rcv_nxt) ) {
+ /* a valid block */
+ if (dst != src)
+ tp->sa_rx[dst] = tp->sa_rx[src];
+ dst++;
+ }
+ }
+ for (src=0; dst < TCP_MAX_SACK && src < fillcount; src++, dst++) {
+ tp->sa_rx[dst].beg = filler[src].beg;
+ tp->sa_rx[dst].end = filler[src].end;
+ }
+ tp->sa_rx_count=dst;
+ return 1;
+ }
+
+ #ifdef TSACK /* need also SACK here... */
+ /*
+ * tcp_do_tsack processes a tsack option, adding the entry to
+ * the SACKed queue. When we call this, the ack has been already
+ * processed and snd_una advanced accordingly (unlike sack blocks,
+ * which are processed before.
+ */
+ void
+ tcp_do_tsack(tp)
+ struct tcpcb *tp;
+ {
+ struct tcp_sent_el *p;
+ if (tp->se_ptr == NULL || tp->se_curr >= tp->se_len)
+ return;
+ p= &(tp->se_ptr[tp->se_curr]);
+ if (SEQ_GT(p->end, tp->snd_una)) {
+ tcp_addseg(tp,p->beg, p->end);
+ printf("TSACK tp 0x%08lx una %6u (%6u %6u)\n",
+ tp, tp->snd_una - tp->iss,
+ p->beg - tp->snd_una,
+ p->end - p->beg);
+ }
+ p->beg=p->end=tp->snd_una; /* make the entry free */
+ tp->se_curr = tp->se_len;
+ }
+ #endif /* TSACK */
+ #endif SACK
+
/*
* Drop a TCP connection, reporting
* the specified error. 
If connection is synchronized, *************** *** 381,386 **** --- 827,867 ---- } if (tp->t_template) (void) m_free(dtom(tp->t_template)); + #ifdef SACK + /* free the sent segment list, if any */ + if (tp->sa_ptr) + free(tp->sa_ptr,M_PCB); + #endif SACK + #ifdef TSACK + if (tp->se_ptr) + free(tp->se_ptr,M_PCB); + #endif TSACK + #if SACK + if (tp->tx_dup || tp->rx_dup) { + u_long l_ip, r_ip; + u_short l_p, r_p; + + r_ip = *((u_long *)(&tp->t_inpcb->inp_faddr)); + l_ip = *((u_long *)(&tp->t_inpcb->inp_laddr)); + r_p = (u_long)(tp->t_inpcb->inp_fport); + l_p = (u_long)(tp->t_inpcb->inp_lport); + + NTOHL(r_ip); + NTOHL(l_ip); + printf("== close 0x%08x dup:rx %d:%d dup:tx %d:%d " + "%d.%d.%d.%d:%d %d.%d.%d.%d:%d\n", + tp, + tp->rx_dup, tp->rcv_nxt - tp->irs, + tp->tx_dup, tp->snd_max - tp->iss, + (l_ip >>24) & 0xff, (l_ip >>16) & 0xff, + (l_ip >>8) & 0xff, (l_ip) & 0xff, + htons(l_p), + (r_ip >>24) & 0xff, (r_ip >>16) & 0xff, + (r_ip >>8) & 0xff, (r_ip) & 0xff, + htons(r_p) + ); + } + #endif free(tp, M_PCB); inp->inp_ppcb = 0; soisdisconnected(so); diff -cbwr netinet.orig/tcp_timer.c netinet.new/tcp_timer.c *** netinet.orig/tcp_timer.c Sat Jul 29 10:16:52 1995 --- netinet.new/tcp_timer.c Tue Aug 27 03:44:46 1996 *************** *** 171,176 **** --- 171,189 ---- { register int rexmt; + #ifdef SACK + static char *names[]={ "REXMT", "PERSIST", "KEEP", "2MSL" }; + printf("-- tim.%s.%d (%d ticks) tp %08x pkt %d dup %d una %d nxt +%6d " + "max +%6d wi %6d cw %6d sst %d\n", + names[timer], tp->t_rxtshift, tp->t_rxtcur, + tp, tp->numpkt, tp->t_dupacks, + tp->snd_una - tp->iss, + tp->snd_nxt - tp->snd_una, + tp->snd_max -tp->snd_una, + tp->snd_wnd, tp->snd_cwnd, tp->snd_ssthresh); + tp->numpkt=0; + tcp_sa_timeout(tp); + #endif switch (timer) { /* *************** *** 193,198 **** --- 206,213 ---- * to a longer retransmit interval and retransmit one segment. 
*/ case TCPT_REXMT: + if (tp->t_dupacks < 3) + tcpstat.tcps_rexmt[tp->t_dupacks]++; if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; tcpstat.tcps_timeoutdrop++; *************** *** 227,232 **** --- 242,262 ---- * If timing a segment in this window, stop the timer. */ tp->t_rtt = 0; + #if 0 && TSACK /* XXX remember to enable this */ + if (tp->se_ptr) { + /* + * Make a couple of free entries into the TSACK + * vector, so that some segments can be timed. + */ + struct tcp_sent_el *p; + int i; + + for (i=0; i<2; i++) { + tp->se_ptr[(tp->id+i) % tp->se_len].end= + tp->snd_una; /* making it free! */ + } + } + #endif /* * Close the congestion window down to one segment * (we'll open it by one segment for each ack we get). *************** *** 255,263 **** --- 285,300 ---- u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; if (win < 2) win = 2; + #if 0 && SACK /* XXX remove this code */ + /* this is part of the FACK alg. */ + if (tp->sa_ptr && tp->sa_head) + tp->snd_cwnd= tp->sa_tail->end - tp->snd_una; + else + #endif tp->snd_cwnd = tp->t_maxseg; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_dupacks = 0; + tp->t_flags &= ~TF_FAST_RXMT; } (void) tcp_output(tp); break; diff -cbwr netinet.orig/tcp_usrreq.c netinet.new/tcp_usrreq.c *** netinet.orig/tcp_usrreq.c Thu Nov 2 16:53:59 1995 --- netinet.new/tcp_usrreq.c Mon Aug 19 10:45:07 1996 *************** *** 726,731 **** --- 726,736 ---- case TCPCTL_RECVSPACE: return (sysctl_int(oldp, oldlenp, newp, newlen, (int *)&tcp_recvspace)); /* XXX */ + #ifdef SACK + case TCPCTL_TCPSACK: + return (sysctl_int(oldp, oldlenp, newp, newlen, + &tcp_do_sack)); + #endif default: return (ENOPROTOOPT); } diff -cbwr netinet.orig/tcp_var.h netinet.new/tcp_var.h *** netinet.orig/tcp_var.h Sat Jul 29 10:16:53 1995 --- netinet.new/tcp_var.h Tue Aug 27 03:24:14 1996 *************** *** 40,45 **** --- 40,113 ---- * Kernel variables for tcp. 
*/ + #define TCP_MAX_SACK 3 /* max # of sack blocks */ + #define TCP_RX_SACK_LEN 6 /* list of seg. received out of seq. */ + #define TCP_SACK_MASK 0x0f /* expire of SACK info */ + #define TCP_SACK_ENABLE 0x10 /* enable (T)SACK processing */ + #define TCP_TSACK_ENABLE 0x20 /* enable TSACK generation */ + #define TCP_MOD_FASTRETRANS 0x40 /* use a modified fastretrans */ + #define TCP_NEWRENO 0x80 /* use J.Hoe's fastretrans */ + #define TCP_LOSE 0x100 /* lose segments, debug only */ + #ifdef SACK + /*** + *** code to support SACK and TSACK, sack in RFC1323 timestamps + *** Luigi Rizzo (luigi@iet.unipi.it) + + SACK code is supported by "option SACK" + TSACK code is supported by "option TSACK". It does very little (only + echoes modified timestamps) if SACK is not enabled as well. + + A sysctl variable ("net.inet.tcp.sack" i.e. tcp_do_sack) enables SACK + support, TSACK generation, and related issues (see the defines above). + + When TSACK generation is enabled, tp->se_ptr is allocated. + tp->se_ptr != NULL enables both TSACK generation and echoing modified + timestamps. + + When SACK processing is enabled, tp->sa_ptr is allocated. + tp->sa_ptr != NULL enables SACK/TSACK processing in general. + This is partly for historical reasons (there is only one flag, + TF_SACK_PERMIT) and for practical purposes (what we want to do with + SACK is written in one place only, tp->sa_ptr). 
+ + To sum up: + + tcp_do_sack & TCP_SACK_ENABLE --> allocates tp->sa_ptr + tcp_do_sack & TCP_TSACK_ENABLE --> allocates tp->se_ptr + tcp_do_sack & TCP_SACK_MASK --> lifetime of SACK blocks + tcp_do_sack & TCP_MODIFIED_FASTRETRANS --> use a modified fastretrans + + TSACKs are generated if tp->se_ptr != NULL + SACK_PERMITTED is generated if tp->sa_ptr != NULL + SACKs are generated if TF_SACK_PERMIT + SACKs/TSACKs are processed if tp->sa_ptr != NULL + + *** + ***/ + + /* + * Data structure for the management of SACKs: + * elements of the sacked queue + */ + struct tcp_sacked_el { + struct tcp_sacked_el *next; + tcp_seq beg, end; /* first and last+1 sequence numbers */ + u_short age; + }; + #endif SACK + + #ifdef TSACK + /* + * TSACK, SACK embedded in RFC1323 timestamp! + * Use a table to interpret the values associated with timestamps. + */ + struct tcp_sent_el { + u_long id; + u_long tv; + tcp_seq beg, end; /* first and last+1 sequence numbers */ + }; + #endif + /* * Tcp control block, one per tcp; fields: */ *************** *** 53,60 **** short t_dupacks; /* consecutive dup acks recd */ u_short t_maxseg; /* maximum segment size */ u_short t_maxopd; /* mss plus options */ ! char t_force; /* 1 if forcing out a byte */ ! u_short t_flags; #define TF_ACKNOW 0x0001 /* ack peer immediately */ #define TF_DELACK 0x0002 /* ack, but try to delay it */ #define TF_NODELAY 0x0004 /* don't delay packets to coalesce */ --- 121,128 ---- short t_dupacks; /* consecutive dup acks recd */ u_short t_maxseg; /* maximum segment size */ u_short t_maxopd; /* mss plus options */ ! char t_force; /* 1 if forcing out a byte. Make it a flag ? */ ! u_int t_flags; /* XXX was u_short! 
*/ #define TF_ACKNOW 0x0001 /* ack peer immediately */ #define TF_DELACK 0x0002 /* ack, but try to delay it */ #define TF_NODELAY 0x0004 /* don't delay packets to coalesce */ *************** *** 71,76 **** --- 139,149 ---- #define TF_REQ_CC 0x2000 /* have/will request CC */ #define TF_RCVD_CC 0x4000 /* a CC was received in SYN */ + #define TF_FAST_RXMT 0x10000 /* done a fast retransmit, cwnd has changed */ + #define TF_NEWRENO 0x20000 /* retransmit following J.Hoe's idea */ + #define TF_MOD_RXMT 0x40000 /* modified fast retransmit after 1/2 dups */ + #define TF_FORCE 0x80000 /* 1 if forcing out a byte */ + struct tcpiphdr *t_template; /* skeletal packet for transmit */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ /* *************** *** 141,146 **** --- 214,245 ---- caddr_t t_tuba_pcb; /* next level down pcb for TCP over z */ /* More RTT stuff */ u_long t_rttupdated; /* number of times rtt sampled */ + /* Following var. implements Newreno, following Hoe's idea */ + tcp_seq snd_max_rxmt; /* copy of snd_max at the time of fast retr. */ + #ifdef SACK + tcp_seq snd_rxmt; + struct tcp_sacked_el *sa_ptr, + *sa_free, + *sa_tail, + *sa_head; /* List of SACKed segments */ + struct { /* rx segments, used to build outg. sacks */ + tcp_seq beg, end; + } sa_rx[TCP_RX_SACK_LEN]; + u_short sa_rx_count; + tcp_seq rx_beg, rx_end; /* boundaries of newest rx seg. */ + #endif SACK + #ifdef TSACK /* list of SENT segments (for TSACK) */ + struct tcp_sent_el *se_ptr; + u_long se_len; /* length of sent[] */ + u_long se_curr;/* current segment being TSACKed */ + u_long id; /* unique ids for sent segments */ + #endif /* TSACK */ + + /* some statistics */ + u_long loss; /* XXX testing only - drop some packets */ + u_long numpkt; /* XXX testing only - rxpackets since last timeout */ + u_long tx_dup; /* dup. sent data. Net is snd_max - iss */ + u_long rx_dup; /* dup. rx data. 
Net is rcv_nxt - irs */ }; /* *************** *** 286,291 **** --- 385,397 ---- u_long tcps_usedrtt; /* times RTT initialized from route */ u_long tcps_usedrttvar; /* times RTTVAR initialized from rt */ u_long tcps_usedssthresh; /* times ssthresh initialized from rt*/ + + u_long tcps_rexmt[3]; /* rexmt timeout after n dups */ + u_long tcps_fastretransmit; /* total fastretransmit */ + u_long tcps_fastrexmt[3]; /* fastretrans after n+1 dups */ + u_long tcps_fastrecovery; /* #times in fastrecovery */ + u_long tcps_zerodupw; /* invalid dupack clear on window upd */ + u_long tcps_newreno; /* fast retransmit following J.Hoe */ }; /* *************** *** 300,306 **** #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ ! #define TCPCTL_MAXID 10 #define TCPCTL_NAMES { \ { 0, 0 }, \ --- 406,413 ---- #define TCPCTL_KEEPINTVL 7 /* interval to send keepalives */ #define TCPCTL_SENDSPACE 8 /* send buffer space */ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ ! #define TCPCTL_TCPSACK 10 /* selective acknowledgements */ ! 
#define TCPCTL_MAXID 11 #define TCPCTL_NAMES { \ { 0, 0 }, \ *************** *** 313,318 **** --- 420,426 ---- { "keepintvl", CTLTYPE_INT }, \ { "sendspace", CTLTYPE_INT }, \ { "recvspace", CTLTYPE_INT }, \ + { "sack", CTLTYPE_INT }, \ } #ifdef KERNEL *************** *** 320,325 **** --- 428,434 ---- extern struct inpcbinfo tcbinfo; extern struct tcpstat tcpstat; /* tcp statistics */ extern int tcp_do_rfc1323; /* XXX */ + extern int tcp_do_sack; /* XXX */ extern int tcp_do_rfc1644; /* XXX */ extern int tcp_mssdflt; /* XXX */ extern u_long tcp_now; /* for RFC 1323 timestamps */ *************** *** 372,377 **** --- 481,494 ---- int tcp_usrreq __P((struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *)); void tcp_xmit_timer __P((struct tcpcb *, int)); + #ifdef TSACK + void tcp_do_tsack __P((struct tcpcb *tp)); + #endif + #ifdef SACK + void tcp_addseg __P((struct tcpcb *tp, tcp_seq beg, tcp_seq end)); + void tcp_sa_clean __P((struct tcpcb *tp)); + int tcp_generate_sacks __P((struct tcpcb *tp)); + #endif extern u_long tcp_sendspace; extern u_long tcp_recvspace;