diffs.40S100644 0 0 4717 7115755071 10662 0ustar rootwheelIndex: conf/files =================================================================== RCS file: /home/ncvs/src/sys/conf/files,v retrieving revision 1.340.2.8 diff -u -b -w -r1.340.2.8 files --- conf/files 2000/05/23 03:57:04 1.340.2.8 +++ conf/files 2000/06/01 19:37:27 @@ -744,6 +744,8 @@ netinet/ip_input.c optional inet netinet/ip_mroute.c optional inet netinet/ip_output.c optional inet +netinet/pgm_timer.c optional pgm inet +netinet/pgm_usrreq.c optional pgm inet netinet/raw_ip.c optional inet netinet/tcp_debug.c optional tcpdebug netinet/tcp_input.c optional inet Index: conf/options =================================================================== RCS file: /home/ncvs/src/sys/conf/options,v retrieving revision 1.191.2.3 diff -u -b -w -r1.191.2.3 options --- conf/options 2000/05/30 14:31:10 1.191.2.3 +++ conf/options 2000/06/01 19:37:47 @@ -262,6 +262,7 @@ IPTUNNEL opt_ipx.h NCP opt_ncp.h NETATALK opt_atalk.h +PGM opt_pgm.h PPP_BSDCOMP opt_ppp.h PPP_DEFLATE opt_ppp.h PPP_FILTER opt_ppp.h Index: netinet/in_proto.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/in_proto.c,v retrieving revision 1.53 diff -u -b -w -r1.53 in_proto.c --- netinet/in_proto.c 2000/02/13 03:32:00 1.53 +++ netinet/in_proto.c 2000/06/01 19:40:41 @@ -36,6 +36,7 @@ #include "opt_ipdivert.h" #include "opt_ipx.h" +#include "opt_pgm.h" #include "opt_ipsec.h" #include "opt_inet6.h" @@ -61,6 +62,10 @@ #include #include #include +#ifdef PGM +#include +#include +#endif #include @@ -176,6 +181,14 @@ &rip_usrreqs }, #endif /*NGIF*/ +#ifdef PGM +{ SOCK_SEQPACKET,&inetdomain, IPPROTO_PGM, PR_ATOMIC|PR_CONNREQUIRED|PR_ADDR, + pgm_input, 0, pgm_ctlinput, pgm_ctloutput, + 0, + pgm_init, pgm_fasttimo, pgm_slowtimo, pgm_drain, + &pgm_usrreqs +}, +#endif #ifdef IPDIVERT { SOCK_RAW, &inetdomain, IPPROTO_DIVERT, PR_ATOMIC|PR_ADDR, div_input, 0, 0, ip_ctloutput, @@ -228,6 +241,9 @@ 
SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW, 0, "UDP"); SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW, 0, "TCP"); SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW, 0, "IGMP"); +#ifdef PGM +SYSCTL_NODE(_net_inet, IPPROTO_PGM, pgm, CTLFLAG_RW, 0, "PGM"); +#endif #ifdef IPSEC SYSCTL_NODE(_net_inet, IPPROTO_AH, ipsec, CTLFLAG_RW, 0, "IPSEC"); #endif /* IPSEC */ pgm.h100644 423 0 12731 7041031504 10427 0ustar luigiwheel/* * pgm.h -- include files for PGM * * Copyright (c) 1999 Luigi Rizzo * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #ifndef _NETINET_PGM_H_ #define _NETINET_PGM_H_ typedef u_int32_t pgm_seq ; #define PGM_SEQ_LT(a,b) ((int)((a)-(b)) < 0) #define PGM_SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) #define PGM_SEQ_GT(a,b) ((int)((a)-(b)) > 0) #define PGM_SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) /* * the header of a PGM packet, which is always present. * Basically we consider the header of a data pkt, other pkts * are extensions to it. */ struct pgmhdr { u_int16_t ph_sport ; u_int16_t ph_dport ; u_int8_t type ; #define PGM_SPM_TYPE 0 #define PGM_OD_TYPE 4 #define PGM_RD_TYPE 5 #define PGM_NAK_TYPE 8 #define PGM_NNAK_TYPE 9 #define PGM_NCF_TYPE 10 #define PGM_ACK_TYPE 11 /* for cong.control */ u_int8_t options ; #define PGM_OPT_PRESENT 0x01 /* there are option extentions */ #define PGM_OPT_NE_PRESENT 0x02 /* there are NE-significant opt.ext. */ #define PGM_OPT_PARITY 0x80 /* this is a parity pkt */ /* XXX pisa extensions */ #define PGM_OPT_FIN2 0x20 /* no more data here! */ #define PGM_OPT_JOIN2 0x40 /* can join to trail */ u_int16_t checksum ; u_int32_t gsid_low ; u_int16_t gsid_high ; u_int16_t tsdu_len ; /* * all packets have a couple of sequence numbers here, but * their meaning differs. 
*/ pgm_seq _seq1; #define od_txw_trail _seq1 /* this is in ODATA pkts */ #define spm_seq _seq1 /* this is in SPM pkts */ #define nak_req_seq _seq1 /* this is in NAK pkts */ #define ack_req_seq _seq1 /* this is in ACK pkts */ pgm_seq _seq2; #define od_dp_seq _seq2 /* this is in ODATA pkts */ #define spm_txw_trail _seq2 /* this is in SPM pkts */ #define ack_rxw_lead _seq2 /* this is in ACK pkts */ } ; /* * Source Path Message (SPM) packets */ struct pgm_spm_body { u_int32_t spm_le_seq ; u_int16_t nla_afi ; u_int16_t rsvd ; struct in_addr path_nla ; u_char options[0]; } ; struct pgm_spm { struct pgmhdr pgmhdr ; struct pgm_spm_body body ; } ; /* * (N)ACK packets (from receivers/DLR) */ struct pgm_nack_body { /* u_int32_t req_seq ; u_int16_t nla_afi ; u_int16_t rsvd1 ; */ struct in_addr src_nla ; u_int16_t nla_afi2 ; u_int16_t rsvd2 ; struct in_addr mc_nla ; u_char options[0] ; } ; /* * ACK packets (from elected receivers) */ struct pgm_ack_body { /* u_int32_t req_seq ; u_int32_t rxw_lead ; */ u_int32_t ack_bitmask ; } ; /* * options (similar to IP options) */ #define OPT_HLEN sizeof(struct pgm_option) struct pgm_option { u_int8_t type; u_int8_t len; u_int16_t tot_len; }; struct pgm_opt_join { struct pgm_option hdr ; pgm_seq trail ; } ; struct pgm_opt_loss { struct pgm_option hdr ; u_int32_t rx_loss ; pgm_seq rx_lead ; struct in_addr nacker ; } ; struct pgm_opt_cc { struct pgm_option hdr ; struct in_addr acker ; }; /* * PGM options. The same names, in some cases, are also used * as parameter names in set/getsockopt. 
* Most options are unimplemented as of 991128 */ #define PGM_OPT_LENGTH 0x00 #define PGM_OPT_FRAGMENT 0x01 #define PGM_OPT_JOIN 0x03 #define PGM_OPT_TIME 0x04 #define PGM_OPT_RXQ 0x05 #define PGM_OPT_DROP 0x06 #define PGM_OPT_REDIRECT 0x07 #define PGM_OPT_END 0x80 /* end of options marker */ /* experimental options */ #define PGM_OPT_LOSSRATE 0x10 #define PGM_OPT_SEND_NAK 0x11 /* receivers, please send a NAK */ #define PGM_OPT_ELECT 0x12 /* node X is elected as acker */ #define PGM_OPT_ELECT_ACK 0x13 /* node X has answered */ #define PGM_OPT_DESC 0x20 #define PGM_OPT_SYN 0x21 #define PGM_OPT_FIN 0x22 /* * user set(get)table options (with setsockopt). Use high numbers so * we can use the PGM option IDs as parameters to set/getsockopt. * XXX this is kind of a hack... */ #define PGM_TXW_SIZE 0xC1 /* set window size (todo) */ #define PGM_TXW_MAX_RATE 0xC2 /* set max transmit rate */ #define PGM_TRAIL_ADVANCE 0xC3 #define PGM_ODATA_LIFETIME 0xC4 #define PGM_NAK_MC 0xC5 #define PGM_HOLE_SIZE 0xC6 #define PGM_TSI 0xC7 #define TRAIL_ADVANCE_TIMER 1 #define TRAIL_ADVANCE_DATA 2 #define TRAIL_ADVANCE_USER 3 struct sockaddr_pgm { u_char sin_len ; u_char sin_family ; u_int16_t sin_port ; struct in_addr sin_addr ; u_int32_t gsid_low ; u_int16_t gsid_high ; u_int16_t sport ; } ; #endif _NETINET_PGM_H_ pgm_var.h100644 423 0 31660 7115541271 11312 0ustar luigiwheel/* * pgm_var.h * * Copyright (C) 1999 Luigi Rizzo * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id$ */ #ifndef _NETINET_PGM_VAR_H_ #define _NETINET_PGM_VAR_H_ /* * CONG_CON enables some code to support congestion control. At the * moment this is receiver code to determine and report the loss rate. */ #define CONG_CON 1 /* * PGM kernel structures and variables. */ struct pgmiphdr { struct ipovly pi_i; /* overlaid ip structure */ struct pgmhdr pi_p; /* pgm header */ }; #define pi_next pi_i.ih_next #define pi_prev pi_i.ih_prev #define pi_x1 pi_i.ih_x1 #define pi_pr pi_i.ih_pr #define pi_len pi_i.ih_len #define pi_src pi_i.ih_src #define pi_dst pi_i.ih_dst #define pi_sport pi_p.ph_sport #define pi_dport pi_p.ph_dport #define pi_ulen pi_p.tsdu_len #define pi_sum pi_p.checksum /* * pgm connection state -- host side -- sender */ typedef enum pgm_state { PGM_NEW, PGM_SENDER, PGM_RECEIVER, PGM_RX_CONNECTED, PGM_CLOSED, } pgm_state ; /* * a queue of pgm packets -- used for both RDATA and reassembly queue, * and NAK timing. */ typedef enum reass_queue_type { T_ODATA, T_HOLE, T_NAK } reass_queue_type ; /* * The first element of the queue only contains the seg_next, seg_prev * pointers. If the head points to itself, the queue is empty. 
*/ #define PGM_Q_EMPTY(ptr) \ ( ptr == (struct pgm_pkt_q *)&(ptr) ) #define PGM_Q_NONEMPTY(ptr) \ ( ptr != (struct pgm_pkt_q *)&(ptr) ) #define PGM_Q_HEAD(ptr, queue) \ ( (ptr) == (struct pgm_pkt_q *)&(queue->seg_next) ) /* * reassembly queue. This is a circular queue. The type field describes * the type of entry (T_HOLE, T_NAK, T_ODATA). * T_HOLE means a non-recoverable item. * T_NAK means there is a retransmit state associated. * T_DATA means there is a chain of mbufs */ struct pgm_pkt_q { struct pgm_pkt_q *seg_next, *seg_prev ; struct pgm_pkt_q *nak_next, *nak_prev; reass_queue_type type ; pgm_seq dp_seq ; /* host order */ union { struct { /* descriptor for data */ struct mbuf *m, *m_tail ; pgm_seq tail_seq ; /* host order */ } d ; struct { enum rxmt_timer_type { NAK_TIMEOUT, NCF_TIMEOUT, RDATA_TIMEOUT } ; enum rxmt_timer_type type ; /* timer type */ short timeout ; /* expire when 1->0 */ short ncf_to_ivl ; short rdata_to_ivl ; short ncf_retry ; short rdata_retry ; } t ; } u ; } ; /* * RDATA queue element. (also used for ODATA expire) * The queue is ordered by sequence number. */ struct pgm_data_q { struct pgm_data_q *next; /* next element */ struct mbuf *mb; /* pointer to mbuf to retransmit */ pgm_seq seq; /* seqno of the packet to retransmit */ int ticks; /* timer queue for ODATA/RDATA */ }; /* * PGM control block. Some variables need to be at the top of * the data structure. Apart from those, first we put the vars * which are significant for both sender and receivers, then the * other ones, grouped. */ struct pgmcb { /* Receiver: reassembly and NAK queue (must be first) */ struct pgm_pkt_q *seg_next, *seg_prev, *nak_next, *nak_prev ; /* RX */ /*--- Global variables ---*/ struct inpcb *p_inpcb; /* back pointer to internet pcb */ pgm_state state ; /* * local and remote addresses are held in the inpcb as * inp_{laddr,lport,faddr,fport}. Here we only need the GSI, * plus the sport on the receiver. 
*/ u_int32_t gsid_low ; u_int16_t gsid_high ; u_int16_t sport; struct pgmiphdr *p_template; /* skeletal packet for transmit */ /*--- Sender's variables ---*/ struct pgm_data_q *rdata_head; /* TX */ /* * sender state variables. Max and current rate, trail, lead. */ int txw_max_rte ; /* TX */ int txw_curr_rte ; /* TX, CC */ pgm_seq txw_trail, txw_lead ; /* TX */ /* * next fields are used to implement the expire of odata * in timer-advance mode. */ struct pgm_data_q *odata_trail_head, *odata_trail_tail; /* TX */ int trail_advance_policy ; /* TX */ int odata_lifetime ; /* in ticks */ /* TX */ int odata_ticks_from_last_insert ; /* TX */ pgm_seq spm_sqn ; /* TX */ /* * ambient SPM gou out "at a rate sufficient to maintain * source state. * Heartbeat SPM go out when data is idle at a decaying rate * IHB_TMR from the most recent transmit, from IHB_MIN to IHB_MAX. * Any data tx reinitializes IHB_TMR to min */ int tx_spm_ticks; /* on 1->0 send spm */ /* TX */ /* * transmit buffers are in the socket buffer. odata_curr * points there and is the next odata pkt to send. */ struct mbuf *odata_curr; /* TX */ /* * next two values are scaled by 8*pgm_timer rate. * numbytes is the amount of data i can send next time, * txw_curr_rte is the increment at each tick (in practice * is the rate in bits/s because of the scaling.). * The scheme is similar to the one used in dummynet. * At each tick, provided there is data or numbytes < txw_curr_rte, * we increment numbytes. We transmit if we have sufficient * credit (could be numbytes>=0 or numbytes >= txw_curr_rte, it * only matters at the beginning of a burst). */ int numbytes ; /* TX */ /* * sender_l is current the value of L used by the sender. It starts * at L when the report comes in, then is multiplied by W every * time the lead advances. * XXX sender_rl is the receiver_lead at the acker. 
*/ u_int32_t sender_l ; pgm_seq sender_rl ; /* * acker_loss = (tp->txw_lead - tp->acker_lead)^2 * sender_l * calculated by sender when ACK arrived, used at NAK * reception to elect the new acker. */ u_int64_t acker_loss ; /* acker_mrtt = acker mean rtt */ u_int32_t acker_mrtt ; pgm_seq ignore_cong ; /* ignore cong.events less than this */ int32_t cc_token ; /* TX */ u_int32_t cc_window ; /* TX */ u_int32_t cc_timeout ; /* timeout for no tokens */ /* TX */ int stall_retries ; /* how many retries for current acker after a stall.*/ struct in_addr acker_addr; /* current acker */ /* TX */ /*--- Receiver's variables ---*/ struct in_addr path_nla ; /* RX */ /* * XXX src_nla is used to fill the NAK (but then not much used there) * and to send the ACK direct to the source. Fill it from SPM. */ struct in_addr src_nla ; /* RX */ /* * receiver state variables. * See sec.3.4 for window definitions. */ pgm_seq rxw_size; /* XXX in bytes!!! */ pgm_seq rxw_irs; /* initial receive sequence number */ pgm_seq rxw_trail; /* oldest recoverable segment. Will never * be rxw_trail PGM_SEQ_LT rxw_next */ pgm_seq rxw_lead ; /* highest seqno so far */ #define PGM_DEFAULT_LOOKAHEAD 10 pgm_seq rxw_lookahead ; /* offset beyond rxw_lead for * acceptable segments */ pgm_seq rxw_next ; /* next pkt to read */ pgm_seq rxw_hole_start; /* first segment in hole */ u_int32_t rx_loss; /* loss rate perceived by a receiver */ /*--- Miscellaneous things, still to sort out ---*/ /* Active options */ u_int32_t rx_options; /* options I have received */ /* * Option handling: the descriptor has a pointer to a memory * area containing all outgoing options, plus a bitmap * indicating all options used, and variables describing * them if necessary. The block and the bitmap are * updated every time some option is modified by user calls. 
*/ char *opt_ptr; /* pointer to mem with options to send */ u_int32_t xmit_options; /* options I want to send */ int xmit_optlen; /* options length sent */ /* * NOTA BENE: order is important ! * All network-significant options must appear first. */ /* Network-significant options */ #define OPT_TIME 0x00000001 /* unimplemented */ #define OPT_RXQ 0x00000002 /* unimplemented */ #define OPT_DROP 0x00000004 /* unimplemented */ #define OPT_REDIRECT 0x00000008 /* unimplemented */ /* Receiver-significant options */ #define OPT_FRAGMENT 0x00000010 /* unimplemented */ /* * OPT_JOIN is set on the sender with a setsockopt; on receipt * it is immediately processed and changes the TRAIL if necessary. */ #define OPT_JOIN 0x00000020 /* fully implemented. */ /* Source-significant options */ /* * OPT_LOSSRATE is generated by the receiver/ne, processed by * the sender, to support congestion control. * The option value is from lossrate */ #define OPT_LOSSRATE 0x00000040 /* XXX next fields used for congestion control */ #define PGMCC_SCALE 8 #define PGMCC_C(x) ( (x) << PGMCC_SCALE ) #if 0 /* BC 2000124 */ #define PGMCC_VAL(x) ( (x) >> PGMCC_SCALE ) #endif #define PGMCC_VAL(x) ( (x + (1 <<(PGMCC_SCALE-1)) ) >> PGMCC_SCALE ) #define PGMCC_ADD(x, y) ( (x) + (y) ) #define PGMCC_MUL(x, y) ( ((x) * (y)) >> PGMCC_SCALE ) #define PGMCC_DIV(x, y) ( ((x) / ((y) >> PGMCC_SCALE)) ) pgm_seq ack_lead ; /* lead of ack_bitmask */ /* TX */ u_int32_t ack_bitmask ; /* S/R most recent ACKs for ODATA */ u_int32_t dupacks ; /* how many "dupacks" */ /* TX */ #define OPT_SENDER_CC 0x00000080 /* have sender-cc option */ /* * XXX sender_cc_opt is a pointer within the options block * for the sender-support options for congestion control */ struct pgm_opt_cc *sender_cc_opt ; /* TX */ /* * Various flags */ char enable_cc ; /* enable cong. control (rx/tx) */ char have_gsi ; /* have gsi */ char rx_in_hole ; /* last seg. 
appended was a hole */ char rx_need_reass; /* last sbappendaddr failed */ char rx_nak_mc; /* send optional mc nak */ char rx_do_ack ; /* send ack to data */ char rx_do_loss; /* enable receiver loss report */ char rx_do_nack ; /* send an initial nack to elect acker */ char tx_do_fin ; /* add a FIN when done with data */ } ; #define intopgmcb(ip) ((struct pgmcb *)(ip)->inp_ppcb) #define sotopgmcb(so) (intopgmcb(sotoinpcb(so))) #define pgmcbtoso(tp) (tp->p_inpcb->inp_socket) /* * PGM statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct pgmstat { u_long pgms_badsum ; u_long pgms_fullsock ; u_long pgms_hdrops ; /* dropped by header */ u_long pgms_ipackets ; /* input packets */ u_long pgms_opackets ; /* output packets */ u_long pgms_rcvduppack ; u_long pgms_rcvpack ; u_long pgms_rcvoopack ; }; /* * Names for PGM sysctl objects */ #define PGMCTL_STATS 1 /* statistics (read-only) */ #define PGMCTL_SENDSPACE 2 /* send buffer space */ #define PGMCTL_RECVSPACE 3 /* receive buffer space */ #define PGMCTL_MAXID 4 #define PGMCTL_NAMES { \ { 0, 0 }, \ { "stats", CTLTYPE_STRUCT }, \ { "sendspace", CTLTYPE_INT }, \ { "recvspace", CTLTYPE_INT }, \ } #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_pgm); #endif extern struct inpcbhead pgmcb; /* head of queue of active tcpcb's */ extern struct inpcbinfo pcbinfo; extern struct pgmstat pgmstat; /* pgm statistics */ struct pgmcb *pgm_close (struct pgmcb *); void pgm_ctlinput (int, struct sockaddr *, void *); int pgm_ctloutput (struct socket *so, struct sockopt *sopt); int pgm_make_options(struct pgmcb *tp); void pgm_drain __P((void)); void pgm_fasttimo (void); void pgm_init (void); void pgm_input (struct mbuf *, int, int); int pgm_output (struct pgmcb *, int, pgm_seq); void pgm_slowtimo (void); void pgm_clean_reass(struct pgmcb *tp); void pgm_timer(void *); int pgm_usrreq (struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *); void pgm_data_move(struct pgmcb *); 
extern struct pr_usrreqs pgm_usrreqs; extern struct inpcbinfo pgmcbinfo ; extern u_long pgm_sendspace, pgm_recvspace ; extern int pgm_do_ack; #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ pgm_timer.c100644 423 0 126167 7115541460 11664 0ustar luigiwheel/* * pgm_timer.c - 000110 * Copyright (c) 1999-2000 Luigi Rizzo * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR `AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEB(x) #define DDB(x) x #define SET_BACKOFF (1) + (random() % nak_bo_ivl) #if CONG_CON /* * the feedback value is the loss rate l, 0 <=l < 1, scaled by 2^16. 
* We use exponential smoothing by W (to be defined), so here we have * some useful constants */ #define ONE_S16 65536 /* 1 << 16 */ #define C_S16 (ONE_S16 - pgm_w_s16) /* (1-W)*(2^16) */ static u_int32_t exp(u_int32_t , u_int32_t); /* * Compute w ^ e */ static u_int32_t exp(u_int32_t w, u_int32_t e) { u_int32_t l, i; if (e == 0) return ONE_S16; for ( i = w, l = ONE_S16 ; e ; e >>= 1 ) { if (e & 1) l = (l * i) >> 16 ; i = (i * i) >> 16 ; } return l; } #endif static void pgm_rx_in(struct pgmcb *tp, struct pgmhdr *ph, struct mbuf *m); static void pgm_handle_naks(struct pgmcb *tp) ; static void pgm_dispatch(struct inpcb *last, struct mbuf *m, struct pgmhdr *ph); static void insert_rdata_q(struct pgmcb *tp, pgm_seq seq); static int option_handler(struct pgmcb *, struct pgmhdr *, struct mbuf *); int pgm_now; struct pgmstat pgmstat ; static struct sockaddr_pgm pgm_in = { sizeof(pgm_in), AF_INET } ; static int pgm_ticks = 0 ; SYSCTL_INT(_net_inet_pgm, OID_AUTO, ticks, CTLFLAG_RW, &pgm_ticks , 0, "Pgm shaper tick counter"); static int cc_timeout = 20; SYSCTL_INT(_net_inet_pgm, OID_AUTO, cc_timeout, CTLFLAG_RW, &cc_timeout , 0, "Base for timeout for lack of tokens"); /* counted from loss detection */ static int nak_bo_ivl = 3; SYSCTL_INT(_net_inet_pgm, OID_AUTO, nak_bo_ivl, CTLFLAG_RW, &nak_bo_ivl , 0, "Base for random backoff nak timeout"); /* counted from loss detection */ static int ncf_to_ivl = 5 ; /* base value for NCF timeout */ SYSCTL_INT(_net_inet_pgm, OID_AUTO, ncf_to_ivl, CTLFLAG_RW, &ncf_to_ivl , 5, "NCF receive timeout"); static int rdata_to_ivl = 20 ; /* base value for RDATA timeout */ SYSCTL_INT(_net_inet_pgm, OID_AUTO, rdata_to_ivl, CTLFLAG_RW, &rdata_to_ivl , 0, "RDATA receive timeout"); static int ncf_retries = 10 ; /* how many retries to get NCF */ SYSCTL_INT(_net_inet_pgm, OID_AUTO, ncf_retries, CTLFLAG_RW, &ncf_retries , 0, "NCF retries"); static int rdata_retries = 10 ; /* how many retries to get RDATA */ SYSCTL_INT(_net_inet_pgm, OID_AUTO, 
rdata_retries, CTLFLAG_RW, &rdata_retries , 0, "RDATA retries"); static int spm_ivl = 15; /* measured in ticks */ SYSCTL_INT(_net_inet_pgm, OID_AUTO, spm_ivl, CTLFLAG_RW, &spm_ivl , 0, "Interval between ambient SPM"); #ifdef CONG_CON int pgm_w_s16 = 65000; /* 0.992 << 16 */ SYSCTL_INT(_net_inet_pgm, OID_AUTO, w_s16, CTLFLAG_RW, &pgm_w_s16 , 0, "Weigth for lossrate computation"); int pgm_do_ack = 1; SYSCTL_INT(_net_inet_pgm, OID_AUTO, pgm_do_ack, CTLFLAG_RW, &pgm_do_ack , 0, "Enable ACK generation"); int rtt_cong_filter = 1; SYSCTL_INT(_net_inet_pgm, OID_AUTO, rtt_cong_filter, CTLFLAG_RW, &rtt_cong_filter, 0, "Ignore multiple cong.event per rtt"); static int dup_threshold = 3; SYSCTL_INT(_net_inet_pgm, OID_AUTO, dupack, CTLFLAG_RW, &dup_threshold, 0, "Threshold for dup acks"); static int ss_threshold = 6; SYSCTL_INT(_net_inet_pgm, OID_AUTO, ss_thresh, CTLFLAG_RW, &ss_threshold, 0, "Threshold for exponential window opening"); #endif void pgm_slowtimo() /* currently unused... */ { } /* * "Fast" protocol timeout routine called every 200 ms. * Updates the timers in all active pcb's and * causes finite state machine actions if timers expire. */ void pgm_fasttimo() { struct inpcb *inp, *ipnxt; struct pgmcb *tp; int s ; DEB(printf("PGM: pgm_fasttimo %d\n", pgm_now);) /* * Search through pcb's and update active timers. */ for ( inp = pgmcb.lh_first; inp != NULL; inp = ipnxt) { ipnxt = inp->inp_list.le_next; tp = intopgmcb(inp) ; s = splnet(); if (tp->state == PGM_SENDER && tp->have_gsi == 1) { /* ambient spm. 
*/ if (tp->tx_spm_ticks-- <= 0) { tp->tx_spm_ticks = spm_ivl ; pgm_output(tp, PGM_SPM_TYPE, 0 /* unused here */); } /* * have pkts to send, and shaper would let me send, but * i am being limited by absence of tokens */ if (tp->odata_curr && tp->numbytes >= 0 && (tp->enable_cc && tp->cc_token < PGMCC_C(1) ) ) { if (tp->cc_timeout++ > cc_timeout) { DEB(microtime(&tp->tv); printf("== %lu.%06lu TL %u W %d STALL\n", tp->tv.tv_sec, tp->tv.tv_usec, tp->txw_lead, tp->cc_window) ; ) tp->cc_timeout = 0 ; tp->ignore_cong = tp->txw_lead ; tp->cc_token = PGMCC_C(1); tp->cc_window = PGMCC_C(1); tp->dupacks = 0 ; tp->ack_bitmask = ~0 ; /* i give one more try to the current one */ if (tp->acker_addr.s_addr == INADDR_ANY) { int end = tp->sender_cc_opt->hdr.type & PGM_OPT_END ; tp->sender_cc_opt->hdr.type = PGM_OPT_SEND_NAK | end ; } tp->acker_addr.s_addr = INADDR_ANY ; /* * try find a new acker in case the old one is dead. */ pgm_data_move(tp); } } } else if (tp->state == PGM_RX_CONNECTED) { if (tp->rx_need_reass) pgm_clean_reass(tp); if ( PGM_Q_NONEMPTY(tp->nak_next) ) pgm_handle_naks(tp); if (tp->rx_do_nack && tp->path_nla.s_addr && tp->enable_cc) { tp->rx_do_nack = 0 ; /* XXX fake a very high sequence number */ pgm_output(tp, PGM_NAK_TYPE, tp->rxw_lead ^ 0x40000000) ; } } splx(s); } pgm_now++; /* for timestamps */ } /* * High frequency task to move packets out of the traffic shaper. * Calld by the timer once per tick. */ void pgm_timer(void *dummy) { struct inpcb *inp, *ipnxt ; struct pgmcb *tp ; int s ; s = splnet(); pgm_ticks++ ; /* * Search through pcb's for active senders. */ for (inp = pgmcb.lh_first; inp != NULL; inp = ipnxt) { struct socket *so = inp->inp_socket ; ipnxt = inp->inp_list.le_next; tp = intopgmcb(inp) ; if (tp->state != PGM_SENDER || tp->p_template == NULL) continue ; /* * Increment credit if we are below 0. This correctly simulates * the start time of a pkt. To simulate the finish time we would * need something more elaborate accounting for packet length. 
*/ if ( tp->numbytes < 0 ) tp->numbytes += tp->txw_curr_rte; pgm_data_move(tp); /* flush expired packet */ if (tp->odata_trail_head != NULL) { struct pgm_data_q *q = tp->odata_trail_head ; int need_wakeup = 0 ; if (tp->odata_ticks_from_last_insert < tp->odata_lifetime) tp->odata_ticks_from_last_insert++ ; q->ticks-- ; while (q && q->ticks <= 0) { /* expire old packets */ sbdroprecord(&so->so_snd) ; tp->odata_trail_head = q->next ; DEB(printf("pgm_timer: free trail %p\n", q);) free(q, M_PCB); q = tp->odata_trail_head; need_wakeup = 1; tp->txw_trail++ ; /* XXX BC: should disable OPT_JOIN if we past beyond point */ } if (need_wakeup) sowwakeup(so); } } splx(s); timeout(pgm_timer, (caddr_t)NULL, 1); } /* * The following procedure decides whether to send NAKs or otherwise * handle timeouts for missing packets. Called by slowtimo. */ static void pgm_handle_naks(struct pgmcb *tp) { struct pgm_pkt_q *q ; int is_hole = 0; for (q = tp->nak_next ; !PGM_Q_HEAD(q, tp) ; q = q->nak_next ) { if ( --q->u.t.timeout > 0 ) continue; if ( q->u.t.type == NAK_TIMEOUT ) { /* Can send NAK only if have path_nla from an SPM */ if (tp->path_nla.s_addr) { DDB(printf("++ sent NAK %d (retry %d)\n", q->dp_seq, q->u.t.ncf_retry); ) pgm_output(tp, PGM_NAK_TYPE, q->dp_seq); q->u.t.type = NCF_TIMEOUT ; q->u.t.timeout = q->u.t.ncf_to_ivl ; } else { /* retry later... */ q->u.t.timeout = SET_BACKOFF; } } else { /* basically the same thing here... */ int toomuch ; if ( q->u.t.type == NCF_TIMEOUT ) { /* q->u.t.ncf_to_ivl *= 2 ; */ toomuch = ( ++q->u.t.ncf_retry >= ncf_retries ) ; } else { /* RDATA_TIMEOUT */ /* q->u.t.rdata_to_ivl *= 2 ; */ toomuch = ( ++q->u.t.rdata_retry >= rdata_retries ) ; } if ( toomuch ) { q->type = T_HOLE; /* irrecov. loss due to missing NCF */ is_hole = 1; } else { /* schedule nak ... 
*/ q->u.t.type = NAK_TIMEOUT ; q->u.t.timeout = SET_BACKOFF; } } } if (is_hole) pgm_clean_reass(tp); } /* * Pass up packets from the reassembly queue when possible * (in-sequence data or irrecoverable packets "T_HOLE"). */ void pgm_clean_reass(struct pgmcb *tp) { struct pgm_pkt_q *q; struct socket *so = pgmcbtoso(tp); int need_wakeup = 0 ; tp->rx_need_reass = 0; /* * INVARIANT: at each stage, q->dp_seq == tp->rxw_next. If not * there is a bad mistake in the code, probably worth a panic. */ while ( PGM_Q_NONEMPTY(tp->seg_next) ) { q = tp->seg_next ; if (q->dp_seq != tp->rxw_next) { /* check invariant */ printf("--- clean_reass: have %u (rxw_next.%d)\n", q->dp_seq, q->dp_seq - tp->rxw_next); panic("clean_reass: q->dp_seq != tp->rxw_next"); } if ( PGM_SEQ_LT(tp->rxw_next, tp->rxw_trail) && q->type == T_NAK ) q->type = T_HOLE ; /* NAK expired for trail advance */ if ( q->type == T_NAK ) /* recoverable NAK */ break ; else if ( q->type == T_HOLE ) { /* Update nak queue */ q->nak_next->nak_prev = q->nak_prev ; q->nak_prev->nak_next = q->nak_next ; if (tp->rx_in_hole == 0) { /* add a hole entry */ need_wakeup = 1 ; tp->rx_in_hole = 1 ; tp->rxw_hole_start = tp->rxw_next ; sbappendaddr(&so->so_rcv, (struct sockaddr *)&pgm_in, NULL, NULL) ; } } else if (q->type == T_ODATA) { if (tp->rx_in_hole) break; else { if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&pgm_in, q->u.d.m, NULL) == 0) { /* no space in sockbuf or insufficient mbufs */ DEB(printf("--- clean_reass: sbappendaddr failed\n");) tp->rx_need_reass = 1; goto done; } need_wakeup = 1; /* BC:XXX */ } } /* * record gone. advance next,trail, unlink from queue */ tp->rxw_next++; if (PGM_SEQ_LT(tp->rxw_trail, tp->rxw_next ) ) tp->rxw_trail = tp->rxw_next ; q->dp_seq = 0; /* to mark errors */ q->seg_next->seg_prev = q->seg_prev ; q->seg_prev->seg_next = q->seg_next ; free(q, M_PCB); } done: if (need_wakeup) sorwakeup(so); } /* * pgm_rx_in is the main handler for packets in receivers. * We handle DATA, NAK, NCF, SPM. 
All but the last one have the * interesting fields in the pgmhdr, which is always available. * The SPM body is copied so even if lost by option_handler this is * not problematic. * Only called by pgm_dispatch on receive sockets with * an already-assigned TSI. * The reassembly queue is below the socket buffer. */ static void pgm_rx_in(struct pgmcb *tp, struct pgmhdr *ph, struct mbuf *m) { struct pgm_pkt_q *me = NULL, *q, *r ; pgm_seq dp_seq = 0, dp_trail, dp_lead; pgm_seq seq; /* first pkt we can recover */ struct pgm_spm_body spm_body; int pgmhdrlen = sizeof(struct pgmhdr); /* strip off pgm header */ m->m_len -= pgmhdrlen; m->m_pkthdr.len -= pgmhdrlen; m->m_data += pgmhdrlen; /* copy spm_body */ if (ph->type == PGM_SPM_TYPE) spm_body = *mtod(m, struct pgm_spm_body*); tp->rx_options = 0; if (ph->options & PGM_OPT_PRESENT && option_handler(tp, ph, m) != 0 ) goto fail; if (tp->state == PGM_RECEIVER) { /* * First SPM makes receiver socket connected. */ #if 0 /* * do not connect if you dont have an SPM, or NAK will be * delayed too much. */ if (ph->type == PGM_OD_TYPE) dp_seq = ntohl(ph->od_dp_seq); else #endif if (ph->type == PGM_SPM_TYPE) dp_seq = ntohl(spm_body.spm_le_seq) + 1; /* XXX */ else goto fail ; tp->state = PGM_RX_CONNECTED ; tp->rxw_lookahead = PGM_DEFAULT_LOOKAHEAD ; DEB(printf("pgm_rx_in: options 0x%08x\n", tp->rx_options);) if ((tp->rx_options & OPT_JOIN) == 0) { tp->rxw_irs = tp->rxw_trail = tp->rxw_lead = tp->rxw_next = dp_seq; DEB(printf("pgm_rx_in: INIT rxw_trail to %u\n", tp->rxw_trail);) } } /* * now we are in connected state. */ switch (ph->type) { case PGM_NAK_TYPE: case PGM_NCF_TYPE: /* * Handle this for nak suppression * XXX TODO -- i might use the info to detect missing pkts. */ dp_seq = ntohl(ph->nak_req_seq) ; if (PGM_SEQ_GEQ(dp_seq, tp->rxw_trail) && PGM_SEQ_LEQ(dp_seq, tp->rxw_lead)) { DEB(printf("pgm_rx_in: %s received for seq %lu\n", ph->type == PGM_NAK_TYPE ? 
"NAK": "NCF", ntohl(ph->nak_req_seq));) for (q = tp->nak_next ; !PGM_Q_HEAD(q,tp) ; q = q->nak_next) if ( q->dp_seq == dp_seq) { if (ph->type == PGM_NCF_TYPE) { q->u.t.type = RDATA_TIMEOUT ; q->u.t.timeout = q->u.t.rdata_to_ivl ; q->u.t.ncf_retry = 0 ; } else { /* must be a NAK */ if (q->u.t.type == NAK_TIMEOUT) { /* act as if the NAK had been sent */ q->u.t.type = NCF_TIMEOUT ; q->u.t.timeout = q->u.t.ncf_to_ivl ; } /* otherwise ignore it */ } break; } } goto done; case PGM_OD_TYPE: case PGM_RD_TYPE: dp_seq = dp_lead = ntohl(ph->od_dp_seq); dp_trail = ntohl(ph->od_txw_trail); /* * XXX TODO enforce receive window size limitations, dropping * the most recent packets. Not trivial, as rxw_size is * measured in bytes, not packets; packets are spread between * the socket buffer and the reassembly queue; and we don't know * how big are the holes. */ /* set lookahead to 1/2 of the current window or min 10 pkts */ if (PGM_SEQ_GT(tp->rxw_lead, tp->rxw_trail+2*PGM_DEFAULT_LOOKAHEAD)) tp->rxw_lookahead = (tp->rxw_lead - tp->rxw_trail) / 2 ; else tp->rxw_lookahead = PGM_DEFAULT_LOOKAHEAD ; /* * check for in-window packet. */ if (PGM_SEQ_LT(dp_seq, tp->rxw_trail) ) goto fail ; /* way too old */ if ( PGM_SEQ_GT(dp_seq, tp->rxw_lead + tp->rxw_lookahead) ) { /* new one... might be rogue, check trail is in-window */ if (PGM_SEQ_LT(dp_trail, tp->rxw_trail) || PGM_SEQ_GT(dp_trail, tp->rxw_lead + tp->rxw_lookahead) ) { DEB(printf("pgm_rx_in: data %u out-of-window (%u,%u + %d), drop\n", dp_seq, tp->rxw_trail, tp->rxw_lead, tp->rxw_lookahead); ); goto fail ; } } if (ph->type == PGM_OD_TYPE && tp->enable_cc) { /* * Update the bitmask of recent packets. We only consider odata. */ int delta = (int)(dp_lead - tp->rxw_lead); if (PGM_SEQ_GT(dp_lead, tp->rxw_lead)) { if (delta > 31) tp->ack_bitmask = 0 ; else tp->ack_bitmask <<= delta ; tp->ack_bitmask |= 1 ; } else { /* old, out of sequence ? 
*/ delta = -delta ; if (delta < 31) tp->ack_bitmask |= (1 << delta) ; } } /* * update rxw_lead and rxw_trail */ if (PGM_SEQ_GT(dp_lead, tp->rxw_lead)) { /* Update lossrate: L = L * W ^ (new_lead - old_lead) */ tp->rx_loss *= exp(pgm_w_s16, dp_lead - tp->rxw_lead ); tp->rx_loss >>= 16; tp->rxw_lead = dp_lead; } if (PGM_SEQ_GT(dp_trail, tp->rxw_trail)) tp->rxw_trail = dp_trail; #if CONG_CON if (pgm_do_ack && tp->rx_do_ack && ph->type == PGM_OD_TYPE && tp->enable_cc) { pgm_output(tp, PGM_ACK_TYPE, dp_seq); DEB( if (tp->ack_bitmask != ~0) { char s[34]; int i ; u_int32_t x = tp->ack_bitmask ; for (i = 0 ; i < 32 ; i++) { s[i] = (x & 1 ) ? '.' : 'X' ; x >>= 1 ; } s[32] = '\0'; printf("++ ACK seq %d lead %d mask %s\n", dp_lead, tp->rxw_lead, s); } ) } #endif break; case PGM_SPM_TYPE: /* * XXX todo: check that the SPM is a recent one */ dp_trail = ntohl(ph->spm_txw_trail); dp_lead = ntohl(spm_body.spm_le_seq); DEB(printf("pgm_rx_in: SPM received from 0x%lx [%u,%u]\n", ntohl(spm_body.path_nla.s_addr), dp_trail, dp_lead);); tp->path_nla = spm_body.path_nla; tp->src_nla.s_addr = pgm_in.sin_addr.s_addr ; m_freem(m); m = NULL ; /* don't need pkt anymore, only lead..trail markers */ if (PGM_SEQ_GT(dp_trail, tp->rxw_trail)) { /* trail advanced, cleanup */ tp->rxw_trail = dp_trail; pgm_clean_reass(tp); } if (ph->options & PGM_OPT_FIN2 && PGM_SEQ_GT(tp->rxw_next, dp_lead)) { /* Got FIN and we can deliver it to application */ struct socket *so = pgmcbtoso(tp); socantrcvmore(so); tp->rxw_hole_start = tp->rxw_next + 1 ; } if (PGM_SEQ_LEQ(dp_lead, tp->rxw_lead)) return ; /* * If I get here: dp_lead > rxw_lead, and must insert entries * for NAK after rxw_lead. XXX check the code! * Also update rx_loss */ tp->rx_loss *= exp(pgm_w_s16, dp_lead - tp->rxw_lead ); tp->rx_loss >>= 16; dp_seq = tp->rxw_lead = dp_lead; break; default: printf("pgm_rx_in: discarding type %d\n", ph->type); goto fail ; } /* * XXX at the moment we only get here with ODATA/RDATA/SPM. 
Should we * decide to use NAK/NCF to detect holes, make sure m is NULL so we * can tell the two cases. * * Locate place to insert. dp_seq is the current packet, q points * initially to the last record. After the scan, seq is the seqno of * the first missing packet, and q points to the record after which * we must insert new entries (which are seq..dp_seq inclusive). */ q = tp->seg_prev; r = tp->nak_prev; if ( PGM_Q_HEAD(q, tp) ) { /* queue empty, first missing pkt is is rxw_trail */ seq = tp->rxw_trail; /* * check the special case of rxw_trail going beyond rxw_next * XXX this can be optimized a lot! */ if (seq > tp->rxw_next) seq = tp->rxw_next ; } else if ( PGM_SEQ_GT(dp_seq, q->dp_seq) ) { /* Pkt newer than last in queue, so start after that one. */ seq = q->dp_seq+1 ; } else if (m == NULL) { /* we are in the middle, but this is not data. */ return ; /*XXX maybe should not happen ? */ } else { /* * We are in the middle, need a full scan. Only follow the * NAK list, so skip over data pkts. */ for (r = tp->nak_next ; !PGM_Q_HEAD(r,tp) ; r = r->nak_next) { /* XXX should exit when PGM_SEQ_GT( r->dp_seq, dp_seq); */ if (r->dp_seq != dp_seq) continue; /* fill the hole */ r->type = T_ODATA; r->u.d.m = m; /* Update nak queue */ r->nak_next->nak_prev = r->nak_prev ; r->nak_prev->nak_next = r->nak_next ; r->nak_next = r->nak_prev = NULL; #if CONG_CON if (ph->type == PGM_OD_TYPE) { u_int32_t ris; /* Update lossrate: L = L - W ^ (lead - i) */ ris = (exp(pgm_w_s16, tp->rxw_lead - r->dp_seq) * C_S16) >> 16; if (ris > tp->rx_loss) { DEB(printf("--- Approximate: %u %u\n", ris, tp->rx_loss);) tp->rx_loss = 0; } else tp->rx_loss -= ris; } #endif goto present; /* XXX only if q == tp->seg_next */ } pgmstat.pgms_rcvduppack++; goto fail ; } /* * insert an entry for each missing packet */ if (ph->type == PGM_RD_TYPE) goto fail; /* should not get here!!! 
*/ for (; PGM_SEQ_LEQ(seq, dp_seq) ; seq++ ) { me = malloc(sizeof(*me), M_PCB, M_NOWAIT); if (me == NULL) goto fail ; bzero(me, sizeof(*me) ); me->dp_seq = seq ; /* index of the missing/new packet */ if ( seq == dp_seq && m != NULL ) { me->type = T_ODATA ; me->u.d.m = m; } else { /* set retransmission state */ #if CONG_CON int ris; /* Update lossrate: L = L + W ^ (lead - i) */ ris = (exp(pgm_w_s16, tp->rxw_lead - me->dp_seq) * C_S16) >> 16; tp->rx_loss += ris; DEB(printf("pgm_rx_in:(hole) rx_loss %u [ %u %u]\n", tp->rx_loss, tp->rxw_lead, me->dp_seq);) #endif me->type = T_NAK ; me->u.t.type = NAK_TIMEOUT ; me->u.t.timeout = SET_BACKOFF; me->u.t.ncf_to_ivl = ncf_to_ivl ; me->u.t.rdata_to_ivl = rdata_to_ivl ; me->u.t.ncf_retry = me->u.t.rdata_retry = 0; /* Insert nak entry */ me->nak_next = r->nak_next; me->nak_prev = r; r->nak_next = me; me->nak_next->nak_prev = me; r = me; } /* * insert into queue */ me->seg_next = q->seg_next ; me->seg_prev = q ; q->seg_next = me ; me->seg_next->seg_prev = me ; q = me ; } present: /* * try to figure out if we can request some retransmission * to fill holes, and check if we can pass one or more packets * up to the socket buffer. */ pgm_clean_reass(tp); return ; done: fail: m_freem(m); return ; } /* * called by ip_input to demux the packet to the appropriate place(s). * Runs at splnet. */ void pgm_input(struct mbuf *m, int iphlen, int proto) { struct ip *ip; struct pgmhdr ph ; struct inpcb *inp, *last ; struct pgmcb *tp; int len; u_int16_t pkt_sport; DEB(printf("PGM: pgm_input\n");) pgmstat.pgms_ipackets++; /* * Strip IP options, if any. We will have router-alert on SPM, * NAK and NCF. */ if (iphlen > sizeof (struct ip)) { /* Strip IP options, if any. */ ip_stripoptions(m, (struct mbuf *)0); iphlen = sizeof(struct ip); } /* * Get IP and PGM header together in first mbuf (can still be * a cluster, so shared in copies). 
*/ ip = mtod(m, struct ip *); if (m->m_len < iphlen + sizeof(struct pgmhdr)) { if ((m = m_pullup(m, iphlen + sizeof(struct pgmhdr))) == 0) { pgmstat.pgms_hdrops++; return; } ip = mtod(m, struct ip *); } ph = *(struct pgmhdr *)((caddr_t)ip + iphlen); /* copy pgm header */ /* * Make mbuf data length reflect PGM length. * If not enough data to reflect PGM length, drop. */ len = ip->ip_len; /* total PGM packet length (needed for checksum) */ /* * Construct sockaddr format source address, to be used in sbappendaddr. */ pgm_in.sin_port = ph.ph_sport ; pgm_in.sin_addr = ip->ip_src ; pgm_in.gsid_low = ph.gsid_low ; pgm_in.gsid_high = ph.gsid_high ; pgm_in.sport = ph.ph_sport ; if (ph.type == PGM_NAK_TYPE || ph.type == PGM_ACK_TYPE) pkt_sport = ph.ph_dport; else pkt_sport = ph.ph_sport; /* strip off IP header, not needed anymore here. */ m->m_len -= iphlen; m->m_pkthdr.len -= iphlen; m->m_data += iphlen; /* * Checksum PGM header and data. Note, the IP header is not included. */ if (ph.checksum) { u_int16_t old_sum = ph.checksum ; ph.checksum = in_cksum(m, len ); if (ph.checksum) { pgmstat.pgms_badsum++; printf("-- pgm_input: cksum failed, type %u, len %d, 0x%x -> 0x%x\n", ph.type, len, old_sum, ph.checksum); m_freem(m); return ; } } /* * Pullup header+options (except payload). * Because the tsdu_len is in the common part of the header, * we can compute the difference and know how much to pullup. */ len -= ntohs(ph.tsdu_len) ; /* pgm header length, incl. options */ if (m->m_len < len) { if ((m = m_pullup(m, len)) == 0) { pgmstat.pgms_hdrops++; return; } } /* * Deliver PGM packets to all matching pcbs. Most are multicast, * unicast PGM packets can only be NAK directed to a source. * In principle we could go straight to the (only) pcb, but we cannot * use in_pcblookup_hash() for this as it checks faddr to filter * basing on the source IP, so we scan the whole list ourselves. 
* * To avoid mcopy'ing in case of a single destination, record the * matching position in "last", and handle it only when another match * is found. The final pass is done without copying. */ inp = pgmcb.lh_first ; last = NULL; /* * Now we look for matching inpcbs. * NOTA BENE: if on the same host we have a sender and receivers, * and a unicast NAK arrives, we will (correctly!) find a match only for * sender's inpcb. In fact ip->ip_dst.s_addr for NAK is a unicast address * and inp->inp_laddr.s_addr for receiver is a MC address. */ for (; inp != NULL ; inp = inp->inp_list.le_next) { DEB( printf( "pgm_input: packet SRC 0x%08lx/0x%04x -> DST 0x%08lx/0x%04x type %d\n" " socket FGN 0x%08lx/0x%04x -> LOC 0x%08lx/0x%04x\n", ntohl(ip->ip_src.s_addr), ntohs(ph.ph_sport), ntohl(ip->ip_dst.s_addr), ntohs(ph.ph_dport), ph.type, ntohl(inp->inp_laddr.s_addr), ntohs(inp->inp_lport), ntohl(inp->inp_faddr.s_addr), ntohs(inp->inp_fport) ); ) /* * various checks for a matching socket. We need to match: * + local port (always) * + foreign port (except for raw receiver where it is 0); * + local addr (for receiver it is multicast, for sender * it is the unicast IP of output interface) * + and finally, the full TSI * We _cannot_ match the foreign address, on the receiver because * packets might come from multiple sources, on the sender because * NAKs might come from multiple receivers/NEs. */ if (inp->inp_lport != ph.ph_dport) continue; /* local port not matching */ if (inp->inp_fport != 0 && inp->inp_fport != ph.ph_sport) continue; /* foreign port not matching */ /* * On the receiver, laddr is the MC group address. * On the sender, laddr is the unicast IP of the out interface. * XXX why do we check for INADDR_ANY ??? */ if (inp->inp_laddr.s_addr != INADDR_ANY && inp->inp_laddr.s_addr != ip->ip_dst.s_addr) continue; /* local addr. 
not matching */ tp = intopgmcb(inp); if (tp->state == PGM_NEW || tp->state == PGM_CLOSED) continue; DEB(printf("pgm_input: tp->TSI 0x%08lx.%04x.0x%04x\n", ntohl(tp->gsid_low), ntohs(ph.gsid_high), ntohs(tp->sport) );) /* * check full TSI */ if ( tp->have_gsi && ( tp->gsid_low != ph.gsid_low || tp->gsid_high != ph.gsid_high || tp->sport != pkt_sport ) ) { printf("--- pgm_input: TSI match failed 0x%08lx.%04x.%04x\n", ntohl(ph.gsid_low), ntohs(ph.gsid_high), ntohs(pkt_sport) ); continue ; /* TSI does not match */ } DEB(printf("pgm_input: found descriptor 0x%p state %d for type %d\n", inp, tp->state, ph.type);); if (last != NULL) { struct mbuf *my_m = m_copypacket(m, M_DONTWAIT) ; if (my_m != NULL) pgm_dispatch(last, my_m, &ph); } last = inp; } if (last) pgm_dispatch(last, m, &ph); else { /* No matching pcb found; discard datagram. */ m_freem(m); DEB(printf("--- pgm_input: no matching socket\n");) } } /* * Pass the packet to the pcb, possibly copying if not the last one. * Header+options are in the first mbuf so we dont need pullup etc. */ static void pgm_dispatch(struct inpcb *last, struct mbuf *m, struct pgmhdr *ph) { struct pgmcb *tp = intopgmcb(last); switch (tp->state) { case PGM_RX_CONNECTED: case PGM_RECEIVER: if (tp->have_gsi) { /* run the receiver state machine */ pgm_rx_in(tp, ph, m); sorwakeup(last->inp_socket); } else { /* can only be PGM_RECEIVER, a raw receiver */ if (sbappendaddr(&last->inp_socket->so_rcv, (struct sockaddr *)&pgm_in, m, NULL) == 0) { pgmstat.pgms_fullsock++; m_freem(m); return ; } sorwakeup(last->inp_socket); /* XXX */ } return ; case PGM_SENDER: if (tp->p_template == NULL) /* connect not done yet... 
*/ break ; /* strip off pgm header */ m->m_len -= sizeof(struct pgmhdr); m->m_pkthdr.len -= sizeof(struct pgmhdr); m->m_data += sizeof(struct pgmhdr); switch (ph->type) { case PGM_ACK_TYPE : { int total, delta, missing ; pgm_seq seqno, lead; u_int32_t bitmask, new_acks, l; struct pgm_ack_body *pb ; if (!tp->enable_cc) break; tp->cc_timeout = 0 ; /* any ack resets the cc_timeout */ pb = mtod(m, struct pgm_ack_body *); seqno = ntohl(ph->ack_req_seq); lead = ntohl(ph->ack_rxw_lead); bitmask = ntohl(pb->ack_bitmask); /* Update mrtt (XXX tricks in the scaling...) */ tp->acker_mrtt = (tp->acker_mrtt * (pgm_w_s16 >> 8) + (tp->txw_lead - lead) * C_S16) >> 8; /* * using bitmasks, determine new acks carried by this pkt. * tp->ack_bitmask is the old state, bitmask is the one in the ack. * This section is necessary because we dont have cumulative acks */ delta = (int)(lead - tp->ack_lead) ; if (delta > 32) /* very new, reset old state */ tp->ack_bitmask = 0 ; else if (delta > 0) tp->ack_bitmask <<= delta ; else if (delta < -32) bitmask = 0 ; else bitmask <<= -delta ; /* compute the new acks */ new_acks = bitmask & ~tp->ack_bitmask ; /* and update state */ tp->ack_bitmask |= bitmask ; if (PGM_SEQ_GT(lead, tp->ack_lead) ) tp->ack_lead = lead ; /* compute total acks in this packet. */ for (total = 0, l = new_acks ; l ; l >>= 1 ) if (l & 1) total ++ ; if (delta > 32) printf("-- warning, old ack. should get %d more\n", delta - 32); /* count missing packets */ for (missing = 0 , l = ~tp->ack_bitmask ; l ; l >>= 1) if (l & 1) missing++ ; DEB(if (~bitmask != 0) printf("++ rxACK lead %d txw_lead %d W %d T %d \n", tp->ack_lead, tp->txw_lead, tp->cc_window, tp->cc_token); ) if (total == 0) /* fully duplicate acks */ goto done; /* * Do not count dups for ACKs within an rtt from prev. * congestion. 
*/ if (rtt_cong_filter && PGM_SEQ_LEQ(lead, tp->ignore_cong)) { if (tp->dupacks) { printf("-- recover %d dup, missing %d, mask 0x%08x\n", tp->dupacks, missing, tp->ack_bitmask); total += tp->dupacks ; tp->dupacks = 0 ; /* do not count dup acks */ } missing = 0 ; tp->ack_bitmask = ~0 ; } if ( missing == 0 ) { u_int32_t delta_w ; total += tp->dupacks ; /* recover previous acks */ tp->dupacks = 0 ; if (tp->cc_token < 0) delta_w = 0 ; /* only ignore tokens */ else if (tp->cc_window < PGMCC_C(ss_threshold)) delta_w = PGMCC_C(1) ; /* exp. increase for small windows */ else delta_w = PGMCC_DIV( PGMCC_C(1) , tp->cc_window ); tp->cc_token += total*( PGMCC_C(1) + delta_w) ; tp->cc_window += total* delta_w; } else { tp->dupacks += total ; printf("++ miss %d tot %d dup %d, W %d (real %d) mask 0x%08x\n", missing, total, tp->dupacks, tp->cc_window, (tp->txw_lead - lead) << 8, tp->ack_bitmask); if (tp->dupacks >= dup_threshold) { /* * re-sync window estimate with reality. * Since we know the real window, we also know how * many tokens will arrive (if no loss), and how * many we should ignore. */ tp->cc_window = (tp->txw_lead - lead + 1 ) << 8 ; tp->ignore_cong = tp->txw_lead ; /* pgm_data_move( tp ); */ /* force something out */ if (tp->cc_token < 0) { printf(".. token < 0, don;t reduce\n"); tp->cc_token += PGMCC_C(1) ; } else if (tp->cc_window >= PGMCC_C(4) ) { tp->cc_window /= 2 ; tp->cc_token = -tp->cc_window + PGMCC_C(1); } else { /* small window. Return one token ? */ printf("small window...\n"); tp->cc_token = PGMCC_C(1) ; } tp->ack_bitmask = ~0 ; /* not count anymore these losses */ tp->dupacks = 0 ; } } done: } break ; case PGM_NAK_TYPE: { pgm_seq i = ntohl(ph->nak_req_seq); /* BC XXX: send an NCF only for in-window requests * Multicast an NCF in response to ANY NAK, then schedule RDATA * for in-window requests. 
*/ DEB(printf("pgm_dispatch: NAK received for seq %u\n", i ); ) if (PGM_SEQ_GEQ(i,tp->txw_trail) && PGM_SEQ_LEQ(i,tp->txw_lead) ) { pgm_output(tp, PGM_NCF_TYPE, i); insert_rdata_q( tp, i ); } else { i ^= 0x40000000 ; /* re-check for fake NAK */ if (PGM_SEQ_LT(i,tp->txw_trail)||PGM_SEQ_GT(i,tp->txw_lead) ) { printf("??? NAK outside window %d <= (%d) <= %d\n", tp->txw_trail, i ^0x40000000, tp->txw_lead); } } } break ; default: /* * XXX NNAK handling still missing... */ } if (ph->options & PGM_OPT_PRESENT && option_handler(tp, ph, m) != 0 ) { m_freem(m); return; } break; default: printf("--- pgm_input: state not recognized: should not be here !!!\n"); break; } m_freem(m); return; } /* * Send ODATA/RDATA packets. Called either directly, or by the traffic shaper. * MUST BE CALLED AT splnet() OR ABOVE. * Returns the number of bytes sent ; * This sends as many bytes as available or allowed * by the traffic shaper (the credit, scaled by 8*hz, is tp->numbytes). * The bandwidth budget should include ODATA, RDATA and SPM. * NOTE: pointers are advanced in pgm_output() !!! 
*/ void pgm_data_move(struct pgmcb *tp) { struct pgm_data_q *q; struct socket *so = pgmcbtoso(tp); int sent = 0 ; DEB(printf("PGM: pgm_data_move\n"); ) /* free expired rdata */ while ( (q = tp->rdata_head) != NULL && q->seq < tp->txw_trail ) { tp->rdata_head = q->next; free(q, M_PCB); } /* transmit some rdata */ while ( (q = tp->rdata_head) != NULL && tp->numbytes >= 0 ) { if (tp->txw_max_rte > 0) tp->numbytes -= 8 * hz * q->mb->m_pkthdr.len; pgm_output(tp, PGM_RD_TYPE, 0 /* unused here */); } /* transmit some data */ while ( tp->odata_curr != NULL && tp->numbytes >= 0 && (tp->cc_token >= PGMCC_C(1) || !tp->enable_cc) ) { if (tp->txw_max_rte > 0) tp->numbytes -= 8 * hz * tp->odata_curr->m_pkthdr.len ; sent++ ; pgm_output(tp, PGM_OD_TYPE, 0 /* unused here */); tp->cc_token -= PGMCC_C(1); /* consume one token */ /* * Implement the window advance policy for this pkt (seqno is * txw_lead, we don't have an mbuf pointer for this anymore * as odata_curr has been moved forward by pgm_output(). */ if (tp->trail_advance_policy == TRAIL_ADVANCE_TIMER) { q = malloc( sizeof (*q), M_PCB, M_NOWAIT); if (q == NULL) { printf("--- pgm_data_move: OUCH, cannot" "allocate record to expire ODATA...\n"); } else { q->next = NULL ; if (tp->odata_trail_head == NULL) { tp->odata_trail_head = q ; q->ticks = tp->odata_lifetime ; } else { tp->odata_trail_tail->next = q ; q->ticks = tp->odata_ticks_from_last_insert ; } tp->odata_ticks_from_last_insert = 0 ; tp->odata_trail_tail = q ; } } } /* * if we are in TRAIL_ADVANCE_DATA, cannot send more, and writer * is blocked, free some space (typically enough to move above * low water mark) */ if (tp->trail_advance_policy == TRAIL_ADVANCE_DATA) { int freed = 0 ; if (tp->odata_curr == NULL && sb_notify(&so->so_snd) ) { while ( sbspace(&so->so_snd) < so->so_snd.sb_lowat ) { sbdroprecord(&so->so_snd) ; tp->txw_trail++ ; freed++ ; } sowwakeup(so) ; if (tp->rdata_head != NULL) printf("++ warning, flush with pending rdata\n"); } DEB(printf("data_move, sent 
%d tok %d numb %d flush %d, odata %x blocked %d\n", sent, tp->cc_token, tp->numbytes, freed, tp->odata_curr, sb_notify(&so->so_snd));) } } /* * Insert pgm_data_q element ordered by seqno. Retrieve mbuf ptr * by moving (seq - txw_trail) steps in the mbuf chain. * We check before that the requested segment is in-window. */ static void insert_rdata_q(struct pgmcb *tp, pgm_seq seq) { struct pgm_data_q *p, *q, *r; struct mbuf *m; int diff = seq - tp->txw_trail ; /* how many steps must go in the queue. */ struct socket *so = pgmcbtoso(tp); /* first, locate position in queue after which to insert. */ for (p = NULL, r = tp->rdata_head ; r != NULL ; p = r, r = r->next) if (r->seq == seq) return ; /* nothing to do, entry already existing */ else if (r->seq > seq) break; /* * Allocate a descriptor. If fails, just ignore request (should * record the failure in some statistics). */ q = (struct pgm_data_q *) malloc(sizeof(*q), M_PCB, M_NOWAIT); if (q == NULL) return ; /* locate mbuf pointer */ for (m = so->so_snd.sb_mb; m && diff > 0 ; m = m->m_nextpkt, diff-- ) ; if (m == NULL) { printf("--- insert_rdata_q: want %u trail-lead %u, %u\n", seq, tp->txw_trail, tp->txw_lead); panic("--- insert_rdata_q: mbuf not found\n"); } q->next = r; q->seq = seq; q->mb = m; if (p == NULL) tp->rdata_head = q; else p->next = q; } /* * Process pgm options. Move mbuf pointers past the options, * to the payload. * First option must be PGM_OPT_LENGTH, last PGM_OPT_END. 
*/ static int option_handler(struct pgmcb *tp, struct pgmhdr *ph, struct mbuf *m) { struct pgm_option *opt; caddr_t base, limit ; int len; switch (ph->type) { case PGM_SPM_TYPE: len = sizeof(struct pgm_spm_body); break; case PGM_NCF_TYPE: case PGM_NAK_TYPE: len = sizeof(struct pgm_nack_body); break; case PGM_ACK_TYPE: /* XXX BC */ len = sizeof(struct pgm_ack_body); break; case PGM_OD_TYPE: case PGM_RD_TYPE: default: len = 0; break; } /* strip off rest of the header */ m->m_len -= len; m->m_pkthdr.len -= len; m->m_data += len; opt = mtod(m, struct pgm_option *); if (opt->type != PGM_OPT_LENGTH) { printf("--- option_handler: check PGM_OPT_LENGTH failed type 0x%x\n", opt->type); return -2; } len = ntohs(opt->tot_len); base = m->m_data + OPT_HLEN ; limit = m->m_data + len ; /* strip off options from mbuf, we have the pointers to them. */ m->m_len -= len; m->m_pkthdr.len -= len; m->m_data += len; /* Remember that rx_option is set to 0 in pgm_rx_in function */ while (base < limit) { opt = (struct pgm_option *)base ; switch (opt->type & ~PGM_OPT_END) { case PGM_OPT_JOIN: if (tp->state != PGM_RECEIVER) break; tp->rx_options |= OPT_JOIN; tp->rxw_irs = tp->rxw_trail = tp->rxw_lead = tp->rxw_next = ntohl(((struct pgm_opt_join *)base)->trail) ; DEB(printf("option_handler: JOIN %u\n", tp->rxw_trail);) break; #if CONG_CON case PGM_OPT_ELECT: DEB(printf("option_handler: PGM_OPT_ELECT\n");) if (tp->state != PGM_RX_CONNECTED || !tp->enable_cc) break; tp->rx_options |= OPT_SENDER_CC; /* am I the acker ? */ tp->rx_do_ack = (tp->p_template->pi_src.s_addr == ((struct pgm_opt_cc *)base)->acker.s_addr) ; DEB(printf("!! %s\n", tp->rx_do_ack ? 
"I am the new acker" : "I am not the acker anymore");) break ; case PGM_OPT_SEND_NAK: DEB(printf("option_handler: OPT_SEND_NAK\n");) /* The acker field is not significant for this option */ if (tp->state != PGM_RX_CONNECTED || !tp->enable_cc) break; tp->rx_options |= OPT_SENDER_CC; tp->rx_do_nack = 1 ; break ; case PGM_OPT_LOSSRATE: if (tp->state != PGM_SENDER || !tp->enable_cc) break; DEB(printf("option_handler: PGM_OPT_LOSSRATE\n");) tp->rx_options |= OPT_LOSSRATE; { u_int32_t rx_loss = ntohl(((struct pgm_opt_loss *)base)->rx_loss) ; pgm_seq rx_lead = ntohl(((struct pgm_opt_loss *)base)->rx_lead) ; pgm_seq rttn = (tp->txw_lead - rx_lead ) << 8 ; u_int64_t nacker_loss = ( (rttn * rttn) >> 8) * rx_loss ; if (tp->acker_addr.s_addr != ((struct pgm_opt_loss *)base)->nacker.s_addr) printf("acker l/rtt/AL %u %d (%u) %u NAK %u %d %u\n", tp->sender_l, (tp->txw_lead - tp->sender_rl) << 8, tp->acker_mrtt, (u_int32_t)(tp->acker_loss>>22), rx_loss, rttn, (u_int32_t)(nacker_loss>>22)); /* XXX shift by 22 to ease reading */ if (tp->acker_addr.s_addr == INADDR_ANY) { u_int32_t end = tp->sender_cc_opt->hdr.type & PGM_OPT_END ; tp->sender_cc_opt->hdr.type = PGM_OPT_ELECT | end ; tp->sender_cc_opt->acker = tp->acker_addr = ((struct pgm_opt_loss *)base)->nacker; tp->acker_mrtt = rttn ; tp->cc_token += PGMCC_C(1); /* consider this as an ACK */ tp->ack_lead = tp->txw_lead ; tp->ack_bitmask = ~0 ; /* hopefully a data_move is done at the next tick */ DDB(printf("!!! elected first acker 0x%08lx\n", ntohl(tp->acker_addr.s_addr));) } else if ( nacker_loss > tp->acker_loss && ((struct pgm_opt_loss *)base)->nacker.s_addr != tp->acker_addr.s_addr) { /* * switch to new acker if this one is slower * RTTN^2 * rx_loss > MRTTA^2 * sender_l + TOLERANCE */ tp->sender_cc_opt->acker = tp->acker_addr = ((struct pgm_opt_loss *)base)->nacker ; tp->acker_mrtt = rttn ; DDB(printf("!!! 
elected new acker 0x%08lx\n", ntohl(tp->acker_addr.s_addr));) } /* * If the option is from the (current/new) acker, update * sender_l, sender_rl and acker_loss. acker_mrtt is * from the lowpass filter in case of the current acker. */ if (tp->acker_addr.s_addr == ((struct pgm_opt_loss *)base)->nacker.s_addr) { tp->sender_l = rx_loss ; tp->sender_rl = rx_lead ; /* Now I can update acker_loss. * XXX TODO: Note, there is a +/-1 * error in the estimate of the mrtt. So if we want a bit * of hysteresys in the switch, we should increase * acker_mrtt by 1, and decrease nacker_mrtt by 1 */ tp->acker_loss = ( (tp->acker_mrtt * tp->acker_mrtt) >> 8) * tp->sender_l; } } break; #endif default: printf("--- option_handler: OUCH!: option type not recognized\n"); break; } base += opt->len ; if (opt->type & PGM_OPT_END) break; } /* end of option while */ if (base != limit || (opt->type & PGM_OPT_END) != PGM_OPT_END) { printf("--- option_handler: check PGM_OPT_END failed opt_type 0x%x\n", opt->type); return -2; } return 0; } /*** end of pgm_timer.c ***/ pgm_usrreq.c100644 423 0 100023 7115541615 12046 0ustar luigiwheel/* * pgm_usrreq.c - 000110 * * Copyright (c) 1999-2000 Luigi Rizzo * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEB(x) #define DDB(x) x /* * PGM protocol interface to socket abstraction. */ #ifndef PGMCBHASHSIZE #define PGMCBHASHSIZE 128 #endif struct inpcbhead pgmcb; struct inpcbinfo pgmcbinfo ; static struct pgmcb *pgm_newpgmcb(struct inpcb *inp); static struct pgmiphdr *pgm_template(struct pgmcb *tp, struct sockaddr *nam); /* * pgm_sendspace and pgm_recvspace are the default send and receive window * sizes, respectively. 
*/
/*
 * Lifetime of transmitted ODATA.  Converted to ticks (multiplied by hz)
 * when a control block is created.
 */
static int pgm_odata_lifetime = 20 ; /* measured in seconds */
SYSCTL_INT(_net_inet_pgm, OID_AUTO, odata_lifetime, CTLFLAG_RW, &pgm_odata_lifetime , 0, "PGM ODATA lifetime, seconds");

/*
 * Send rate in bits/s, copied into txw_max_rte/txw_curr_rte of every new
 * control block.
 * bandwidth = 0 means unlimited
 */
static int pgm_bandwidth = 0;
SYSCTL_INT(_net_inet_pgm, OID_AUTO, bandwidth, CTLFLAG_RW, &pgm_bandwidth , 0, "PGM send rate, bits/s");

/*
 * Default sender buffer space.
 * NOTE(review): declared u_long but exported through SYSCTL_INT -- confirm
 * this is intended on platforms where long is wider than int.
 */
u_long pgm_sendspace = 1024*128 ;
SYSCTL_INT(_net_inet_pgm, PGMCTL_SENDSPACE, sendspace, CTLFLAG_RW, &pgm_sendspace , 0, "PGM sender buffer space");

/*
 * Default receive buffer space; becomes rxw_size of every new control block.
 * NOTE(review): same u_long / SYSCTL_INT mismatch as pgm_sendspace.
 */
u_long pgm_recvspace = 1024*128 ;
SYSCTL_INT(_net_inet_pgm, PGMCTL_RECVSPACE, recvspace, CTLFLAG_RW, &pgm_recvspace , 0, "PGM receive buffer space");

/* Checksum switch; presumably consulted on the output path -- TODO confirm. */
static int pgmcksum = 1 ; /* do checksum */
SYSCTL_INT(_net_inet_pgm, OID_AUTO, pgmcksum, CTLFLAG_RW, &pgmcksum , 1, "Enable PGM checksums");

/*
 * Low 32 and high 16 bits of the GSI.  Their consumer is not in this part
 * of the file; presumably used to build the TSI of new senders -- TODO confirm.
 */
static int pgm_gsid_low = 0x12345678 ;
SYSCTL_INT(_net_inet_pgm, OID_AUTO, gsid_low, CTLFLAG_RW, &pgm_gsid_low , 0x12345678, "PGM GSI-low (32 bit)");
static int pgm_gsid_high = 0x9abc ;
SYSCTL_INT(_net_inet_pgm, OID_AUTO, gsid_high, CTLFLAG_RW, &pgm_gsid_high , 0x9abc, "PGM GSI-high (16 bit)");

/*
 * Congestion control and lossrate reporting switches; copied into
 * enable_cc and rx_do_loss of new control blocks when CONG_CON is compiled in.
 */
int pgm_enable_cc = 1;
SYSCTL_INT(_net_inet_pgm, OID_AUTO, pgm_enable_cc, CTLFLAG_RW, &pgm_enable_cc , 0, "Enable congestion control");
int pgm_enable_lossrate = 1;
SYSCTL_INT(_net_inet_pgm, OID_AUTO, pgm_enable_lossrate, CTLFLAG_RW, &pgm_enable_lossrate , 0, "Enable lossrate report");

extern int pgm_w_s16; /* XXX to move somewhere else */

/*
 * There is a router_alert thing in igmp.c which could be reused...
 * We do not, only to avoid removing the static declaration in igmp.c,
 * but this is something that needs to be fixed later. XXX
 * pgm_output() attaches this mbuf as IP options to NCF, SPM and RDATA.
 */
static struct mbuf *pgm_router_alert;

/*
 * Create a new PGM control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.
*/ static struct pgmcb * pgm_newpgmcb(struct inpcb *inp) { struct pgmcb *tp; tp = malloc(sizeof(*tp), M_PCB, M_NOWAIT); if (tp == NULL) return ((struct pgmcb *)0); bzero((char *) tp, sizeof(struct pgmcb)); tp->p_inpcb = inp; /* init reass queue pointers */ tp->seg_next = tp->seg_prev = (struct pgm_pkt_q *)tp; /* init nak queue pointers */ tp->nak_next = tp->nak_prev = (struct pgm_pkt_q *)tp; /* * set fields as required. */ tp->state = PGM_NEW ; tp->txw_max_rte = tp->txw_curr_rte = pgm_bandwidth ; /* bits/s */ /* * note here trail > lead, meaning the window is empty */ tp->spm_sqn = 0 /* random() */ ; tp->txw_lead = 0 /* random() */; /* most recently tx pkt */ tp->txw_trail = tp->txw_lead + 1 ; /* oldest avail. pkt */ tp->rxw_size = pgm_recvspace ; tp->trail_advance_policy = TRAIL_ADVANCE_TIMER ; tp->odata_lifetime = pgm_odata_lifetime * hz ; /* now in ticks */ #if CONG_CON /* I set fields to 0 just to remind them, even if it is the default */ tp->ack_bitmask = ~0 ; tp->ignore_cong = tp->txw_lead ; /* only signific. for sender */ tp->dupacks = 0 ; /* only signific. for sender */ tp->enable_cc = pgm_enable_cc; /* only signific. for sender */ tp->rx_do_ack = 0 ; /* only signific. for receiver */ tp->rx_do_loss = pgm_enable_lossrate; /* only signific. for receiver */ #endif inp->inp_ppcb = (caddr_t)tp; return (tp); } /* * Output the pkt requested in cmd (additional info is in the pgmcb). * Also do any necessary state update (e.g. sequence numbers, * pointers, remove RDATA records...) * Third parameter is a sequence number used for NAKs, ACKs and other * segment types. */ int pgm_output(struct pgmcb *tp, int cmd, pgm_seq seqno) { struct inpcb *inp = tp->p_inpcb; struct mbuf *m, *my_m; struct pgmiphdr *pi; u_int16_t len = 0; int error, num_prep = 0; struct mbuf *opt = NULL; if (tp->p_template == NULL) { printf("--- pgm_output: OUCH ! template not allocated !"); return EINVAL; } /* * First, allocate data (or extended header) portion. 
*/ switch(cmd) { default: printf("--- pgm_output: unsupported type\n"); return EINVAL; case PGM_ACK_TYPE: { struct pgm_ack_body *ack; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return ENOBUFS; /* leave room for link and protocol headers if possible */ if (max_linkhdr + sizeof(struct pgmiphdr) + sizeof(*ack) <= MHLEN) m->m_data += max_linkhdr + sizeof(struct pgmiphdr); m->m_len = m->m_pkthdr.len = sizeof(*ack); ack = mtod(m, struct pgm_ack_body *); ack->ack_bitmask = htonl(tp->ack_bitmask); } break ; case PGM_NAK_TYPE: { struct pgm_nack_body *nak; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return ENOBUFS; /* leave room for link and protocol headers if possible */ if (max_linkhdr + sizeof(struct pgmiphdr) + sizeof(*nak) <= MHLEN) m->m_data += max_linkhdr + sizeof(struct pgmiphdr); m->m_len = m->m_pkthdr.len = sizeof(*nak); nak = mtod(m, struct pgm_nack_body *); nak->src_nla.s_addr = tp->src_nla.s_addr; nak->nla_afi2 = htons( 1 ); /* this is for IPv4 */ nak->rsvd2 = 0 ; nak->mc_nla = inp->inp_laddr; } break ; case PGM_NCF_TYPE: { struct pgm_nack_body *ncf; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return ENOBUFS; /* leave room for link and protocol headers if possible */ if (max_linkhdr + sizeof(struct pgmiphdr) + sizeof(*ncf) <= MHLEN) m->m_data += max_linkhdr + sizeof(struct pgmiphdr) ; m->m_len = m->m_pkthdr.len = sizeof(*ncf); ncf = mtod(m, struct pgm_nack_body *); ncf->src_nla = inp->inp_laddr; ncf->nla_afi2 = htons( 1 ); /* IPv4 AFI */ ncf->rsvd2 = 0; ncf->mc_nla = inp->inp_faddr; } opt = pgm_router_alert ; break; case PGM_SPM_TYPE: { struct pgm_spm_body *spm ; MGETHDR(m, M_DONTWAIT, MT_HEADER); if (m == NULL) return ENOBUFS; /* leave room for link and protocol headers if possible */ /* XXX maybe add room for options as well ? 
*/ if (max_linkhdr + sizeof(struct pgmiphdr) + sizeof(*spm) <= MHLEN) m->m_data += max_linkhdr + sizeof(struct pgmiphdr) ; m->m_len = m->m_pkthdr.len = sizeof(*spm); spm = mtod(m, struct pgm_spm_body *) ; spm->spm_le_seq = htonl( tp->txw_lead ) ; spm->nla_afi = htons( 1 ) ; /* IPv4 AFI */ spm->rsvd = 0 ; spm->path_nla = inp->inp_laddr; DEB( printf("pgm_output: send SPM for nla 0x%lx\n", ntohl(spm->path_nla.s_addr) ); ) tp->spm_sqn++ ; } opt = pgm_router_alert ; break ; case PGM_OD_TYPE: /* copy packet from socket buffer */ /* * XXX TODO in case of timeout, try get the last data * instead of the new one. */ m = m_copypacket( tp->odata_curr, M_DONTWAIT ); if (m == NULL) return ENOBUFS; len = m->m_pkthdr.len; break ; case PGM_RD_TYPE: /* copy packet from socket buffer */ m = m_copypacket( tp->rdata_head->mb, M_DONTWAIT ); if (m == NULL) return ENOBUFS; opt = pgm_router_alert ; len = m->m_pkthdr.len; break; } /* * Fill in mbuf with extended pgm header and stuff... */ if (tp->xmit_options && (cmd == PGM_OD_TYPE || cmd == PGM_RD_TYPE)) num_prep = tp->xmit_optlen; M_PREPEND(m, sizeof(struct pgmiphdr) + num_prep, M_DONTWAIT); if (m == NULL) return ENOBUFS; pi = mtod(m, struct pgmiphdr *); bcopy(tp->p_template, pi, sizeof(*pi) ); if (tp->xmit_options) { /* * copy pgm options at the right place, using the mbuf copy * functions if they don't fit in the first mbuf... */ int offset; if (num_prep) { offset = sizeof(struct pgmiphdr); bcopy(tp->opt_ptr, m->m_data + offset, tp->xmit_optlen); } else { #if CONG_CON /* * For receivers, we assume that PGM_OPT_LOSSRATE is the * only option that they can send. 
Vice versa, we should * look for PGM_OPT_LOSSRATE in tp->opt_ptr */ offset = m->m_len; if (cmd == PGM_NAK_TYPE || cmd == PGM_ACK_TYPE) { u_int32_t num; int off; /* Fill lossrate and receiver-lead fields */ /* off is sizeof(OPT_LENGTH) + sizeof(pgm_option) */ off = OPT_HLEN + OPT_HLEN; num = htonl(tp->rx_loss); bcopy(&num, tp->opt_ptr + off, sizeof(num)); off += sizeof(tp->rx_loss); num = htonl(tp->rxw_lead); bcopy(&num, tp->opt_ptr + off, sizeof(num)); /* src_ip already copied in pgm_make_options */ DEB(printf("pgm_output: loss %u nak_seq %u 0x%lx\n", tp->rx_loss, tp->rxw_lead, ntohl(tp->p_template->pi_src.s_addr));) } m_copyback(m, offset, tp->xmit_optlen, tp->opt_ptr); /* XXX */ #endif } pi->pi_p.options |= PGM_OPT_PRESENT; } /* * len = payload length for ODATA or RDATA, so is 0 for others. */ pi->pi_p.tsdu_len = htons( len ); /* * We change the value of len for checksum computation: * len = PGM pkt length */ len = m->m_pkthdr.len - sizeof(struct ip); /* * Fill up any remaining fields in the header, and update state * in the control block (pointers, sequence numbers, etc.). 
*/ switch(cmd) { case PGM_ACK_TYPE: pi->pi_p.type = PGM_ACK_TYPE; pi->pi_p.ack_req_seq = htonl(seqno); pi->pi_p.ack_rxw_lead = htonl(tp->rxw_lead); pi->pi_dst = tp->src_nla; break ; case PGM_NAK_TYPE: { u_int16_t *p = (u_int16_t *)&(pi->pi_p._seq2); /* the _seq2 field in NAK is 16-bit NLA AFI, 16 bit reserved */ p[0] = htons(1) ; /* NLA AFI for src IP */ p[1] = 0 ; /* reserved for src IP */ pi->pi_p.type = PGM_NAK_TYPE; pi->pi_p.nak_req_seq = htonl(seqno); pi->pi_dst = tp->path_nla; } break ; case PGM_NCF_TYPE: pi->pi_p.type = PGM_NCF_TYPE; pi->pi_p.nak_req_seq = htonl(seqno); DEB(printf("pgm_output: NCF for seq %u\n", seqno);) break; case PGM_SPM_TYPE: pi->pi_p.type = PGM_SPM_TYPE ; pi->pi_p.spm_txw_trail = htonl(tp->txw_trail); pi->pi_p.spm_seq = htonl(tp->spm_sqn); /* add a FIN if the time has come */ if (tp->tx_do_fin && tp->odata_curr == NULL) /* XXX */ pi->pi_p.options |= PGM_OPT_FIN2 ; break ; case PGM_OD_TYPE: /* * XXX todo -- on timeout, if we got the previous pkt, do not * advance the window. */ tp->odata_curr = tp->odata_curr->m_nextpkt ; /* advance ptr */ tp->txw_lead++; pi->pi_p.type = PGM_OD_TYPE; pi->pi_p.od_txw_trail = htonl(tp->txw_trail) ; pi->pi_p.od_dp_seq = htonl(tp->txw_lead); DEB(printf("pgm_output: ODATA packet seq %u\n", tp->txw_lead);) break ; case PGM_RD_TYPE: { struct pgm_data_q *r = tp->rdata_head ; pi->pi_p.type = PGM_RD_TYPE; pi->pi_p.od_txw_trail = htonl(tp->txw_trail); pi->pi_p.od_dp_seq = htonl(r->seq); DEB(printf("pgm_output: RDATA for packet %u\n", r->seq);) tp->rdata_head = r->next; /* advance pointer and free queue */ free(r, M_PCB); } break; } /* * PGM checksum starts from the PGM header. It _does not_ include * the IP header (or the pseudoheader for what matters). */ pi->pi_sum = 0 ; if ( pgmcksum ) { /* skip ip header... 
*/ m->m_len -= sizeof(struct ip) ; m->m_pkthdr.len -= sizeof(struct ip) ; m->m_data += sizeof(struct ip) ; if ((pi->pi_sum = in_cksum(m, len)) == 0) pi->pi_sum = 0xffff ; m->m_len += sizeof(struct ip) ; m->m_pkthdr.len += sizeof(struct ip) ; m->m_data -= sizeof(struct ip) ; } pgmstat.pgms_opackets++; ((struct ip *)pi)->ip_len = len + sizeof (struct ip) ; ((struct ip *)pi)->ip_ttl = inp->inp_ip_ttl; /* XXX */ ((struct ip *)pi)->ip_tos = inp->inp_ip_tos; /* XXX */ my_m = NULL; if (tp->rx_nak_mc && cmd == PGM_NAK_TYPE) { /* * XXX here should make sure that the struct pgmiphdr is in * an mbuf, not a cluster. This is true for the code above. */ my_m = m_copypacket(m, M_DONTWAIT); if (my_m) { pi = mtod(my_m, struct pgmiphdr *); pi->pi_dst = inp->inp_laddr; inp->inp_moptions->imo_multicast_ttl = 1; /* MC NAKs have TTL = 1 */ error = ip_output(my_m, opt, &inp->inp_route, inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST), inp->inp_moptions); } } error = ip_output(m, opt, &inp->inp_route, inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST), inp->inp_moptions); if (error) printf("--- pgm_output: ip_output error\n"); return error ; } static int pgm_abort(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct pgmcb *tp = NULL ; int s ; struct pgm_data_q *q ; DEB(printf("pgm_abort\n");) if (inp == NULL) return EINVAL; s = splnet() ; /* XXX not sure it is really needed... */ tp = intopgmcb(inp); tp->state = PGM_CLOSED ; /* so input will not touch us */ /* free data structures... 
*/ while ( (q = tp->rdata_head) != NULL ) { tp->rdata_head = q->next; free(q, M_PCB); } while ( (q = tp->odata_trail_head) != NULL ) { tp->odata_trail_head = q->next; free(q, M_PCB); } while ( PGM_Q_NONEMPTY(tp->seg_next) ) { struct pgm_pkt_q *q = tp->seg_next ; if (q->type == T_ODATA && q->u.d.m) m_freem(q->u.d.m); tp->seg_next = q->seg_next ; free(q, M_PCB); } if (tp->xmit_options) free(tp->opt_ptr, M_PCB); if (tp->p_template) m_free(dtom(tp->p_template)); free(tp, M_PCB); inp->inp_ppcb = 0; soisdisconnected(so); in_pcbdetach(inp); splx(s); return 0 ; } static int pgm_attach(struct socket *so, int proto, struct proc *p) { struct inpcb *inp = NULL; /* XXX BC was sotoinpcb(so) */ struct pgmcb *tp = NULL; int error, s; DEB(printf("pgm_attach\n");) if (inp != NULL) return EINVAL; s = splnet(); error = in_pcballoc( so, &pgmcbinfo, p ); if (error) goto done; inp = sotoinpcb(so); tp = pgm_newpgmcb(inp); if (tp == 0) { int nofd = so->so_state & SS_NOFDREF; /* XXX */ so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ in_pcbdetach(inp); so->so_state |= nofd; error = ENOBUFS ; goto done; } error = soreserve(so, pgm_sendspace, pgm_recvspace); if (error) goto done; ((struct inpcb *) so->so_pcb)->inp_ip_ttl = ip_defttl; done: splx(s); return error; } /* * API: we can do a bind() on both rx and tx sockets, only allowed * in PGM_NEW state. Bind is optional for a sender, and if done must * preceed the connect() call. * sin_addr, sin_port: anything (both sender and receiver) * For a receiver, sin_addr = multicast address, * sin_port = destination port (local endpoint). * For a sender, sin_addr is usally INADDR_ANY (or an IP for a * local interface), sin_port is either 0 or the chosen port. * gsid == 0, sport == 0: get all matching pkts (raw, receiver only) * gsid == 0, sport == sin_port: commit as sender. * (we still need a connect...) * gsid != 0, sport != 0: commit as receiver * once committed, cannot bind again. 
* */ static int pgm_bind(struct socket *so, struct sockaddr *nam, struct proc *p) { struct inpcb *inp = sotoinpcb(so); struct pgmcb *tp = NULL ; int error =0, s ; DEB(printf("pgm_bind\n");) if (inp == NULL) return EINVAL; tp = intopgmcb(inp); s = splnet(); if (tp->state != PGM_NEW ) { printf("--- pgm_bind: socket already committed\n"); error = EINVAL ; } if (error == 0) error = in_pcbbind( inp, nam, p ); DEB(printf("pgm_bind: in_pcbind error: %d\n",error);) if (error == 0) { struct sockaddr_pgm *sin = (struct sockaddr_pgm *) nam; if (sin->gsid_high == 0 && sin->gsid_low == 0) { if (sin->sport == sin->sin_port) { DEB(printf("pgm_bind: sin_port == sport, sender mode\n");) tp->state = PGM_SENDER; tp->sport = inp->inp_lport; } else if (sin->sport == 0) { /* receiver... just listen for everything */ DEB(printf("soisconnecting for everything\n");) tp->state = PGM_RECEIVER ; soisconnecting(so); } else { printf("--- pgm_bind: invalid sport with gsi == 0\n"); error = EINVAL ; } } else { if (sin->sport == 0) { printf("--- pgm_bind: invalid sport with gsi != 0\n"); error = EINVAL ; } else { tp->have_gsi = 1 ; tp->gsid_low = sin->gsid_low ; tp->gsid_high = sin->gsid_high ; tp->sport = inp->inp_fport = sin->sport ; tp->state = PGM_RECEIVER ; /* pgm_template needs TSI and state initialized in the pgmcb */ tp->p_template = pgm_template(tp, nam); if (tp->p_template == 0) error = ENOBUFS; else { if (tp->rx_do_loss) { /* default: enable lossrate report */ tp->xmit_options |= OPT_LOSSRATE ; error = pgm_make_options(tp); if (error) printf("--- pgm_bind: OUCH pgm_make_option failed"); } soisconnecting(so); } } } } splx(s); return error ; } static int pgm_connect(struct socket *so, struct sockaddr *nam, struct proc *p) { int error = 0, s; struct inpcb *inp = sotoinpcb(so); struct pgmcb *tp = intopgmcb(inp); struct in_addr local_addr ; local_addr.s_addr = INADDR_ANY; /* just to set some value */ DEB(printf("pgm: pgm_connect\n");); if (tp->state != PGM_NEW && tp->state != PGM_SENDER) { 
printf("--- pgm_connect: invalid state %d for connect\n", tp->state); return EINVAL ; } s = splnet(); /* If the socket has not been bound with a local port, * in_pcbbind assigns one automatically. After in_pcbbind * set inp_laddr to INADDR_ANY to let in_pcbconnect to * find the uc IP address of the interface. * XXX save the address so we can use it if pcbconnect returns 0 */ if (inp->inp_lport == 0) { error = in_pcbbind( inp, nam, p ); local_addr = inp->inp_laddr ; printf("connect: local is 0x%x\n", (u_int)ntohl(local_addr.s_addr) ); inp->inp_laddr.s_addr = INADDR_ANY; } if (error == 0) { if (inp->inp_faddr.s_addr != INADDR_ANY) { printf("--- pgm_connect: faddr != INADDR_ANY, 0x%lx\n", ntohl(inp->inp_faddr.s_addr ) ); error = EISCONN; } else { error = in_pcbconnect( inp, nam, p ); if (inp->inp_laddr.s_addr == INADDR_ANY) { printf("warning, in_pcbconnect set laddr INADDR_ANY, try 0x%x\n", (u_int)ntohl(local_addr.s_addr)); inp->inp_laddr = local_addr ; } } } if (error == 0) { tp->state = PGM_SENDER; tp->sport = inp->inp_lport; tp->have_gsi = 1; tp->gsid_low = htonl(pgm_gsid_low); tp->gsid_high = htons(pgm_gsid_high); /* pgm_template needs TSI and state initialized in the pgmcb */ tp->p_template = pgm_template(tp, nam); if (tp->p_template == 0) { in_pcbdisconnect(inp); error = ENOBUFS ; } else { soisconnected(so); } tp->cc_token = PGMCC_C(1); tp->cc_window = PGMCC_C(1); } splx(s); return error ; } static int pgm_detach(struct socket *so) { DEB(printf("pgm_detach\n"); ); return pgm_abort(so); }; static int pgm_disconnect(struct socket *so) { DEB(printf("pgm_disconnect\n"); ); return pgm_abort(so); }; /* * at the moment, can only work in state PGM_SENDER and with * a valid template. 
*/ static int pgm_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct proc *p) { struct inpcb *inp= sotoinpcb(so); struct pgmcb *tp = intopgmcb(inp); int s, error = 0; DEB(printf("pgm_send\n");) if (control) { printf("pgm: PRU_SEND: control_len %d\n", control->m_len); m_freem(control); /* XXX shouldn't caller do this??? */ } if (nam) { printf("--- pgm_send: PRU_SEND: don't want an address!\n"); m_freem(m); return EISCONN ; } if (tp->state != PGM_SENDER || tp->p_template == NULL) { printf("--- pgm_send: socket not ready to send\n"); return EINVAL ; } s = splnet(); sbappendrecord(&so->so_snd, m); if (tp->odata_curr == NULL) /* prepare the xmit queue */ tp->odata_curr = m ; pgm_data_move(tp); /* send through the traffic shaper */ splx(s); return error; /* don't want to free the buffer */ } static int pgm_shutdown(struct socket *so) { struct inpcb *inp = sotoinpcb(so); struct pgmcb *tp = intopgmcb(inp); DDB(printf("pgm_shutdown\n"); ); if (inp == 0) return EINVAL; socantsendmore(so); tp->tx_do_fin = 1 ; if (tp->odata_curr == NULL) pgm_output(tp, PGM_SPM_TYPE, 0 /* unused here */); return 0 ; } struct pr_usrreqs pgm_usrreqs = { pgm_abort, pru_accept_notsupp, pgm_attach, pgm_bind, pgm_connect, pru_connect2_notsupp, in_control, pgm_detach, pgm_disconnect, pru_listen_notsupp, in_setpeeraddr, pru_rcvd_notsupp, pru_rcvoob_notsupp, pgm_send, pru_sense_null, pgm_shutdown, in_setsockaddr, sosend, soreceive, sopoll }; /* * pgm_template creates a template pkt for the sender. * We depend on some fields (gsid, sport, dport) being initialized * earlier. 
*/ static struct pgmiphdr * pgm_template(struct pgmcb *tp, struct sockaddr *nam) { struct inpcb *inp = tp->p_inpcb; struct pgmiphdr *pi; struct sockaddr_in *ifaddr; int error; pi = tp->p_template ; if ( pi == NULL ) { struct mbuf *m = m_get(M_DONTWAIT, MT_HEADER); if (m == NULL) return NULL ; m->m_len = sizeof (struct pgmiphdr); pi = mtod(m, struct pgmiphdr *); } else { printf("--- warning, p_template already set, state %d\n", tp->state); } bzero( pi, sizeof (*pi) ); /* IP header */ pi->pi_pr = IPPROTO_PGM ; pi->pi_len = htons(sizeof (struct pgmhdr) ) ; if (tp->state == PGM_RECEIVER) { inp->inp_laddr.s_addr = INADDR_ANY; if ( (error = in_pcbladdr(inp, nam, &ifaddr)) != 0) return NULL; pi->pi_src = ifaddr->sin_addr; ifaddr = (struct sockaddr_in*)nam; inp->inp_laddr = ifaddr->sin_addr; DEB(printf("pgm_template: src IP addr = 0x%lx, inp_laddr 0x%lx\n", ntohl(pi->pi_src.s_addr), ntohl(inp->inp_laddr.s_addr));) } else { /* must be PGM_SENDER */ pi->pi_src = inp->inp_laddr; if (tp->enable_cc) { tp->xmit_options |= OPT_SENDER_CC ; /* XXX support for cong.contr */ error = pgm_make_options(tp); if (error) printf("--- pgm_template: OUCH pgm_make_option failed"); } } pi->pi_dst = inp->inp_faddr; /* PGM header */ pi->pi_p.ph_sport = inp->inp_lport; pi->pi_p.ph_dport = inp->inp_fport; pi->pi_p.type = PGM_OD_TYPE; pi->pi_p.options = 0; pi->pi_p.checksum = 0; pi->pi_p.gsid_low = tp->gsid_low; pi->pi_p.gsid_high = tp->gsid_high; pi->pi_p.tsdu_len = pi->pi_p._seq1 = pi->pi_p._seq2 = 0; return pi ; } void pgm_drain() { printf("PGM: pgm_drain()\n"); } void pgm_init() { struct ipoption *ra; /* * init hash list for pgm control blocks */ LIST_INIT(&pgmcb) ; pgmcbinfo.listhead = &pgmcb ; pgmcbinfo.hashbase = hashinit(PGMCBHASHSIZE, M_PCB, &pgmcbinfo.hashmask); pgmcbinfo.porthashbase = hashinit(PGMCBHASHSIZE, M_PCB, &pgmcbinfo.porthashmask); pgmcbinfo.ipi_zone = zinit("pgmcb", sizeof(struct inpcb), maxsockets, ZONE_INTERRUPT, 0); /* * update global variables */ if (max_protohdr < 
sizeof(struct pgmhdr)) max_protohdr = sizeof(struct pgmhdr) ; if (max_linkhdr + sizeof(struct pgmhdr) > MHLEN) panic("pgm_init: headers too long"); /* * Construct a Router Alert option to use in outgoing packets */ MGET(pgm_router_alert, M_DONTWAIT, MT_DATA); /* XXX might fail, just hope not! */ ra = mtod(pgm_router_alert, struct ipoption *); ra->ipopt_dst.s_addr = 0; ra->ipopt_list[0] = IPOPT_RA; /* Router Alert Option */ ra->ipopt_list[1] = 0x04; /* 4 bytes long */ ra->ipopt_list[2] = 0x00; ra->ipopt_list[3] = 0x00; pgm_router_alert->m_len = sizeof(ra->ipopt_dst) + ra->ipopt_list[1]; timeout(pgm_timer, NULL, 1); } void pgm_ctlinput(int cmd, struct sockaddr *sa, void *vip) { printf("PGM: pgm_ctlinput\n"); } #define GET_INT_ARG(x, min, max) \ error = sooptcopyin(sopt, &x, sizeof x, sizeof x); \ if (error) break ; \ if ( (x) < (min) || (x) > (max) ) { \ error = EINVAL ; break ; } #define PUT_INT_ARG(x) \ optval = (x) ; error = sooptcopyout(sopt, &optval, sizeof optval); int pgm_ctloutput ( struct socket *so, struct sockopt *sopt) { int error = 0, optval, s; struct inpcb *inp; struct pgmcb *tp; s = splnet(); /* really too coarse locking... */ inp = sotoinpcb(so); if (inp == NULL) { splx(s); return (ECONNRESET); } if (sopt->sopt_level != IPPROTO_PGM) { error = ip_ctloutput(so, sopt); splx(s); return (error); } tp = intopgmcb(inp); if (sopt->sopt_dir == SOPT_SET) { switch (sopt->sopt_name) { case PGM_TXW_MAX_RATE: GET_INT_ARG( optval, 0, 100000000 ); tp->txw_max_rte = tp->txw_curr_rte = optval; break; case PGM_TRAIL_ADVANCE: /* set trail advance method. */ GET_INT_ARG( optval, TRAIL_ADVANCE_TIMER, TRAIL_ADVANCE_USER ); if (tp->trail_advance_policy == optval) break ; /* no change */ switch (tp->trail_advance_policy) { case TRAIL_ADVANCE_TIMER: /* * Deallocate unused structures leaving this method. 
*/ while ( tp->odata_trail_head != NULL ) { struct pgm_data_q *q = tp->odata_trail_head; tp->odata_trail_head = q->next; free(q, M_PCB); } break; case TRAIL_ADVANCE_DATA: break ; case TRAIL_ADVANCE_USER: /* should make socket non-blocking */ break; default: printf("--- pgm_ctloutput: should not get here !\n"); break; } tp->trail_advance_policy = optval; break; case PGM_ODATA_LIFETIME: /* sets odata lifetime */ if (tp->trail_advance_policy != TRAIL_ADVANCE_TIMER) { printf("This option needs TRAIL_ADVANCE_TIMER\n"); error = EINVAL; break; } GET_INT_ARG( optval, 0, 1000 ); tp->odata_lifetime = optval * hz; /* now in ticks */ break; case PGM_NAK_MC: /* send optional mc nak with TTL = 1 */ GET_INT_ARG( optval, 0, 1 ); tp->rx_nak_mc = optval; break; case PGM_OPT_JOIN: #if CONG_CON case PGM_OPT_LOSSRATE: #endif { u_int32_t opt = 0; GET_INT_ARG( optval, 0, 1 ); /* set/reset */ switch (sopt->sopt_name) { case PGM_OPT_JOIN: if (tp->state == PGM_RECEIVER || tp->state == PGM_RX_CONNECTED) { error = EINVAL ; break ; } opt = OPT_JOIN; break; #if CONG_CON case PGM_OPT_LOSSRATE: if (!tp->enable_cc || tp->state == PGM_SENDER) { error = EINVAL ; break; } opt = OPT_LOSSRATE; break; #endif default: error = EINVAL ; break; } if (error) break ; if (optval && (tp->xmit_options & opt)==0) tp->xmit_options |= opt; else if (!(optval || (tp->xmit_options & opt)==0)) tp->xmit_options &= ~opt; else /* no change */ break; if (tp->xmit_options) error = pgm_make_options(tp); else if (tp->xmit_optlen != 0) { free(tp->opt_ptr, M_PCB); tp->opt_ptr = NULL; tp->xmit_optlen = 0; } else { DDB(printf("--- pgm_ctloutput: invalid xmit_optlen. 
should not get here !\n");) } } break; default: error = ENOPROTOOPT; break; } } else if (sopt->sopt_dir == SOPT_GET) { switch (sopt->sopt_name) { case PGM_TXW_MAX_RATE: PUT_INT_ARG( tp->txw_max_rte ) ; break ; case PGM_HOLE_SIZE: PUT_INT_ARG( tp->rxw_next - tp->rxw_hole_start ); tp->rx_in_hole = 0; pgm_clean_reass(tp); break; case PGM_TSI: { struct sockaddr_pgm tsi; if (tp->state != PGM_SENDER) { error = EINVAL; break; } bzero((char *) &tsi, sizeof(struct sockaddr_pgm)); tsi.gsid_low = pgm_gsid_low; tsi.gsid_high = pgm_gsid_high; tsi.sport = tp->sport; /* XXX */ error = sooptcopyout(sopt, &tsi, sizeof(struct sockaddr_pgm)); } break; default: error = ENOPROTOOPT; break; } } splx(s); return (error); } /* * Create the option part of the pgm packet. * We assume that all options plus the header fit in one mbuf. */ int pgm_make_options(struct pgmcb *tp) { struct pgm_option *opt; int offset = 0, l = 0; /* l is the length of last option */ /* We assume that PGM options always fit in a single mbuf. 
* XXX: We should check if pgmtotlen fit in an MBUF */ if (tp->opt_ptr == NULL) { tp->opt_ptr = malloc(MHLEN, M_PCB, M_NOWAIT); /* XXX */ if (tp->opt_ptr == NULL) return ENOBUFS; } bzero(tp->opt_ptr, MHLEN); opt = (struct pgm_option *)tp->opt_ptr ; opt->type = PGM_OPT_LENGTH; opt->len = l = OPT_HLEN; opt->tot_len = 0 ; offset += l ; if (tp->xmit_options & OPT_JOIN) { struct pgm_opt_join *o = (struct pgm_opt_join *)(tp->opt_ptr + offset); o->hdr.type = PGM_OPT_JOIN; o->hdr.len = l = sizeof(*o); o->hdr.tot_len = 0 ; /* not significant here */ o->trail = htonl(tp->txw_trail) ; offset += l ; } #if CONG_CON if (tp->xmit_options & OPT_LOSSRATE) { struct pgm_opt_loss *o = (struct pgm_opt_loss *)(tp->opt_ptr + offset); o->hdr.type = PGM_OPT_LOSSRATE; o->hdr.len = l = sizeof(*o); o->hdr.tot_len = 0 ; /* not significant here */ /* We fill lossrate and receiver-lead fields in pgm_output */ o->nacker = tp->p_template->pi_src ; offset += l ; } if (tp->xmit_options & OPT_SENDER_CC) { struct pgm_opt_cc *o = (struct pgm_opt_cc *)(tp->opt_ptr + offset); tp->sender_cc_opt = o ; /* for xmitter to fill up */ o->hdr.type = PGM_OPT_SEND_NAK; o->hdr.len = l = sizeof(*o); o->hdr.tot_len = 0 ; /* not significant here */ o->acker.s_addr = INADDR_ANY ; offset += l ; } #endif opt = (struct pgm_option *)(tp->opt_ptr + offset - l ); opt->type |= PGM_OPT_END; tp->xmit_optlen = offset ; opt = (struct pgm_option *)(tp->opt_ptr); opt->tot_len = htons(tp->xmit_optlen) ; return 0; }