ipfw-user/000755 000423 000000 00000000000 12012143730 013226 5ustar00luigiwheel000000 000000 ipfw-user/sys/000755 000423 000000 00000000000 12007434775 014064 5ustar00luigiwheel000000 000000 ipfw-user/extra/000755 000423 000000 00000000000 12007737733 014371 5ustar00luigiwheel000000 000000 ipfw-user/ipfw/000755 000423 000000 00000000000 12012143730 014173 5ustar00luigiwheel000000 000000 ipfw-user/Makefile.inc000644 000423 000000 00000001120 12011154541 015431 0ustar00luigiwheel000000 000000 # # this is a gnu makefile BSD_HEAD ?= /home/luigi/FreeBSD/head NETMAP_INC ?= ../netmap-release/sys OBJDIR=objs OSARCH := $(shell uname) OSARCH := $(findstring $(OSARCH),FreeBSD Linux Darwin) ifeq ($(OSARCH),) OSARCH := Windows endif ifeq ($V,) # no echo MSG=@echo HIDE=@ else MSG=@\# HIDE= endif # ipfw and kipfw are built in subdirs so the paths for # headers refer to one directory up INCDIRS += -I ../$(OBJDIR)/include_e -DEMULATE_SYSCTL INCDIRS += -I ../sys -I ../extra/sys -I ../extra/sys/contrib/pf .c.o: $(MSG) " CC $<" $(HIDE) $(CC) $(CFLAGS) -c $< -o $@ ipfw-user/Makefile000644 000423 000000 00000001221 12007741251 014670 0ustar00luigiwheel000000 000000 # # this is a gnu makefile SUBDIRS= ipfw dummynet .PHONY: ipfw kipfw include Makefile.inc all: ipfw kipfw ipfw: $(OBJDIR) $(MSG) Building userspace ... @(cd ipfw && $(MAKE) $(MAKECMDGOALS) ) $(OBJDIR): -@mkdir $(OBJDIR) kipfw: $(OBJDIR) $(MSG) Building datapath ... @(cd $(OBJDIR) && $(MAKE) -f ../Makefile.kipfw && cp kipfw ..) 
clean: -@rm -rf $(OBJDIR) kipfw @(cd ipfw && $(MAKE) clean ) tgz: @$(MAKE) clean (cd ..; tar cvzf /tmp/ipfw-user.tgz --exclude .svn ipfw-user) # compute diffs wrt FreeBSD head tree in BSD_HEAD diffs: -@diff -urp --exclude Makefile $(BSD_HEAD)/sbin/ipfw ipfw -@diff -urp --exclude Makefile $(BSD_HEAD)/sys sys ipfw-user/README000644 000423 000000 00000004054 12012143725 014115 0ustar00luigiwheel000000 000000 # # $Id: README 11706 2012-08-13 09:45:44Z luigi $ # This directory contains a version of ipfw and dummynet that can run in userland, using NETMAP as the backend for packet I/O. This permits a throughput about 10 times higher than the corresponding in-kernel version. I have measured about 6.5 Mpps for plain filtering, and 2.2 Mpps going through a pipe. The base version comes from FreeBSD-HEAD -r '{2012-08-03}' with small modifications listed below netinet/ipfw ip_dn_io.c support for on-stack mbufs ip_fw2.c some conditional compilation for functions not available in userspace ip_fw_log.c revise snprintf, SNPARGS (MAC) sbin/ipfw and the kernel counterpart communicate throuugh a TCP socket (localhost:5555) carrying the raw data that would normally be carried on seg/getsockopt. For testing purposes, opening a telnet session to port 5556 and typing some bytes will start a fake 'infinite source' so you can check how fast your ruleset works. 
gmake dummynet/ipfw & # preferably in another window telnet localhost 5556 # type some bytes to start 'traffic' sh -c "while true; do ipfw/ipfw show; ipfw/ipfw zero; sleep 1; done" (on an i7-3400 I get about 15 Mpps) Real packet I/O is possible using netmap info.iet.unipi.it/~luigi/netmap/ You can use a couple of VALE switches (part of netmap) to connect a source and sink to the userspace firewall, as follows s f f d [pkt-gen]-->--[valeA]-->--[kipfw]-->--[valeB]-->--[pkt-gen] The commands to run (in separate windows) are # preliminarly, load the netmap module sudo kldload netmap.ko # connect the firewall to two vale switches ./kipfw valeA:f valeB:f & # configure ipfw/dummynet ipfw/ipfw show # or other # start the sink pkt-gen -i valeB:d -f rx # start an infinite source pkt-gen -i valeA:s -f tx # plain again with the firewall and enjoy ipfw/ipfw show # or other On my i7-3400 I get about 6.5 Mpps with a single rule, and about 2.2 Mpps when going through a dummynet pipe. This is for a single process handling the traffic. ipfw-user/Makefile.kipfw000644 000423 000000 00000012173 12007522231 016012 0ustar00luigiwheel000000 000000 # gnu Makefile to build a userland version of the # kernel code for ipfw+dummynet # # The kernel code is compiled with appropriate flags to make # it see a kernel-like environment. # The userland emulation code is compiler with regular flags. # M is the current directory, used in recursive builds # so we allow it to be overridden include ../Makefile.inc VPATH = ../extra:../sys/netinet/ipfw:../sys/netinet:../sys/net M ?= $(shell pwd) OBJPATH = $(M)/../$(OBJDIR) ifeq ($(OSARCH),Darwin) CFLAGS2 += -D__BSD_VISIBLE EFILES_. += libutil.h EFILES_sys += condvar.h priv.h _lock.h rmlock.h EFILES_machine += in_cksum.h EFILES_netinet += ip_carp.h pim.h sctp.h EFILES_net += netisr.h vnet.h endif ifeq ($(OSARCH),Linux) CFLAGS2 += -D__BSD_VISIBLE CFLAGS2 += -include ../extra/linux_defs.h CFLAGS2 += -Wno-unused-but-set-variable EFILES_. 
+= libutil.h EFILES_sys += condvar.h priv.h _lock.h rmlock.h EFILES_sys += lock.h ucred.h # taskqueue.h EFILES_sys += sockio.h EFILES_machine += in_cksum.h EFILES_netinet += in_pcb.h ip_carp.h pim.h sctp.h tcp_var.h EFILES_net += if_types.h bpf.h netisr.h vnet.h EFILES_linux += module.h endif ifeq ($(OSARCH),Windows) CFLAGS2 += -D__BSD_VISIBLE # CFLAGS2 += -include ../extra/linux_defs.h CFLAGS2 += -Wno-unused-but-set-variable # EFILES_. += libutil.h # EFILES_sys += condvar.h priv.h _lock.h rmlock.h # EFILES_sys += lock.h ucred.h # taskqueue.h # EFILES_sys += sockio.h # EFILES_machine += in_cksum.h # EFILES_netinet += in_pcb.h ip_carp.h pim.h sctp.h tcp_var.h # EFILES_net += if_types.h bpf.h netisr.h vnet.h # EFILES_linux += module.h EFILES_sys += sockio.h EFILES_net += ethernet.h EFILES_sys += condvar.h priv.h socketvar.h ucred.h EFILES_net += vnet.h EFILES_netinet += in_pcb.h ip_carp.h pim.h sctp.h tcp_var.h endif NETMAP_FLAGS = -DWITH_NETMAP -I../$(NETMAP_INC) E_CFLAGS += $(INCDIRS) E_CFLAGS += -include $(M)/../extra/glue.h # headers E_CFLAGS += -include $(M)/../extra/missing.h # headers E_CFLAGS += -O2 -Wall -Werror -fno-strict-aliasing E_CFLAGS += -g E_CFLAGS += -DKERNEL_SIDE # build the kernel side of the firewall E_CFLAGS += -DUSERSPACE # communicate through userspace E_CFLAGS += $(EFLAGS) $(NETMAP_FLAGS) E_CFLAGS += -DINET E_CFLAGS += -DIPFIREWALL_DEFAULT_TO_ACCEPT E_CFLAGS += -D_BSD_SOURCE # many of the kernel headers need _KERNEL E_CFLAGS += -D_KERNEL E_CFLAGS += $(CFLAGS2) #ipfw + dummynet section, other parts are not compiled in SRCS_IPFW = ip_fw2.c ip_fw_pfil.c ip_fw_sockopt.c SRCS_IPFW += ip_fw_dynamic.c ip_fw_table.c SRCS_IPFW += ip_fw_log.c SRCS_IPFW += ip_dummynet.c ip_dn_io.c ip_dn_glue.c SRCS_IPFW += dn_heap.c SRCS_IPFW += dn_sched_fifo.c dn_sched_wf2q.c SRCS_IPFW += dn_sched_rr.c dn_sched_qfq.c SRCS_IPFW += dn_sched_prio.c SRCS_NET = radix.c SRCS_NETINET = in_cksum.c # Module glue and functions missing in linux IPFW_SRCS = $(SRCS_IPFW) 
$(SRCS_NET) $(SRCS_NETINET) IPFW_SRCS += ipfw2_mod.c # bsd_compat.c IPFW_SRCS += missing.c session.c nm_util.c netmap_io.c IPFW_CFLAGS= -DINET E_CFLAGS += -Dradix MOD := kipfw LIBS= -lpthread CFLAGS = $(E_CFLAGS) IPFW_OBJS= $(IPFW_SRCS:%.c=%.o) all: include_e $(MOD) # entries to create empty files EFILES_. += opt_inet.h opt_ipsec.h opt_ipdivert.h EFILES_. += opt_inet6.h opt_ipfw.h opt_mpath.h EFILES_. += opt_mbuf_stress_test.h opt_param.h EFILES_. += timeconv.h EFILES_altq += if_altq.h EFILES_net += pf_mtag.h if_var.h route.h if_clone.h EFILES_netinet += in_var.h ip_var.h udp_var.h EFILES_netinet6 += ip6_var.h EFILES_sys += proc.h sockopt.h sysctl.h # new EFILES_sys += mutex.h _mutex.h _rwlock.h rwlock.h EFILES_sys += eventhandler.h EFILES_sys += jail.h #EFILES += sys/_lock.h sys/_rwlock.h sys/rwlock.h sys/rmlock.h sys/_mutex.h sys/mutex.h #EFILES += sys/condvar.h sys/eventhandler.h # sys/domain.h #EFILES += sys/limits.h sys/lock.h sys/mutex.h sys/priv.h #EFILES += sys/proc.h sys/rwlock.h sys/socket.h sys/socketvar.h #EFILES += sys/sysctl.h sys/time.h sys/ucred.h #EFILES += vm/uma_int.h vm/vm_int.h vm/uma_dbg.h #EFILES += vm/vm_dbg.h vm/vm_page.h vm/vm.h #EFILES += sys/rwlock.h sys/sysctl.h # first make a list of directories from variable names EDIRS= $(subst EFILES_,,$(filter EFILES_%,$(.VARIABLES))) # then prepend the directory name to individual files. # $(empty) serves to interpret the following space literally, # and the ": = " substitution packs spaces into one. EFILES = $(foreach i,$(EDIRS),$(subst $(empty) , $(i)/, $(EFILES_$(i): = ))) include_e: -@echo "Building $(OBJPATH)/include_e ..." 
-$(HIDE) rm -rf $(OBJPATH)/include_e opt_* -$(HIDE) mkdir -p $(OBJPATH)/include_e -$(HIDE) (cd $(OBJPATH)/include_e; mkdir -p $(EDIRS); touch $(EFILES) ) $(IPFW_OBJS) : ../extra/glue.h ip_fw2.o ip_dummynet.o: # EFLAGS= -include missing.h radix.o:# CFLAGS += -U_KERNEL # session.o: CFLAGS = -O2 nm_util.o: CFLAGS = -O2 -Wall -Werror $(NETMAP_FLAGS) $(MOD): $(IPFW_OBJS) $(MSG) " LD $@" $(HIDE)$(CC) -o $@ $^ $(LIBS) clean: -rm -f *.o $(DN) $(MOD) -rm -rf include_e diff: @-(for i in $(SRCS_IPFW) ; do diff -ubw $(BSD_HEAD)/sys/netinet/ipfw/$$i .; done) @-(for i in $(SRCS_NET) ; do diff -ubw $(BSD_HEAD)/sys/net/$$i . ; done) @-(for i in $(SRCS_NETINET) ; do diff -ubw $(BSD_HEAD)/sys/netinet/$$i .; done) ipfw-user/kipfw-20120806.diffs000644 000423 000000 00000007575 12007741615 016313 0ustar00luigiwheel000000 000000 diff -urp --exclude Makefile /home/luigi/FreeBSD/head/sys/netinet/ipfw/ip_dn_io.c sys/netinet/ipfw/ip_dn_io.c --- /home/luigi/FreeBSD/head/sys/netinet/ipfw/ip_dn_io.c 2012-08-06 13:00:28.000000000 +0200 +++ sys/netinet/ipfw/ip_dn_io.c 2012-08-06 11:06:21.000000000 +0200 @@ -260,6 +255,24 @@ dn_tag_get(struct mbuf *m) static inline void mq_append(struct mq *q, struct mbuf *m) { +#ifdef USERSPACE + // buffers from netmap need to be copied + // XXX note that the routine is not expected to fail + ND("append %p to %p", m, q); + if (m->m_flags & M_STACK) { + struct mbuf *m_new; + void *p; + int l, ofs; + + ofs = m->m_data - m->__m_extbuf; + // XXX allocate + MGETHDR(m_new, M_DONTWAIT, MT_DATA); + ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", + m, m->__m_extbuf, m->__m_extlen, ofs, m_new); + p = m_new->__m_extbuf; /* new pointer */ + l = m_new->__m_extlen; /* new len */ + if (l <= m->__m_extlen) { + panic("extlen too large"); + } + + *m_new = *m; // copy + m_new->m_flags &= ~M_STACK; + m_new->__m_extbuf = p; // point to new buffer + pkt_copy(m->__m_extbuf, p, m->__m_extlen); + m_new->m_data = p + ofs; + m = m_new; + } +#endif /* USERSPACE */ if (q->head 
== NULL) q->head = m; else diff -urp --exclude Makefile /home/luigi/FreeBSD/head/sys/netinet/ipfw/ip_fw2.c sys/netinet/ipfw/ip_fw2.c --- /home/luigi/FreeBSD/head/sys/netinet/ipfw/ip_fw2.c 2012-08-03 15:53:27.000000000 +0200 +++ sys/netinet/ipfw/ip_fw2.c 2012-08-06 12:49:56.000000000 +0200 @@ -364,7 +364,7 @@ iface_match(struct ifnet *ifp, ipfw_insn return(1); } } else { -#ifdef __FreeBSD__ /* and OSX too ? */ +#if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); @@ -407,7 +407,7 @@ iface_match(struct ifnet *ifp, ipfw_insn static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { -#ifndef __FreeBSD__ +#if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct route ro; @@ -660,6 +660,9 @@ static int check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp, struct ucred **uc) { +#if defined(USERSPACE) + return 0; // not supported in userspace +#else #ifndef __FreeBSD__ /* XXX */ return cred_check(insn, proto, oif, @@ -762,6 +765,7 @@ check_uidgid(ipfw_insn_u32 *insn, struct match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]); return (match); #endif /* __FreeBSD__ */ +#endif /* not supported in userspace */ } /* @@ -1428,6 +1432,7 @@ do { \ key = htonl(dst_port); else if (v == 3) key = htonl(src_port); +#ifndef USERSPACE else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, @@ -1447,6 +1452,7 @@ do { \ #endif /* !__FreeBSD__ */ key = htonl(key); } else +#endif /* !USERSPACE */ break; } match = ipfw_lookup_table(chain, @@ -1875,6 +1881,7 @@ do { \ break; case O_SOCKARG: { +#ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; @@ -1915,6 +1922,7 @@ do { \ match = 1; } } +#endif /* !USERSPACE */ break; } diff -urp --exclude Makefile /home/luigi/FreeBSD/head/sys/netinet/ipfw/ip_fw_log.c sys/netinet/ipfw/ip_fw_log.c --- /home/luigi/FreeBSD/head/sys/netinet/ipfw/ip_fw_log.c 2012-08-06 12:48:48.000000000 +0200 
+++ sys/netinet/ipfw/ip_fw_log.c 2012-08-05 14:08:49.000000000 +0200 @@ -83,8 +83,15 @@ __FBSDID("$FreeBSD: head/sys/netinet/ipf #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) +#ifdef __APPLE__ +#undef snprintf +#define snprintf sprintf +#define SNPARGS(buf, len) buf + len +#define SNP(buf) buf +#else /* !__APPLE__ */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) +#endif /* !__APPLE__ */ #ifdef WITHOUT_BPF void ipfw-user/ipfw/altq.c000644 000423 000000 00000006375 11725221076 015325 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/altq.c 220802 2011-04-18 21:18:22Z glebius $ * * altq interface */ #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* in_addr */ #include /* * Map between current altq queue id numbers and names. 
*/ static TAILQ_HEAD(, pf_altq) altq_entries = TAILQ_HEAD_INITIALIZER(altq_entries); void altq_set_enabled(int enabled) { int pffd; pffd = open("/dev/pf", O_RDWR); if (pffd == -1) err(EX_UNAVAILABLE, "altq support opening pf(4) control device"); if (enabled) { if (ioctl(pffd, DIOCSTARTALTQ) != 0 && errno != EEXIST) err(EX_UNAVAILABLE, "enabling altq"); } else { if (ioctl(pffd, DIOCSTOPALTQ) != 0 && errno != ENOENT) err(EX_UNAVAILABLE, "disabling altq"); } close(pffd); } static void altq_fetch(void) { struct pfioc_altq pfioc; struct pf_altq *altq; int pffd; unsigned int mnr; static int altq_fetched = 0; if (altq_fetched) return; altq_fetched = 1; pffd = open("/dev/pf", O_RDONLY); if (pffd == -1) { warn("altq support opening pf(4) control device"); return; } bzero(&pfioc, sizeof(pfioc)); if (ioctl(pffd, DIOCGETALTQS, &pfioc) != 0) { warn("altq support getting queue list"); close(pffd); return; } mnr = pfioc.nr; for (pfioc.nr = 0; pfioc.nr < mnr; pfioc.nr++) { if (ioctl(pffd, DIOCGETALTQ, &pfioc) != 0) { if (errno == EBUSY) break; warn("altq support getting queue list"); close(pffd); return; } if (pfioc.altq.qid == 0) continue; altq = safe_calloc(1, sizeof(*altq)); *altq = pfioc.altq; TAILQ_INSERT_TAIL(&altq_entries, altq, entries); } close(pffd); } u_int32_t altq_name_to_qid(const char *name) { struct pf_altq *altq; altq_fetch(); TAILQ_FOREACH(altq, &altq_entries, entries) if (strcmp(name, altq->qname) == 0) break; if (altq == NULL) errx(EX_DATAERR, "altq has no queue named `%s'", name); return altq->qid; } static const char * altq_qid_to_name(u_int32_t qid) { struct pf_altq *altq; altq_fetch(); TAILQ_FOREACH(altq, &altq_entries, entries) if (qid == altq->qid) break; if (altq == NULL) return NULL; return altq->qname; } void print_altq_cmd(ipfw_insn_altq *altqptr) { if (altqptr) { const char *qname; qname = altq_qid_to_name(altqptr->qid); if (qname == NULL) printf(" altq ?<%u>", altqptr->qid); else printf(" altq %s", qname); } } ipfw-user/ipfw/dummynet.c000644 000423 
000000 00000105326 12007565005 016217 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2002-2003,2010 Luigi Rizzo * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * $FreeBSD: head/sbin/ipfw/dummynet.c 238900 2012-07-30 10:55:23Z luigi $ * * dummynet support */ #include #include /* XXX there are several sysctl leftover here */ #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* inet_ntoa */ static struct _s_x dummynet_params[] = { { "plr", TOK_PLR }, { "noerror", TOK_NOERROR }, { "buckets", TOK_BUCKETS }, { "dst-ip", TOK_DSTIP }, { "src-ip", TOK_SRCIP }, { "dst-port", TOK_DSTPORT }, { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "weight", TOK_WEIGHT }, { "lmax", TOK_LMAX }, { "maxlen", TOK_LMAX }, { "all", TOK_ALL }, { "mask", TOK_MASK }, /* alias for both */ { "sched_mask", TOK_SCHED_MASK }, { "flow_mask", TOK_FLOW_MASK }, { "droptail", TOK_DROPTAIL }, { "red", TOK_RED }, { "gred", TOK_GRED }, { "bw", TOK_BW }, { "bandwidth", TOK_BW }, { "delay", TOK_DELAY }, { "link", TOK_LINK }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, { "flowset", TOK_FLOWSET }, { "sched", TOK_SCHED }, { "pri", TOK_PRI }, { "priority", TOK_PRI }, { "type", TOK_TYPE }, { "flow-id", TOK_FLOWID}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, { "profile", TOK_PROFILE}, { "burst", TOK_BURST}, { "dummynet-params", TOK_NULL }, { NULL, 0 } /* terminator */ }; #define O_NEXT(p, len) ((void *)((char *)p + len)) static void oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) { 
oid->len = len; oid->type = type; oid->subtype = 0; oid->id = id; } /* make room in the buffer and move the pointer forward */ static void * o_next(struct dn_id **o, int len, int type) { struct dn_id *ret = *o; oid_fill(ret, len, type, 0); *o = O_NEXT(*o, len); return ret; } #if 0 static int sort_q(void *arg, const void *pa, const void *pb) { int rev = (co.do_sort < 0); int field = rev ? -co.do_sort : co.do_sort; long long res = 0; const struct dn_flow_queue *a = pa; const struct dn_flow_queue *b = pb; switch (field) { case 1: /* pkts */ res = a->len - b->len; break; case 2: /* bytes */ res = a->len_bytes - b->len_bytes; break; case 3: /* tot pkts */ res = a->tot_pkts - b->tot_pkts; break; case 4: /* tot bytes */ res = a->tot_bytes - b->tot_bytes; break; } if (res < 0) res = -1; if (res > 0) res = 1; return (int)(rev ? res : -res); } #endif /* print a mask and header for the subsequent list of flows */ static void print_mask(struct ipfw_flow_id *id) { if (!IS_IP6_FLOW_ID(id)) { printf(" " "mask: %s 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", id->extra ? "queue," : "", id->proto, id->src_ip, id->src_port, id->dst_ip, id->dst_port); } else { char buf[255]; printf("\n mask: %sproto: 0x%02x, flow_id: 0x%08x, ", id->extra ? "queue," : "", id->proto, id->flow_id6); inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); printf("%s/0x%04x -> ", buf, id->src_port); inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); printf("%s/0x%04x\n", buf, id->dst_port); } } static void print_header(struct ipfw_flow_id *id) { if (!IS_IP6_FLOW_ID(id)) printf("BKT Prot ___Source IP/port____ " "____Dest. IP/port____ " "Tot_pkt/bytes Pkt/Byte Drp\n"); else printf("BKT ___Prot___ _flow-id_ " "______________Source IPv6/port_______________ " "_______________Dest. 
IPv6/port_______________ " "Tot_pkt/bytes Pkt/Byte Drp\n"); } static void list_flow(struct dn_flow *ni, int *print) { char buff[255]; struct protoent *pe = NULL; struct in_addr ina; struct ipfw_flow_id *id = &ni->fid; if (*print) { print_header(&ni->fid); *print = 0; } pe = getprotobynumber(id->proto); /* XXX: Should check for IPv4 flows */ printf("%3u%c", (ni->oid.id) & 0xff, id->extra ? '*' : ' '); if (!IS_IP6_FLOW_ID(id)) { if (pe) printf("%-4s ", pe->p_name); else printf("%4u ", id->proto); ina.s_addr = htonl(id->src_ip); printf("%15s/%-5d ", inet_ntoa(ina), id->src_port); ina.s_addr = htonl(id->dst_ip); printf("%15s/%-5d ", inet_ntoa(ina), id->dst_port); } else { /* Print IPv6 flows */ if (pe != NULL) printf("%9s ", pe->p_name); else printf("%9u ", id->proto); printf("%7d %39s/%-5d ", id->flow_id6, inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), id->src_port); printf(" %39s/%-5d ", inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), id->dst_port); } pr_u64(&ni->tot_pkts, 4); pr_u64(&ni->tot_bytes, 8); printf("%2u %4u %3u\n", ni->length, ni->len_bytes, ni->drops); } static void print_flowset_parms(struct dn_fs *fs, char *prefix) { int l; char qs[30]; char plr[30]; char red[90]; /* Display RED parameters */ l = fs->qsize; if (fs->flags & DN_QSIZE_BYTES) { if (l >= 8192) sprintf(qs, "%d KB", l / 1024); else sprintf(qs, "%d B", l); } else sprintf(qs, "%3d sl.", l); if (fs->plr) sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); else plr[0] = '\0'; if (fs->flags & DN_IS_RED) /* RED parameters */ sprintf(red, "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", (fs->flags & DN_IS_GENTLE_RED) ? 
'G' : ' ', 1.0 * fs->w_q / (double)(1 << SCALE_RED), fs->min_th, fs->max_th, 1.0 * fs->max_p / (double)(1 << SCALE_RED)); else sprintf(red, "droptail"); if (prefix[0]) { printf("%s %s%s %d queues (%d buckets) %s\n", prefix, qs, plr, fs->oid.id, fs->buckets, red); prefix[0] = '\0'; } else { printf("q%05d %s%s %d flows (%d buckets) sched %d " "weight %d lmax %d pri %d %s\n", fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); if (fs->flags & DN_HAVE_MASK) print_mask(&fs->flow_mask); } } static void print_extra_delay_parms(struct dn_profile *p) { double loss; if (p->samples_no <= 0) return; loss = p->loss_level; loss /= p->samples_no; printf("\t profile: name \"%s\" loss %f samples %d\n", p->name, loss, p->samples_no); } static void flush_buf(char *buf) { if (buf[0]) printf("%s\n", buf); buf[0] = '\0'; } /* * generic list routine. We expect objects in a specific order, i.e. * PIPES AND SCHEDULERS: * link; scheduler; internal flowset if any; instances * we can tell a pipe from the number. 
* * FLOWSETS: * flowset; queues; * link i (int queue); scheduler i; si(i) { flowsets() : queues } */ static void list_pipes(struct dn_id *oid, struct dn_id *end) { char buf[160]; /* pending buffer */ int toPrint = 1; /* print header */ buf[0] = '\0'; for (; oid != end; oid = O_NEXT(oid, oid->len)) { if (oid->len < sizeof(*oid)) errx(1, "invalid oid len %d\n", oid->len); switch (oid->type) { default: flush_buf(buf); printf("unrecognized object %d size %d\n", oid->type, oid->len); break; case DN_TEXT: /* list of attached flowsets */ { int i, l; struct { struct dn_id id; uint32_t p[0]; } *d = (void *)oid; l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); if (l == 0) break; printf(" Children flowsets: "); for (i = 0; i < l; i++) printf("%u ", d->p[i]); printf("\n"); break; } case DN_CMD_GET: if (co.verbose) printf("answer for cmd %d, len %d\n", oid->type, oid->id); break; case DN_SCH: { struct dn_sch *s = (struct dn_sch *)oid; flush_buf(buf); printf(" sched %d type %s flags 0x%x %d buckets %d active\n", s->sched_nr, s->name, s->flags, s->buckets, s->oid.id); if (s->flags & DN_HAVE_MASK) print_mask(&s->sched_mask); } break; case DN_FLOW: list_flow((struct dn_flow *)oid, &toPrint); break; case DN_LINK: { struct dn_link *p = (struct dn_link *)oid; double b = p->bandwidth; char bwbuf[30]; char burst[5 + 7]; /* This starts a new object so flush buffer */ flush_buf(buf); /* data rate */ if (b == 0) sprintf(bwbuf, "unlimited "); else if (b >= 1000000) sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); else if (b >= 1000) sprintf(bwbuf, "%7.3f Kbit/s", b/1000); else sprintf(bwbuf, "%7.3f bit/s ", b); if (humanize_number(burst, sizeof(burst), p->burst, "", HN_AUTOSCALE, 0) < 0 || co.verbose) sprintf(burst, "%d", (int)p->burst); sprintf(buf, "%05d: %s %4d ms burst %s", p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); } break; case DN_FS: print_flowset_parms((struct dn_fs *)oid, buf); break; case DN_PROFILE: flush_buf(buf); print_extra_delay_parms((struct dn_profile *)oid); } 
flush_buf(buf); // XXX does it really go here ? } } /* * Delete pipe, queue or scheduler i */ int ipfw_delete_pipe(int do_pipe, int i) { struct { struct dn_id oid; uintptr_t a[1]; /* add more if we want a list */ } cmd; oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : ( (do_pipe == 2) ? DN_FS : DN_SCH); cmd.a[0] = i; i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); if (i) { i = 1; warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); } return i; } /* * Code to parse delay profiles. * * Some link types introduce extra delays in the transmission * of a packet, e.g. because of MAC level framing, contention on * the use of the channel, MAC level retransmissions and so on. * From our point of view, the channel is effectively unavailable * for this extra time, which is constant or variable depending * on the link type. Additionally, packets may be dropped after this * time (e.g. on a wireless link after too many retransmissions). * We can model the additional delay with an empirical curve * that represents its distribution. * * cumulative probability * 1.0 ^ * | * L +-- loss-level x * | ****** * | * * | ***** * | * * | ** * | * * +-------*-------------------> * delay * * The empirical curve may have both vertical and horizontal lines. * Vertical lines represent constant delay for a range of * probabilities; horizontal lines correspond to a discontinuty * in the delay distribution: the link will use the largest delay * for a given probability. * * To pass the curve to dummynet, we must store the parameters * in a file as described below, and issue the command * * ipfw pipe config ... bw XXX profile ... * * The file format is the following, with whitespace acting as * a separator and '#' indicating the beginning a comment: * * samples N * the number of samples used in the internal * representation (2..1024; default 100); * * loss-level L * The probability above which packets are lost. 
* (0.0 <= L <= 1.0, default 1.0 i.e. no loss); * * name identifier * Optional a name (listed by "ipfw pipe show") * to identify the distribution; * * "delay prob" | "prob delay" * One of these two lines is mandatory and defines * the format of the following lines with data points. * * XXX YYY * 2 or more lines representing points in the curve, * with either delay or probability first, according * to the chosen format. * The unit for delay is milliseconds. * * Data points does not need to be ordered or equal to the number * specified in the "samples" line. ipfw will sort and interpolate * the curve as needed. * * Example of a profile file: name bla_bla_bla samples 100 loss-level 0.86 prob delay 0 200 # minimum overhead is 200ms 0.5 200 0.5 300 0.8 1000 0.9 1300 1 1300 * Internally, we will convert the curve to a fixed number of * samples, and when it is time to transmit a packet we will * model the extra delay as extra bits in the packet. * */ #define ED_MAX_LINE_LEN 256+ED_MAX_NAME_LEN #define ED_TOK_SAMPLES "samples" #define ED_TOK_LOSS "loss-level" #define ED_TOK_NAME "name" #define ED_TOK_DELAY "delay" #define ED_TOK_PROB "prob" #define ED_TOK_BW "bw" #define ED_SEPARATORS " \t\n" #define ED_MIN_SAMPLES_NO 2 /* * returns 1 if s is a non-negative number, with at least one '.' */ static int is_valid_number(const char *s) { int i, dots_found = 0; int len = strlen(s); for (i = 0; i 1)) return 0; return 1; } /* * Take as input a string describing a bandwidth value * and return the numeric bandwidth value. 
* set clocking interface or bandwidth value */ static void read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) { if (*bandwidth != -1) warnx("duplicate token, override bandwidth value!"); if (arg[0] >= 'a' && arg[0] <= 'z') { if (!if_name) { errx(1, "no if support"); } if (namelen >= IFNAMSIZ) warn("interface name truncated"); namelen--; /* interface name */ strncpy(if_name, arg, namelen); if_name[namelen] = '\0'; *bandwidth = 0; } else { /* read bandwidth value */ int bw; char *end = NULL; bw = strtoul(arg, &end, 0); if (*end == 'K' || *end == 'k') { end++; bw *= 1000; } else if (*end == 'M' || *end == 'm') { end++; bw *= 1000000; } if ((*end == 'B' && _substrcmp2(end, "Bi", "Bit/s") != 0) || _substrcmp2(end, "by", "bytes") == 0) bw *= 8; if (bw < 0) errx(EX_DATAERR, "bandwidth too large"); *bandwidth = bw; if (if_name) if_name[0] = '\0'; } } struct point { double prob; double delay; }; static int compare_points(const void *vp1, const void *vp2) { const struct point *p1 = vp1; const struct point *p2 = vp2; double res = 0; res = p1->prob - p2->prob; if (res == 0) res = p1->delay - p2->delay; if (res < 0) return -1; else if (res > 0) return 1; else return 0; } #define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno static void load_extra_delays(const char *filename, struct dn_profile *p, struct dn_link *link) { char line[ED_MAX_LINE_LEN]; FILE *f; int lineno = 0; int i; int samples = -1; double loss = -1.0; char profile_name[ED_MAX_NAME_LEN]; int delay_first = -1; int do_points = 0; struct point points[ED_MAX_SAMPLES_NO]; int points_no = 0; /* XXX link never NULL? 
*/ p->link_nr = link->link_nr; profile_name[0] = '\0'; f = fopen(filename, "r"); if (f == NULL) err(EX_UNAVAILABLE, "fopen: %s", filename); while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ char *s, *cur = line, *name = NULL, *arg = NULL; ++lineno; /* parse the line */ while (cur) { s = strsep(&cur, ED_SEPARATORS); if (s == NULL || *s == '#') break; if (*s == '\0') continue; if (arg) errx(ED_EFMT("too many arguments")); if (name == NULL) name = s; else arg = s; } if (name == NULL) /* empty line */ continue; if (arg == NULL) errx(ED_EFMT("missing arg for %s"), name); if (!strcasecmp(name, ED_TOK_SAMPLES)) { if (samples > 0) errx(ED_EFMT("duplicate ``samples'' line")); if (atoi(arg) <=0) errx(ED_EFMT("invalid number of samples")); samples = atoi(arg); if (samples>ED_MAX_SAMPLES_NO) errx(ED_EFMT("too many samples, maximum is %d"), ED_MAX_SAMPLES_NO); do_points = 0; } else if (!strcasecmp(name, ED_TOK_BW)) { char buf[IFNAMSIZ]; read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); } else if (!strcasecmp(name, ED_TOK_LOSS)) { if (loss != -1.0) errx(ED_EFMT("duplicated token: %s"), name); if (!is_valid_number(arg)) errx(ED_EFMT("invalid %s"), arg); loss = atof(arg); if (loss > 1) errx(ED_EFMT("%s greater than 1.0"), name); do_points = 0; } else if (!strcasecmp(name, ED_TOK_NAME)) { if (profile_name[0] != '\0') errx(ED_EFMT("duplicated token: %s"), name); strncpy(profile_name, arg, sizeof(profile_name) - 1); profile_name[sizeof(profile_name)-1] = '\0'; do_points = 0; } else if (!strcasecmp(name, ED_TOK_DELAY)) { if (do_points) errx(ED_EFMT("duplicated token: %s"), name); delay_first = 1; do_points = 1; } else if (!strcasecmp(name, ED_TOK_PROB)) { if (do_points) errx(ED_EFMT("duplicated token: %s"), name); delay_first = 0; do_points = 1; } else if (do_points) { if (!is_valid_number(name) || !is_valid_number(arg)) errx(ED_EFMT("invalid point found")); if (delay_first) { points[points_no].delay = atof(name); points[points_no].prob = atof(arg); } else { 
points[points_no].delay = atof(arg); points[points_no].prob = atof(name); } if (points[points_no].prob > 1.0) errx(ED_EFMT("probability greater than 1.0")); ++points_no; } else { errx(ED_EFMT("unrecognised command '%s'"), name); } } fclose (f); if (samples == -1) { warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); samples = 100; } if (loss == -1.0) { warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); loss = 1; } /* make sure that there are enough points. */ if (points_no < ED_MIN_SAMPLES_NO) errx(ED_EFMT("too few samples, need at least %d"), ED_MIN_SAMPLES_NO); qsort(points, points_no, sizeof(struct point), compare_points); /* interpolation */ for (i = 0; isamples[ix] = x1; } else { double m = (y2-y1)/(x2-x1); double c = y1 - m*x1; for (; ixsamples[ix] = (ix - c)/m; } } p->samples_no = samples; p->loss_level = loss * samples; strncpy(p->name, profile_name, sizeof(p->name)); } /* * configuration of pipes, schedulers, flowsets. * When we configure a new scheduler, an empty pipe is created, so: * * do_pipe = 1 -> "pipe N config ..." only for backward compatibility * sched N+Delta type fifo sched_mask ... * pipe N+Delta * flowset N+Delta pipe N+Delta (no parameters) * sched N type wf2q+ sched_mask ... * pipe N * * do_pipe = 2 -> flowset N config * flowset N parameters * * do_pipe = 3 -> sched N config * sched N parameters (default no pipe) * optional Pipe N config ... 
* pipe ==> */ void ipfw_config_pipe(int ac, char **av) { int i; u_int j; char *end; struct dn_id *buf, *base; struct dn_sch *sch = NULL; struct dn_link *p = NULL; struct dn_fs *fs = NULL; struct dn_profile *pf = NULL; struct ipfw_flow_id *mask = NULL; int lmax; uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; /* * allocate space for 1 header, * 1 scheduler, 1 link, 1 flowset, 1 profile */ lmax = sizeof(struct dn_id); /* command header */ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + sizeof(struct dn_fs) + sizeof(struct dn_profile); av++; ac--; /* Pipe number */ if (ac && isdigit(**av)) { i = atoi(*av); av++; ac--; } else i = -1; if (i <= 0) errx(EX_USAGE, "need a pipe/flowset/sched number"); base = buf = safe_calloc(1, lmax); /* all commands start with a 'CONFIGURE' and a version */ o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); base->id = DN_API_VERSION; switch (co.do_pipe) { case 1: /* "pipe N config ..." */ /* Allocate space for the WF2Q+ scheduler, its link * and the FIFO flowset. Set the number, but leave * the scheduler subtype and other parameters to 0 * so the kernel will use appropriate defaults. * XXX todo: add a flag to record if a parameter * is actually configured. * If we do a 'pipe config' mask -> sched_mask. * The FIFO scheduler and link are derived from the * WF2Q+ one in the kernel. */ sch = o_next(&buf, sizeof(*sch), DN_SCH); p = o_next(&buf, sizeof(*p), DN_LINK); fs = o_next(&buf, sizeof(*fs), DN_FS); sch->sched_nr = i; sch->oid.subtype = 0; /* defaults to WF2Q+ */ mask = &sch->sched_mask; flags = &sch->flags; buckets = &sch->buckets; *flags |= DN_PIPE_CMD; p->link_nr = i; /* This flowset is only for the FIFO scheduler */ fs->fs_nr = i + 2*DN_MAX_ID; fs->sched_nr = i + DN_MAX_ID; break; case 2: /* "queue N config ... " */ fs = o_next(&buf, sizeof(*fs), DN_FS); fs->fs_nr = i; mask = &fs->flow_mask; flags = &fs->flags; buckets = &fs->buckets; break; case 3: /* "sched N config ..." 
*/ sch = o_next(&buf, sizeof(*sch), DN_SCH); fs = o_next(&buf, sizeof(*fs), DN_FS); sch->sched_nr = i; mask = &sch->sched_mask; flags = &sch->flags; buckets = &sch->buckets; /* fs is used only with !MULTIQUEUE schedulers */ fs->fs_nr = i + DN_MAX_ID; fs->sched_nr = i; break; } /* set to -1 those fields for which we want to reuse existing * values from the kernel. * Also, *_nr and subtype = 0 mean reuse the value from the kernel. * XXX todo: support reuse of the mask. */ if (p) p->bandwidth = -1; for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) fs->par[j] = -1; while (ac > 0) { double d; int tok = match_token(dummynet_params, *av); ac--; av++; switch(tok) { case TOK_NOERROR: NEED(fs, "noerror is only for pipes"); fs->flags |= DN_NOERROR; break; case TOK_PLR: NEED(fs, "plr is only for pipes"); NEED1("plr needs argument 0..1\n"); d = strtod(av[0], NULL); if (d > 1) d = 1; else if (d < 0) d = 0; fs->plr = (int)(d*0x7fffffff); ac--; av++; break; case TOK_QUEUE: NEED(fs, "queue is only for pipes or flowsets"); NEED1("queue needs queue size\n"); end = NULL; fs->qsize = strtoul(av[0], &end, 0); if (*end == 'K' || *end == 'k') { fs->flags |= DN_QSIZE_BYTES; fs->qsize *= 1024; } else if (*end == 'B' || _substrcmp2(end, "by", "bytes") == 0) { fs->flags |= DN_QSIZE_BYTES; } ac--; av++; break; case TOK_BUCKETS: NEED(fs, "buckets is only for pipes or flowsets"); NEED1("buckets needs argument\n"); *buckets = strtoul(av[0], NULL, 0); ac--; av++; break; case TOK_FLOW_MASK: case TOK_SCHED_MASK: case TOK_MASK: NEED(mask, "tok_mask"); NEED1("mask needs mask specifier\n"); /* * per-flow queue, mask is dst_ip, dst_port, * src_ip, src_port, proto measured in bits */ bzero(mask, sizeof(*mask)); end = NULL; while (ac >= 1) { uint32_t *p32 = NULL; uint16_t *p16 = NULL; uint32_t *p20 = NULL; struct in6_addr *pa6 = NULL; uint32_t a; tok = match_token(dummynet_params, *av); ac--; av++; switch(tok) { case TOK_ALL: /* * special case, all bits significant * except 'extra' (the queue 
number) */ mask->dst_ip = ~0; mask->src_ip = ~0; mask->dst_port = ~0; mask->src_port = ~0; mask->proto = ~0; n2mask(&mask->dst_ip6, 128); n2mask(&mask->src_ip6, 128); mask->flow_id6 = ~0; *flags |= DN_HAVE_MASK; goto end_mask; case TOK_QUEUE: mask->extra = ~0; *flags |= DN_HAVE_MASK; goto end_mask; case TOK_DSTIP: mask->addr_type = 4; p32 = &mask->dst_ip; break; case TOK_SRCIP: mask->addr_type = 4; p32 = &mask->src_ip; break; case TOK_DSTIP6: mask->addr_type = 6; pa6 = &mask->dst_ip6; break; case TOK_SRCIP6: mask->addr_type = 6; pa6 = &mask->src_ip6; break; case TOK_FLOWID: mask->addr_type = 6; p20 = &mask->flow_id6; break; case TOK_DSTPORT: p16 = &mask->dst_port; break; case TOK_SRCPORT: p16 = &mask->src_port; break; case TOK_PROTO: break; default: ac++; av--; /* backtrack */ goto end_mask; } if (ac < 1) errx(EX_USAGE, "mask: value missing"); if (*av[0] == '/') { a = strtoul(av[0]+1, &end, 0); if (pa6 == NULL) a = (a == 32) ? ~0 : (1 << a) - 1; } else a = strtoul(av[0], &end, 0); if (p32 != NULL) *p32 = a; else if (p16 != NULL) { if (a > 0xFFFF) errx(EX_DATAERR, "port mask must be 16 bit"); *p16 = (uint16_t)a; } else if (p20 != NULL) { if (a > 0xfffff) errx(EX_DATAERR, "flow_id mask must be 20 bit"); *p20 = (uint32_t)a; } else if (pa6 != NULL) { if (a > 128) errx(EX_DATAERR, "in6addr invalid mask len"); else n2mask(pa6, a); } else { if (a > 0xFF) errx(EX_DATAERR, "proto mask must be 8 bit"); mask->proto = (uint8_t)a; } if (a != 0) *flags |= DN_HAVE_MASK; ac--; av++; } /* end while, config masks */ end_mask: break; case TOK_RED: case TOK_GRED: NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); fs->flags |= DN_IS_RED; if (tok == TOK_GRED) fs->flags |= DN_IS_GENTLE_RED; /* * the format for parameters is w_q/min_th/max_th/max_p */ if ((end = strsep(&av[0], "/"))) { double w_q = strtod(end, NULL); if (w_q > 1 || w_q <= 0) errx(EX_DATAERR, "0 < w_q <= 1"); fs->w_q = (int) (w_q * (1 << SCALE_RED)); } if ((end = strsep(&av[0], "/"))) { fs->min_th = strtoul(end, &end, 0); 
if (*end == 'K' || *end == 'k') fs->min_th *= 1024; } if ((end = strsep(&av[0], "/"))) { fs->max_th = strtoul(end, &end, 0); if (*end == 'K' || *end == 'k') fs->max_th *= 1024; } if ((end = strsep(&av[0], "/"))) { double max_p = strtod(end, NULL); if (max_p > 1 || max_p <= 0) errx(EX_DATAERR, "0 < max_p <= 1"); fs->max_p = (int)(max_p * (1 << SCALE_RED)); } ac--; av++; break; case TOK_DROPTAIL: NEED(fs, "droptail is only for flowsets"); fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); break; case TOK_BW: NEED(p, "bw is only for links"); NEED1("bw needs bandwidth or interface\n"); read_bandwidth(av[0], &p->bandwidth, NULL, 0); ac--; av++; break; case TOK_DELAY: NEED(p, "delay is only for links"); NEED1("delay needs argument 0..10000ms\n"); p->delay = strtoul(av[0], NULL, 0); ac--; av++; break; case TOK_TYPE: { int l; NEED(sch, "type is only for schedulers"); NEED1("type needs a string"); l = strlen(av[0]); if (l == 0 || l > 15) errx(1, "type %s too long\n", av[0]); strcpy(sch->name, av[0]); sch->oid.subtype = 0; /* use string */ ac--; av++; break; } case TOK_WEIGHT: NEED(fs, "weight is only for flowsets"); NEED1("weight needs argument\n"); fs->par[0] = strtol(av[0], &end, 0); ac--; av++; break; case TOK_LMAX: NEED(fs, "lmax is only for flowsets"); NEED1("lmax needs argument\n"); fs->par[1] = strtol(av[0], &end, 0); ac--; av++; break; case TOK_PRI: NEED(fs, "priority is only for flowsets"); NEED1("priority needs argument\n"); fs->par[2] = strtol(av[0], &end, 0); ac--; av++; break; case TOK_SCHED: case TOK_PIPE: NEED(fs, "pipe/sched"); NEED1("pipe/link/sched needs number\n"); fs->sched_nr = strtoul(av[0], &end, 0); ac--; av++; break; case TOK_PROFILE: NEED((!pf), "profile already set"); NEED(p, "profile"); { NEED1("extra delay needs the file name\n"); pf = o_next(&buf, sizeof(*pf), DN_PROFILE); load_extra_delays(av[0], pf, p); //XXX can't fail? 
--ac; ++av; } break; case TOK_BURST: NEED(p, "burst"); NEED1("burst needs argument\n"); errno = 0; if (expand_number(av[0], &p->burst) < 0) if (errno != ERANGE) errx(EX_DATAERR, "burst: invalid argument"); if (errno || p->burst > (1ULL << 48) - 1) errx(EX_DATAERR, "burst: out of range (0..2^48-1)"); ac--; av++; break; default: errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); } } /* check validity of parameters */ if (p) { if (p->delay > 10000) errx(EX_DATAERR, "delay must be < 10000"); if (p->bandwidth == -1) p->bandwidth = 0; } if (fs) { /* XXX accept a 0 scheduler to keep the default */ if (fs->flags & DN_QSIZE_BYTES) { size_t len; long limit; len = sizeof(limit); if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", &limit, &len, NULL, 0) == -1) limit = 1024*1024; if (fs->qsize > limit) errx(EX_DATAERR, "queue size must be < %ldB", limit); } else { size_t len; long limit; len = sizeof(limit); if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", &limit, &len, NULL, 0) == -1) limit = 100; if (fs->qsize > limit) errx(EX_DATAERR, "2 <= queue size <= %ld", limit); } if (fs->flags & DN_IS_RED) { size_t len; int lookup_depth, avg_pkt_size; if (fs->min_th >= fs->max_th) errx(EX_DATAERR, "min_th %d must be < than max_th %d", fs->min_th, fs->max_th); if (fs->max_th == 0) errx(EX_DATAERR, "max_th must be > 0"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", &lookup_depth, &len, NULL, 0) == -1) lookup_depth = 256; if (lookup_depth == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" " must be greater than zero"); len = sizeof(int); if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", &avg_pkt_size, &len, NULL, 0) == -1) avg_pkt_size = 512; if (avg_pkt_size == 0) errx(EX_DATAERR, "net.inet.ip.dummynet.red_avg_pkt_size must" " be greater than zero"); #if 0 /* the following computation is now done in the kernel */ /* * Ticks needed for sending a medium-sized packet. 
* Unfortunately, when we are configuring a WF2Q+ queue, we * do not have bandwidth information, because that is stored * in the parent pipe, and also we have multiple queues * competing for it. So we set s=0, which is not very * correct. But on the other hand, why do we want RED with * WF2Q+ ? */ if (p.bandwidth==0) /* this is a WF2Q+ queue */ s = 0; else s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; /* * max idle time (in ticks) before avg queue size becomes 0. * NOTA: (3/w_q) is approx the value x so that * (1-w_q)^x < 10^-3. */ w_q = ((double)fs->w_q) / (1 << SCALE_RED); idle = s * 3. / w_q; fs->lookup_step = (int)idle / lookup_depth; if (!fs->lookup_step) fs->lookup_step = 1; weight = 1 - w_q; for (t = fs->lookup_step; t > 1; --t) weight *= 1 - w_q; fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); #endif /* code moved in the kernel */ } } i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); if (i) err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); } void dummynet_flush(void) { struct dn_id oid; oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); do_cmd(IP_DUMMYNET3, &oid, oid.len); } /* Parse input for 'ipfw [pipe|sched|queue] show [range list]' * Returns the number of ranges, and possibly stores them * in the array v of size len. */ static int parse_range(int ac, char *av[], uint32_t *v, int len) { int n = 0; char *endptr, *s; uint32_t base[2]; if (v == NULL || len < 2) { v = base; len = 2; } for (s = *av; s != NULL; av++, ac--) { v[0] = strtoul(s, &endptr, 10); v[1] = (*endptr != '-') ? v[0] : strtoul(endptr+1, &endptr, 10); if (*endptr == '\0') { /* prepare for next round */ s = (ac > 0) ? 
*(av+1) : NULL; } else { if (*endptr != ',') { warn("invalid number: %s", s); s = ++endptr; continue; } /* continue processing from here */ s = ++endptr; ac++; av--; } if (v[1] < v[0] || v[1] >= DN_MAX_ID-1 || v[1] >= DN_MAX_ID-1) { continue; /* invalid entry */ } n++; /* translate if 'pipe list' */ if (co.do_pipe == 1) { v[0] += DN_MAX_ID; v[1] += DN_MAX_ID; } v = (n*2 < len) ? v + 2 : base; } return n; } /* main entry point for dummynet list functions. co.do_pipe indicates * which function we want to support. * av may contain filtering arguments, either individual entries * or ranges, or lists (space or commas are valid separators). * Format for a range can be n1-n2 or n3 n4 n5 ... * In a range n1 must be <= n2, otherwise the range is ignored. * A number 'n4' is translate in a range 'n4-n4' * All number must be > 0 and < DN_MAX_ID-1 */ void dummynet_list(int ac, char *av[], int show_counters) { struct dn_id *oid, *x = NULL; int ret, i; int n; /* # of ranges */ u_int buflen, l; u_int max_size; /* largest obj passed up */ (void)show_counters; // XXX unused, but we should use it. ac--; av++; /* skip 'list' | 'show' word */ n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ /* Allocate space to store ranges */ l = sizeof(*oid) + sizeof(uint32_t) * n * 2; oid = safe_calloc(1, l); oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); if (n > 0) /* store ranges in idx */ parse_range(ac, av, (uint32_t *)(oid + 1), n*2); /* * Compute the size of the largest object returned. If the * response leaves at least this much spare space in the * buffer, then surely the response is complete; otherwise * there might be a risk of truncation and we will need to * retry with a larger buffer. * XXX don't bother with smaller structs. 
*/ max_size = sizeof(struct dn_fs); if (max_size < sizeof(struct dn_sch)) max_size = sizeof(struct dn_sch); if (max_size < sizeof(struct dn_flow)) max_size = sizeof(struct dn_flow); switch (co.do_pipe) { case 1: oid->subtype = DN_LINK; /* list pipe */ break; case 2: oid->subtype = DN_FS; /* list queue */ break; case 3: oid->subtype = DN_SCH; /* list sched */ break; } /* * Ask the kernel an estimate of the required space (result * in oid.id), unless we are requesting a subset of objects, * in which case the kernel does not give an exact answer. * In any case, space might grow in the meantime due to the * creation of new queues, so we must be prepared to retry. */ if (n > 0) { buflen = 4*1024; } else { ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); if (ret != 0 || oid->id <= sizeof(*oid)) goto done; buflen = oid->id + max_size; oid->len = sizeof(*oid); /* restore */ } /* Try a few times, until the buffer fits */ for (i = 0; i < 20; i++) { l = buflen; x = safe_realloc(x, l); bcopy(oid, x, oid->len); ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); if (ret != 0 || x->id <= sizeof(*oid)) goto done; /* no response */ if (l + max_size <= buflen) break; /* ok */ buflen *= 2; /* double for next attempt */ } list_pipes(x, O_NEXT(x, l)); done: if (x) free(x); free(oid); } ipfw-user/ipfw/nat.c000644 000423 000000 00000056301 12007565005 015135 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. 
* * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/nat.c 238900 2012-07-30 10:55:23Z luigi $ * * In-kernel nat support */ #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #define IPFW_INTERNAL /* Access to protected structures in ip_fw.h. */ #include #include #include /* def. of struct route */ #include #include #include #include static struct _s_x nat_params[] = { { "ip", TOK_IP }, { "if", TOK_IF }, { "log", TOK_ALOG }, { "deny_in", TOK_DENY_INC }, { "same_ports", TOK_SAME_PORTS }, { "unreg_only", TOK_UNREG_ONLY }, { "skip_global", TOK_SKIP_GLOBAL }, { "reset", TOK_RESET_ADDR }, { "reverse", TOK_ALIAS_REV }, { "proxy_only", TOK_PROXY_ONLY }, { "redirect_addr", TOK_REDIR_ADDR }, { "redirect_port", TOK_REDIR_PORT }, { "redirect_proto", TOK_REDIR_PROTO }, { NULL, 0 } /* terminator */ }; /* * Search for interface with name "ifn", and fill n accordingly: * * n->ip ip address of interface "ifn" * n->if_name copy of interface name "ifn" */ static void set_addr_dynamic(const char *ifn, struct cfg_nat *n) { size_t needed; int mib[6]; char *buf, *lim, *next; struct if_msghdr *ifm; struct ifa_msghdr *ifam; struct sockaddr_dl *sdl; struct sockaddr_in *sin; int ifIndex, ifMTU; mib[0] = CTL_NET; mib[1] = PF_ROUTE; mib[2] = 0; mib[3] = AF_INET; mib[4] = NET_RT_IFLIST; mib[5] = 0; /* * Get interface data. */ if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) err(1, "iflist-sysctl-estimate"); buf = safe_calloc(1, needed); if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) err(1, "iflist-sysctl-get"); lim = buf + needed; /* * Loop through interfaces until one with * given name is found. This is done to * find correct interface index for routing * message processing. 
*/ ifIndex = 0; next = buf; while (next < lim) { ifm = (struct if_msghdr *)next; next += ifm->ifm_msglen; if (ifm->ifm_version != RTM_VERSION) { if (co.verbose) warnx("routing message version %d " "not understood", ifm->ifm_version); continue; } if (ifm->ifm_type == RTM_IFINFO) { sdl = (struct sockaddr_dl *)(ifm + 1); if (strlen(ifn) == sdl->sdl_nlen && strncmp(ifn, sdl->sdl_data, sdl->sdl_nlen) == 0) { ifIndex = ifm->ifm_index; ifMTU = ifm->ifm_data.ifi_mtu; break; } } } if (!ifIndex) errx(1, "unknown interface name %s", ifn); /* * Get interface address. */ sin = NULL; while (next < lim) { ifam = (struct ifa_msghdr *)next; next += ifam->ifam_msglen; if (ifam->ifam_version != RTM_VERSION) { if (co.verbose) warnx("routing message version %d " "not understood", ifam->ifam_version); continue; } if (ifam->ifam_type != RTM_NEWADDR) break; if (ifam->ifam_addrs & RTA_IFA) { int i; char *cp = (char *)(ifam + 1); for (i = 1; i < RTA_IFA; i <<= 1) { if (ifam->ifam_addrs & i) cp += SA_SIZE((struct sockaddr *)cp); } if (((struct sockaddr *)cp)->sa_family == AF_INET) { sin = (struct sockaddr_in *)cp; break; } } } if (sin == NULL) errx(1, "%s: cannot get interface address", ifn); n->ip = sin->sin_addr; strncpy(n->if_name, ifn, IF_NAMESIZE); free(buf); } /* * XXX - The following functions, macros and definitions come from natd.c: * it would be better to move them outside natd.c, in a file * (redirect_support.[ch]?) shared by ipfw and natd, but for now i can live * with it. */ /* * Definition of a port range, and macros to deal with values. * FORMAT: HI 16-bits == first port in range, 0 == all ports. * LO 16-bits == number of ports in range * NOTES: - Port values are not stored in network byte order. */ #define port_range u_long #define GETLOPORT(x) ((x) >> 0x10) #define GETNUMPORTS(x) ((x) & 0x0000ffff) #define GETHIPORT(x) (GETLOPORT((x)) + GETNUMPORTS((x))) /* Set y to be the low-port value in port_range variable x. 
*/ #define SETLOPORT(x,y) ((x) = ((x) & 0x0000ffff) | ((y) << 0x10)) /* Set y to be the number of ports in port_range variable x. */ #define SETNUMPORTS(x,y) ((x) = ((x) & 0xffff0000) | (y)) static void StrToAddr (const char* str, struct in_addr* addr) { struct hostent* hp; if (inet_aton (str, addr)) return; hp = gethostbyname (str); if (!hp) errx (1, "unknown host %s", str); memcpy (addr, hp->h_addr, sizeof (struct in_addr)); } static int StrToPortRange (const char* str, const char* proto, port_range *portRange) { char* sep; struct servent* sp; char* end; u_short loPort; u_short hiPort; /* First see if this is a service, return corresponding port if so. */ sp = getservbyname (str,proto); if (sp) { SETLOPORT(*portRange, ntohs(sp->s_port)); SETNUMPORTS(*portRange, 1); return 0; } /* Not a service, see if it's a single port or port range. */ sep = strchr (str, '-'); if (sep == NULL) { SETLOPORT(*portRange, strtol(str, &end, 10)); if (end != str) { /* Single port. */ SETNUMPORTS(*portRange, 1); return 0; } /* Error in port range field. */ errx (EX_DATAERR, "%s/%s: unknown service", str, proto); } /* Port range, get the values and sanity check. */ sscanf (str, "%hu-%hu", &loPort, &hiPort); SETLOPORT(*portRange, loPort); SETNUMPORTS(*portRange, 0); /* Error by default */ if (loPort <= hiPort) SETNUMPORTS(*portRange, hiPort - loPort + 1); if (GETNUMPORTS(*portRange) == 0) errx (EX_DATAERR, "invalid port range %s", str); return 0; } static int StrToProto (const char* str) { if (!strcmp (str, "tcp")) return IPPROTO_TCP; if (!strcmp (str, "udp")) return IPPROTO_UDP; if (!strcmp (str, "sctp")) return IPPROTO_SCTP; errx (EX_DATAERR, "unknown protocol %s. 
Expected sctp, tcp or udp", str); } static int StrToAddrAndPortRange (const char* str, struct in_addr* addr, char* proto, port_range *portRange) { char* ptr; ptr = strchr (str, ':'); if (!ptr) errx (EX_DATAERR, "%s is missing port number", str); *ptr = '\0'; ++ptr; StrToAddr (str, addr); return StrToPortRange (ptr, proto, portRange); } /* End of stuff taken from natd.c. */ /* * The next 3 functions add support for the addr, port and proto redirect and * their logic is loosely based on SetupAddressRedirect(), SetupPortRedirect() * and SetupProtoRedirect() from natd.c. * * Every setup_* function fills at least one redirect entry * (struct cfg_redir) and zero or more server pool entry (struct cfg_spool) * in buf. * * The format of data in buf is: * * cfg_nat cfg_redir cfg_spool ...... cfg_spool * * ------------------------------------- ------------ * | | .....X ... | | | | ..... * ------------------------------------- ...... ------------ * ^ * spool_cnt n=0 ...... n=(X-1) * * len points to the amount of available space in buf * space counts the memory consumed by every function * * XXX - Every function get all the argv params so it * has to check, in optional parameters, that the next * args is a valid option for the redir entry and not * another token. Only redir_port and redir_proto are * affected by this. */ static int estimate_redir_addr(int *ac, char ***av) { size_t space = sizeof(struct cfg_redir); char *sep = **av; u_int c = 0; (void)ac; /* UNUSED */ while ((sep = strchr(sep, ',')) != NULL) { c++; sep++; } if (c > 0) c++; space += c * sizeof(struct cfg_spool); return (space); } static int setup_redir_addr(char *buf, int *ac, char ***av) { struct cfg_redir *r; char *sep; size_t space; r = (struct cfg_redir *)buf; r->mode = REDIR_ADDR; /* Skip cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct cfg_redir)]; space = sizeof(struct cfg_redir); /* Extract local address. */ if (strchr(**av, ',') != NULL) { struct cfg_spool *spool; /* Setup LSNAT server pool. 
*/ r->laddr.s_addr = INADDR_NONE; sep = strtok(**av, ","); while (sep != NULL) { spool = (struct cfg_spool *)buf; space += sizeof(struct cfg_spool); StrToAddr(sep, &spool->addr); spool->port = ~0; r->spool_cnt++; /* Point to the next possible cfg_spool. */ buf = &buf[sizeof(struct cfg_spool)]; sep = strtok(NULL, ","); } } else StrToAddr(**av, &r->laddr); (*av)++; (*ac)--; /* Extract public address. */ StrToAddr(**av, &r->paddr); (*av)++; (*ac)--; return (space); } static int estimate_redir_port(int *ac, char ***av) { size_t space = sizeof(struct cfg_redir); char *sep = **av; u_int c = 0; (void)ac; /* UNUSED */ while ((sep = strchr(sep, ',')) != NULL) { c++; sep++; } if (c > 0) c++; space += c * sizeof(struct cfg_spool); return (space); } static int setup_redir_port(char *buf, int *ac, char ***av) { struct cfg_redir *r; char *sep, *protoName, *lsnat = NULL; size_t space; u_short numLocalPorts; port_range portRange; numLocalPorts = 0; r = (struct cfg_redir *)buf; r->mode = REDIR_PORT; /* Skip cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct cfg_redir)]; space = sizeof(struct cfg_redir); /* * Extract protocol. */ r->proto = StrToProto(**av); protoName = **av; (*av)++; (*ac)--; /* * Extract local address. */ if ((sep = strchr(**av, ',')) != NULL) { r->laddr.s_addr = INADDR_NONE; r->lport = ~0; numLocalPorts = 1; lsnat = **av; } else { /* * The sctp nat does not allow the port numbers to be mapped to * new port numbers. Therefore, no ports are to be specified * in the target port field. 
*/ if (r->proto == IPPROTO_SCTP) { if (strchr(**av, ':')) errx(EX_DATAERR, "redirect_port:" "port numbers do not change in sctp, so do " "not specify them as part of the target"); else StrToAddr(**av, &r->laddr); } else { if (StrToAddrAndPortRange(**av, &r->laddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid local port range"); r->lport = GETLOPORT(portRange); numLocalPorts = GETNUMPORTS(portRange); } } (*av)++; (*ac)--; /* * Extract public port and optionally address. */ if ((sep = strchr(**av, ':')) != NULL) { if (StrToAddrAndPortRange(**av, &r->paddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid public port range"); } else { r->paddr.s_addr = INADDR_ANY; if (StrToPortRange(**av, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid public port range"); } r->pport = GETLOPORT(portRange); if (r->proto == IPPROTO_SCTP) { /* so the logic below still works */ numLocalPorts = GETNUMPORTS(portRange); r->lport = r->pport; } r->pport_cnt = GETNUMPORTS(portRange); (*av)++; (*ac)--; /* * Extract remote address and optionally port. */ /* * NB: isdigit(**av) => we've to check that next parameter is really an * option for this redirect entry, else stop here processing arg[cv]. */ if (*ac != 0 && isdigit(***av)) { if ((sep = strchr(**av, ':')) != NULL) { if (StrToAddrAndPortRange(**av, &r->raddr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port: " "invalid remote port range"); } else { SETLOPORT(portRange, 0); SETNUMPORTS(portRange, 1); StrToAddr(**av, &r->raddr); } (*av)++; (*ac)--; } else { SETLOPORT(portRange, 0); SETNUMPORTS(portRange, 1); r->raddr.s_addr = INADDR_ANY; } r->rport = GETLOPORT(portRange); r->rport_cnt = GETNUMPORTS(portRange); /* * Make sure port ranges match up, then add the redirect ports. */ if (numLocalPorts != r->pport_cnt) errx(EX_DATAERR, "redirect_port: " "port ranges must be equal in size"); /* Remote port range is allowed to be '0' which means all ports. 
*/ if (r->rport_cnt != numLocalPorts && (r->rport_cnt != 1 || r->rport != 0)) errx(EX_DATAERR, "redirect_port: remote port must" "be 0 or equal to local port range in size"); /* Setup LSNAT server pool. */ if (lsnat != NULL) { struct cfg_spool *spool; sep = strtok(lsnat, ","); while (sep != NULL) { spool = (struct cfg_spool *)buf; space += sizeof(struct cfg_spool); /* * The sctp nat does not allow the port numbers to * be mapped to new port numbers. Therefore, no ports * are to be specified in the target port field. */ if (r->proto == IPPROTO_SCTP) { if (strchr (sep, ':')) { errx(EX_DATAERR, "redirect_port:" "port numbers do not change in " "sctp, so do not specify them as " "part of the target"); } else { StrToAddr(sep, &spool->addr); spool->port = r->pport; } } else { if (StrToAddrAndPortRange(sep, &spool->addr, protoName, &portRange) != 0) errx(EX_DATAERR, "redirect_port:" "invalid local port range"); if (GETNUMPORTS(portRange) != 1) errx(EX_DATAERR, "redirect_port: " "local port must be single in " "this context"); spool->port = GETLOPORT(portRange); } r->spool_cnt++; /* Point to the next possible cfg_spool. */ buf = &buf[sizeof(struct cfg_spool)]; sep = strtok(NULL, ","); } } return (space); } static int setup_redir_proto(char *buf, int *ac, char ***av) { struct cfg_redir *r; struct protoent *protoent; size_t space; r = (struct cfg_redir *)buf; r->mode = REDIR_PROTO; /* Skip cfg_redir at beginning of buf. */ buf = &buf[sizeof(struct cfg_redir)]; space = sizeof(struct cfg_redir); /* * Extract protocol. */ protoent = getprotobyname(**av); if (protoent == NULL) errx(EX_DATAERR, "redirect_proto: unknown protocol %s", **av); else r->proto = protoent->p_proto; (*av)++; (*ac)--; /* * Extract local address. */ StrToAddr(**av, &r->laddr); (*av)++; (*ac)--; /* * Extract optional public address. 
*/ if (*ac == 0) { r->paddr.s_addr = INADDR_ANY; r->raddr.s_addr = INADDR_ANY; } else { /* see above in setup_redir_port() */ if (isdigit(***av)) { StrToAddr(**av, &r->paddr); (*av)++; (*ac)--; /* * Extract optional remote address. */ /* see above in setup_redir_port() */ if (*ac != 0 && isdigit(***av)) { StrToAddr(**av, &r->raddr); (*av)++; (*ac)--; } } } return (space); } static void print_nat_config(unsigned char *buf) { struct cfg_nat *n; int i, cnt, flag, off; struct cfg_redir *t; struct cfg_spool *s; struct protoent *p; n = (struct cfg_nat *)buf; flag = 1; off = sizeof(*n); printf("ipfw nat %u config", n->id); if (strlen(n->if_name) != 0) printf(" if %s", n->if_name); else if (n->ip.s_addr != 0) printf(" ip %s", inet_ntoa(n->ip)); while (n->mode != 0) { if (n->mode & PKT_ALIAS_LOG) { printf(" log"); n->mode &= ~PKT_ALIAS_LOG; } else if (n->mode & PKT_ALIAS_DENY_INCOMING) { printf(" deny_in"); n->mode &= ~PKT_ALIAS_DENY_INCOMING; } else if (n->mode & PKT_ALIAS_SAME_PORTS) { printf(" same_ports"); n->mode &= ~PKT_ALIAS_SAME_PORTS; } else if (n->mode & PKT_ALIAS_SKIP_GLOBAL) { printf(" skip_global"); n->mode &= ~PKT_ALIAS_SKIP_GLOBAL; } else if (n->mode & PKT_ALIAS_UNREGISTERED_ONLY) { printf(" unreg_only"); n->mode &= ~PKT_ALIAS_UNREGISTERED_ONLY; } else if (n->mode & PKT_ALIAS_RESET_ON_ADDR_CHANGE) { printf(" reset"); n->mode &= ~PKT_ALIAS_RESET_ON_ADDR_CHANGE; } else if (n->mode & PKT_ALIAS_REVERSE) { printf(" reverse"); n->mode &= ~PKT_ALIAS_REVERSE; } else if (n->mode & PKT_ALIAS_PROXY_ONLY) { printf(" proxy_only"); n->mode &= ~PKT_ALIAS_PROXY_ONLY; } } /* Print all the redirect's data configuration. 
*/ for (cnt = 0; cnt < n->redir_cnt; cnt++) { t = (struct cfg_redir *)&buf[off]; off += SOF_REDIR; switch (t->mode) { case REDIR_ADDR: printf(" redirect_addr"); if (t->spool_cnt == 0) printf(" %s", inet_ntoa(t->laddr)); else for (i = 0; i < t->spool_cnt; i++) { s = (struct cfg_spool *)&buf[off]; if (i) printf(","); else printf(" "); printf("%s", inet_ntoa(s->addr)); off += SOF_SPOOL; } printf(" %s", inet_ntoa(t->paddr)); break; case REDIR_PORT: p = getprotobynumber(t->proto); printf(" redirect_port %s ", p->p_name); if (!t->spool_cnt) { printf("%s:%u", inet_ntoa(t->laddr), t->lport); if (t->pport_cnt > 1) printf("-%u", t->lport + t->pport_cnt - 1); } else for (i=0; i < t->spool_cnt; i++) { s = (struct cfg_spool *)&buf[off]; if (i) printf(","); printf("%s:%u", inet_ntoa(s->addr), s->port); off += SOF_SPOOL; } printf(" "); if (t->paddr.s_addr) printf("%s:", inet_ntoa(t->paddr)); printf("%u", t->pport); if (!t->spool_cnt && t->pport_cnt > 1) printf("-%u", t->pport + t->pport_cnt - 1); if (t->raddr.s_addr) { printf(" %s", inet_ntoa(t->raddr)); if (t->rport) { printf(":%u", t->rport); if (!t->spool_cnt && t->rport_cnt > 1) printf("-%u", t->rport + t->rport_cnt - 1); } } break; case REDIR_PROTO: p = getprotobynumber(t->proto); printf(" redirect_proto %s %s", p->p_name, inet_ntoa(t->laddr)); if (t->paddr.s_addr != 0) { printf(" %s", inet_ntoa(t->paddr)); if (t->raddr.s_addr) printf(" %s", inet_ntoa(t->raddr)); } break; default: errx(EX_DATAERR, "unknown redir mode"); break; } } printf("\n"); } void ipfw_config_nat(int ac, char **av) { struct cfg_nat *n; /* Nat instance configuration. */ int i, off, tok, ac1; char *id, *buf, **av1, *end; size_t len; av++; ac--; /* Nat id. 
*/ if (ac == 0) errx(EX_DATAERR, "missing nat id"); id = *av; i = (int)strtol(id, &end, 0); if (i <= 0 || *end != '\0') errx(EX_DATAERR, "illegal nat id: %s", id); av++; ac--; if (ac == 0) errx(EX_DATAERR, "missing option"); len = sizeof(struct cfg_nat); ac1 = ac; av1 = av; while (ac1 > 0) { tok = match_token(nat_params, *av1); ac1--; av1++; switch (tok) { case TOK_IP: case TOK_IF: ac1--; av1++; break; case TOK_ALOG: case TOK_DENY_INC: case TOK_SAME_PORTS: case TOK_SKIP_GLOBAL: case TOK_UNREG_ONLY: case TOK_RESET_ADDR: case TOK_ALIAS_REV: case TOK_PROXY_ONLY: break; case TOK_REDIR_ADDR: if (ac1 < 2) errx(EX_DATAERR, "redirect_addr: " "not enough arguments"); len += estimate_redir_addr(&ac1, &av1); av1 += 2; ac1 -= 2; break; case TOK_REDIR_PORT: if (ac1 < 3) errx(EX_DATAERR, "redirect_port: " "not enough arguments"); av1++; ac1--; len += estimate_redir_port(&ac1, &av1); av1 += 2; ac1 -= 2; /* Skip optional remoteIP/port */ if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } break; case TOK_REDIR_PROTO: if (ac1 < 2) errx(EX_DATAERR, "redirect_proto: " "not enough arguments"); len += sizeof(struct cfg_redir); av1 += 2; ac1 -= 2; /* Skip optional remoteIP/port */ if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } if (ac1 != 0 && isdigit(**av1)) { av1++; ac1--; } break; default: errx(EX_DATAERR, "unrecognised option ``%s''", av1[-1]); } } if ((buf = malloc(len)) == NULL) errx(EX_OSERR, "malloc failed"); /* Offset in buf: save space for n at the beginning. 
*/ off = sizeof(*n); memset(buf, 0, len); n = (struct cfg_nat *)buf; n->id = i; while (ac > 0) { tok = match_token(nat_params, *av); ac--; av++; switch (tok) { case TOK_IP: if (ac == 0) errx(EX_DATAERR, "missing option"); if (!inet_aton(av[0], &(n->ip))) errx(EX_DATAERR, "bad ip address ``%s''", av[0]); ac--; av++; break; case TOK_IF: if (ac == 0) errx(EX_DATAERR, "missing option"); set_addr_dynamic(av[0], n); ac--; av++; break; case TOK_ALOG: n->mode |= PKT_ALIAS_LOG; break; case TOK_DENY_INC: n->mode |= PKT_ALIAS_DENY_INCOMING; break; case TOK_SAME_PORTS: n->mode |= PKT_ALIAS_SAME_PORTS; break; case TOK_UNREG_ONLY: n->mode |= PKT_ALIAS_UNREGISTERED_ONLY; break; case TOK_SKIP_GLOBAL: n->mode |= PKT_ALIAS_SKIP_GLOBAL; break; case TOK_RESET_ADDR: n->mode |= PKT_ALIAS_RESET_ON_ADDR_CHANGE; break; case TOK_ALIAS_REV: n->mode |= PKT_ALIAS_REVERSE; break; case TOK_PROXY_ONLY: n->mode |= PKT_ALIAS_PROXY_ONLY; break; /* * All the setup_redir_* functions work directly in * the final buffer, see above for details. */ case TOK_REDIR_ADDR: case TOK_REDIR_PORT: case TOK_REDIR_PROTO: switch (tok) { case TOK_REDIR_ADDR: i = setup_redir_addr(&buf[off], &ac, &av); break; case TOK_REDIR_PORT: i = setup_redir_port(&buf[off], &ac, &av); break; case TOK_REDIR_PROTO: i = setup_redir_proto(&buf[off], &ac, &av); break; } n->redir_cnt++; off += i; break; } } i = do_cmd(IP_FW_NAT_CFG, buf, off); if (i) err(1, "setsockopt(%s)", "IP_FW_NAT_CFG"); if (!co.do_quiet) { /* After every modification, we show the resultant rule. 
*/ int _ac = 3; const char *_av[] = {"show", "config", id}; ipfw_show_nat(_ac, (char **)(void *)_av); } } void ipfw_show_nat(int ac, char **av) { struct cfg_nat *n; struct cfg_redir *e; int cmd, i, nbytes, do_cfg, do_rule, frule, lrule, nalloc, size; int nat_cnt, redir_cnt, r; uint8_t *data, *p; char *endptr; do_rule = 0; nalloc = 1024; size = 0; data = NULL; frule = 0; lrule = IPFW_DEFAULT_RULE; /* max ipfw rule number */ ac--; av++; if (co.test_only) return; /* Parse parameters. */ for (cmd = IP_FW_NAT_GET_LOG, do_cfg = 0; ac != 0; ac--, av++) { if (!strncmp(av[0], "config", strlen(av[0]))) { cmd = IP_FW_NAT_GET_CONFIG, do_cfg = 1; continue; } /* Convert command line rule #. */ frule = lrule = strtoul(av[0], &endptr, 10); if (*endptr == '-') lrule = strtoul(endptr+1, &endptr, 10); if (lrule == 0) err(EX_USAGE, "invalid rule number: %s", av[0]); do_rule = 1; } nbytes = nalloc; while (nbytes >= nalloc) { nalloc = nalloc * 2; nbytes = nalloc; data = safe_realloc(data, nbytes); if (do_cmd(cmd, data, (uintptr_t)&nbytes) < 0) err(EX_OSERR, "getsockopt(IP_FW_GET_%s)", (cmd == IP_FW_NAT_GET_LOG) ? 
"LOG" : "CONFIG"); } if (nbytes == 0) exit(0); if (do_cfg) { nat_cnt = *((int *)data); for (i = sizeof(nat_cnt); nat_cnt; nat_cnt--) { n = (struct cfg_nat *)&data[i]; if (frule <= n->id && lrule >= n->id) print_nat_config(&data[i]); i += sizeof(struct cfg_nat); for (redir_cnt = 0; redir_cnt < n->redir_cnt; redir_cnt++) { e = (struct cfg_redir *)&data[i]; i += sizeof(struct cfg_redir) + e->spool_cnt * sizeof(struct cfg_spool); } } } else { for (i = 0; 1; i += LIBALIAS_BUF_SIZE + sizeof(int)) { p = &data[i]; if (p == data + nbytes) break; bcopy(p, &r, sizeof(int)); if (do_rule) { if (!(frule <= r && lrule >= r)) continue; } printf("nat %u: %s\n", r, p+sizeof(int)); } } } ipfw-user/ipfw/ipv6.c000644 000423 000000 00000030054 11725221076 015237 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. 
* * NEW command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/ipv6.c 220802 2011-04-18 21:18:22Z glebius $ * * ipv6 support */ #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include #include

/* Symbolic names accepted for ICMPv6 "destination unreachable" codes. */
static struct _s_x icmp6codes[] = {
	{ "no-route",		ICMP6_DST_UNREACH_NOROUTE },
	{ "admin-prohib",	ICMP6_DST_UNREACH_ADMIN },
	{ "address",		ICMP6_DST_UNREACH_ADDR },
	{ "port",		ICMP6_DST_UNREACH_NOPORT },
	{ NULL, 0 }
};

/*
 * Parse an unreach6 code given either as a number below 0x100 or as
 * one of the names in icmp6codes[], and store it in *codep.
 * Exits through errx() when str is neither.
 */
void
fill_unreach6_code(u_short *codep, char *str)
{
	int val;
	char *s;

	val = strtoul(str, &s, 0);
	/* Not a clean number in range: fall back to the name table. */
	if (s == str || *s != '\0' || val >= 0x100)
		val = match_token(icmp6codes, str);
	if (val < 0)
		errx(EX_DATAERR, "unknown ICMPv6 unreachable code ``%s''", str);
	*codep = val;
	return;
}

/* Print an unreach6 code by name when icmp6codes[] has one, else as a number. */
void
print_unreach6_code(uint16_t code)
{
	char const *s = match_value(icmp6codes, code);

	if (s != NULL)
		printf("unreach6 %s", s);
	else
		printf("unreach6 %u", code);
}

/*
 * Print the ip address contained in a command.
 * s is printed before the address, preceded by " not" when the F_NOT
 * bit is set in the instruction length field.
 */
void
print_ip6(ipfw_insn_ip6 *cmd, char const *s)
{
	struct hostent *he;
	int len = F_LEN((ipfw_insn *) cmd) - 1;
	struct in6_addr *a = &(cmd->addr6);
	char trad[255];

	printf("%s%s ", cmd->o.len & F_NOT ? " not": "", s);

	if (cmd->o.opcode == O_IP6_SRC_ME || cmd->o.opcode == O_IP6_DST_ME) {
		printf("me6");
		return;
	}
	if (cmd->o.opcode == O_IP6) {
		printf(" ip6");
		return;
	}

	/*
	 * len == 4 indicates a single IP, whereas lists of 1 or more
	 * addr/mask pairs have len = (2n+1). We convert len to n so we
	 * use that to count the number of entries.
	 */
	for (len = len / 4; len > 0; len -= 2, a += 2) {
		int mb =	/* mask length */
		    (cmd->o.opcode == O_IP6_SRC || cmd->o.opcode == O_IP6_DST) ?
		    128 : contigmask((uint8_t *)&(a[1]), 128);

		/*
		 * Reset for every entry: otherwise a name resolved for an
		 * earlier /128 entry would be printed again for a later
		 * entry that is not looked up at all.
		 */
		he = NULL;
		if (mb == 128 && co.do_resolv)
			he = gethostbyaddr((char *)a, sizeof(*a), AF_INET6);

		if (he != NULL)		/* resolved to name */
			printf("%s", he->h_name);
		else if (mb == 0)	/* any */
			printf("any");
		else {		/* numeric IP followed by some kind of mask */
			if (inet_ntop(AF_INET6, a, trad, sizeof( trad ) ) == NULL) {
				printf("Error ntop in print_ip6\n");
				trad[0] = '\0';	/* never print an unset buffer */
			}
			printf("%s", trad );
			if (mb < 0) {	/* XXX not really legal... */
				const char *m = inet_ntop(AF_INET6, &a[1],
				    trad, sizeof(trad));

				/* a NULL argument to %s is undefined */
				printf(":%s", m != NULL ? m : "");
			} else if (mb < 128)
				printf("/%d", mb);
		}
		if (len > 2)
			printf(",");
	}
}

void fill_icmp6types(ipfw_insn_icmp6 *cmd, char *av) { uint8_t type; bzero(cmd, sizeof(*cmd)); while (*av) { if (*av == ',') av++; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ICMP6 type"); /* * XXX: shouldn't this be 0xFF? I can't see any reason why * we shouldn't be able to filter all possiable values * regardless of the ability of the rest of the kernel to do * anything useful with them. 
*/ if (type > ICMP6_MAXTYPE) errx(EX_DATAERR, "ICMP6 type out of range"); cmd->d[type / 32] |= ( 1 << (type % 32)); } cmd->o.opcode = O_ICMP6TYPE; cmd->o.len |= F_INSN_SIZE(ipfw_insn_icmp6); } void print_icmp6types(ipfw_insn_u32 *cmd) { int i, j; char sep= ' '; printf(" ip6 icmp6types"); for (i = 0; i < 7; i++) for (j=0; j < 32; ++j) { if ( (cmd->d[i] & (1 << (j))) == 0) continue; printf("%c%d", sep, (i*32 + j)); sep = ','; } } void print_flow6id( ipfw_insn_u32 *cmd) { uint16_t i, limit = cmd->o.arg1; char sep = ','; printf(" flow-id "); for( i=0; i < limit; ++i) { if (i == limit - 1) sep = ' '; printf("%d%c", cmd->d[i], sep); } } /* structure and define for the extension header in ipv6 */ static struct _s_x ext6hdrcodes[] = { { "frag", EXT_FRAGMENT }, { "hopopt", EXT_HOPOPTS }, { "route", EXT_ROUTING }, { "dstopt", EXT_DSTOPTS }, { "ah", EXT_AH }, { "esp", EXT_ESP }, { "rthdr0", EXT_RTHDR0 }, { "rthdr2", EXT_RTHDR2 }, { NULL, 0 } }; /* fills command for the extension header filtering */ int fill_ext6hdr( ipfw_insn *cmd, char *av) { int tok; char *s = av; cmd->arg1 = 0; while(s) { av = strsep( &s, ",") ; tok = match_token(ext6hdrcodes, av); switch (tok) { case EXT_FRAGMENT: cmd->arg1 |= EXT_FRAGMENT; break; case EXT_HOPOPTS: cmd->arg1 |= EXT_HOPOPTS; break; case EXT_ROUTING: cmd->arg1 |= EXT_ROUTING; break; case EXT_DSTOPTS: cmd->arg1 |= EXT_DSTOPTS; break; case EXT_AH: cmd->arg1 |= EXT_AH; break; case EXT_ESP: cmd->arg1 |= EXT_ESP; break; case EXT_RTHDR0: cmd->arg1 |= EXT_RTHDR0; break; case EXT_RTHDR2: cmd->arg1 |= EXT_RTHDR2; break; default: errx( EX_DATAERR, "invalid option for ipv6 exten header" ); break; } } if (cmd->arg1 == 0 ) return 0; cmd->opcode = O_EXT_HDR; cmd->len |= F_INSN_SIZE( ipfw_insn ); return 1; } void print_ext6hdr( ipfw_insn *cmd ) { char sep = ' '; printf(" extension header:"); if (cmd->arg1 & EXT_FRAGMENT ) { printf("%cfragmentation", sep); sep = ','; } if (cmd->arg1 & EXT_HOPOPTS ) { printf("%chop options", sep); sep = ','; } if (cmd->arg1 
& EXT_ROUTING ) { printf("%crouting options", sep); sep = ','; } if (cmd->arg1 & EXT_RTHDR0 ) { printf("%crthdr0", sep); sep = ','; } if (cmd->arg1 & EXT_RTHDR2 ) { printf("%crthdr2", sep); sep = ','; } if (cmd->arg1 & EXT_DSTOPTS ) { printf("%cdestination options", sep); sep = ','; } if (cmd->arg1 & EXT_AH ) { printf("%cauthentication header", sep); sep = ','; } if (cmd->arg1 & EXT_ESP ) { printf("%cencapsulated security payload", sep); } } /* Try to find ipv6 address by hostname */ static int lookup_host6 (char *host, struct in6_addr *ip6addr) { struct hostent *he; if (!inet_pton(AF_INET6, host, ip6addr)) { if ((he = gethostbyname2(host, AF_INET6)) == NULL) return(-1); memcpy(ip6addr, he->h_addr_list[0], sizeof( struct in6_addr)); } return(0); } /* * fill the addr and mask fields in the instruction as appropriate from av. * Update length as appropriate. * The following formats are allowed: * any matches any IP6. Actually returns an empty instruction. * me returns O_IP6_*_ME * * 03f1::234:123:0342 single IP6 addres * 03f1::234:123:0342/24 address/mask * 03f1::234:123:0342/24,03f1::234:123:0343/ List of address * * Set of address (as in ipv6) not supported because ipv6 address * are typically random past the initial prefix. * Return 1 on success, 0 on failure. */ static int fill_ip6(ipfw_insn_ip6 *cmd, char *av) { int len = 0; struct in6_addr *d = &(cmd->addr6); /* * Needed for multiple address. * Note d[1] points to struct in6_add r mask6 of cmd */ cmd->o.len &= ~F_LEN_MASK; /* zero len */ if (strcmp(av, "any") == 0) return (1); if (strcmp(av, "me") == 0) { /* Set the data for "me" opt*/ cmd->o.len |= F_INSN_SIZE(ipfw_insn); return (1); } if (strcmp(av, "me6") == 0) { /* Set the data for "me" opt*/ cmd->o.len |= F_INSN_SIZE(ipfw_insn); return (1); } av = strdup(av); while (av) { /* * After the address we can have '/' indicating a mask, * or ',' indicating another address follows. 
*/ char *p; int masklen; char md = '\0'; if ((p = strpbrk(av, "/,")) ) { md = *p; /* save the separator */ *p = '\0'; /* terminate address string */ p++; /* and skip past it */ } /* now p points to NULL, mask or next entry */ /* lookup stores address in *d as a side effect */ if (lookup_host6(av, d) != 0) { /* XXX: failed. Free memory and go */ errx(EX_DATAERR, "bad address \"%s\"", av); } /* next, look at the mask, if any */ masklen = (md == '/') ? atoi(p) : 128; if (masklen > 128 || masklen < 0) errx(EX_DATAERR, "bad width \"%s\''", p); else n2mask(&d[1], masklen); APPLY_MASK(d, &d[1]) /* mask base address with mask */ /* find next separator */ if (md == '/') { /* find separator past the mask */ p = strpbrk(p, ","); if (p != NULL) p++; } av = p; /* Check this entry */ if (masklen == 0) { /* * 'any' turns the entire list into a NOP. * 'not any' never matches, so it is removed from the * list unless it is the only item, in which case we * report an error. */ if (cmd->o.len & F_NOT && av == NULL && len == 0) errx(EX_DATAERR, "not any never matches"); continue; } /* * A single IP can be stored alone */ if (masklen == 128 && av == NULL && len == 0) { len = F_INSN_SIZE(struct in6_addr); break; } /* Update length and pointer to arguments */ len += F_INSN_SIZE(struct in6_addr)*2; d += 2; } /* end while */ /* * Total length of the command, remember that 1 is the size of * the base command. 
*/ if (len + 1 > F_LEN_MASK) errx(EX_DATAERR, "address list too long"); cmd->o.len |= len+1; free(av); return (1); } /* * fills command for ipv6 flow-id filtering * note that the 20 bit flow number is stored in a array of u_int32_t * it's supported lists of flow-id, so in the o.arg1 we store how many * additional flow-id we want to filter, the basic is 1 */ void fill_flow6( ipfw_insn_u32 *cmd, char *av ) { u_int32_t type; /* Current flow number */ u_int16_t nflow = 0; /* Current flow index */ char *s = av; cmd->d[0] = 0; /* Initializing the base number*/ while (s) { av = strsep( &s, ",") ; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ipv6 flow number %s", av); if (type > 0xfffff) errx(EX_DATAERR, "flow number out of range %s", av); cmd->d[nflow] |= type; nflow++; } if( nflow > 0 ) { cmd->o.opcode = O_FLOW6ID; cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + nflow; cmd->o.arg1 = nflow; } else { errx(EX_DATAERR, "invalid ipv6 flow number %s", av); } } ipfw_insn * add_srcip6(ipfw_insn *cmd, char *av) { fill_ip6((ipfw_insn_ip6 *)cmd, av); if (F_LEN(cmd) == 0) { /* any */ } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ cmd->opcode = O_IP6_SRC_ME; } else if (F_LEN(cmd) == (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { /* single IP, no mask*/ cmd->opcode = O_IP6_SRC; } else { /* addr/mask opt */ cmd->opcode = O_IP6_SRC_MASK; } return cmd; } ipfw_insn * add_dstip6(ipfw_insn *cmd, char *av) { fill_ip6((ipfw_insn_ip6 *)cmd, av); if (F_LEN(cmd) == 0) { /* any */ } else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) { /* "me" */ cmd->opcode = O_IP6_DST_ME; } else if (F_LEN(cmd) == (F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn))) { /* single IP, no mask*/ cmd->opcode = O_IP6_DST; } else { /* addr/mask opt */ cmd->opcode = O_IP6_DST_MASK; } return cmd; } ipfw-user/ipfw/main.c000644 000423 000000 00000037036 12007565005 015303 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2002-2003,2010 Luigi Rizzo * Copyright 
(c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * Command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/main.c 229778 2012-01-07 16:09:33Z uqs $ */ #include #include #include #include #include #include #include #include #include #include #include "ipfw2.h" static void help(void) { fprintf(stderr, "ipfw syntax summary (but please do read the ipfw(8) manpage):\n\n" "\tipfw [-abcdefhnNqStTv] \n\n" "where is one of the following:\n\n" "add [num] [set N] [prob x] RULE-BODY\n" "{pipe|queue} N config PIPE-BODY\n" "[pipe|queue] {zero|delete|show} [N{,N}]\n" "nat N config {ip IPADDR|if IFNAME|log|deny_in|same_ports|unreg_only|reset|\n" " reverse|proxy_only|redirect_addr linkspec|\n" " redirect_port linkspec|redirect_proto linkspec}\n" "set [disable N... enable N...] 
| move [rule] X to Y | swap X Y | show\n" "set N {show|list|zero|resetlog|delete} [N{,N}] | flush\n" "table N {add ip[/bits] [value] | delete ip[/bits] | flush | list}\n" "table all {flush | list}\n" "\n" "RULE-BODY: check-state [PARAMS] | ACTION [PARAMS] ADDR [OPTION_LIST]\n" "ACTION: check-state | allow | count | deny | unreach{,6} CODE |\n" " skipto N | {divert|tee} PORT | forward ADDR |\n" " pipe N | queue N | nat N | setfib FIB | reass\n" "PARAMS: [log [logamount LOGLIMIT]] [altq QUEUE_NAME]\n" "ADDR: [ MAC dst src ether_type ] \n" " [ ip from IPADDR [ PORT ] to IPADDR [ PORTLIST ] ]\n" " [ ipv6|ip6 from IP6ADDR [ PORT ] to IP6ADDR [ PORTLIST ] ]\n" "IPADDR: [not] { any | me | ip/bits{x,y,z} | table(t[,v]) | IPLIST }\n" "IP6ADDR: [not] { any | me | me6 | ip6/bits | IP6LIST }\n" "IP6LIST: { ip6 | ip6/bits }[,IP6LIST]\n" "IPLIST: { ip | ip/bits | ip:mask }[,IPLIST]\n" "OPTION_LIST: OPTION [OPTION_LIST]\n" "OPTION: bridged | diverted | diverted-loopback | diverted-output |\n" " {dst-ip|src-ip} IPADDR | {dst-ip6|src-ip6|dst-ipv6|src-ipv6} IP6ADDR |\n" " {dst-port|src-port} LIST |\n" " estab | frag | {gid|uid} N | icmptypes LIST | in | out | ipid LIST |\n" " iplen LIST | ipoptions SPEC | ipprecedence | ipsec | iptos SPEC |\n" " ipttl LIST | ipversion VER | keep-state | layer2 | limit ... |\n" " icmp6types LIST | ext6hdr LIST | flow-id N[,N] | fib FIB |\n" " mac ... | mac-type LIST | proto LIST | {recv|xmit|via} {IF|IPADDR} |\n" " setup | {tcpack|tcpseq|tcpwin} NN | tcpflags SPEC | tcpoptions SPEC |\n" " tcpdatalen LIST | verrevpath | versrcreach | antispoof\n" ); exit(0); } /* * Called with the arguments, including program name because getopt * wants it to be present. * Returns 0 if successful, 1 if empty command, errx() in case of errors. * First thing we do is process parameters creating an argv[] array * which includes the program name and a NULL entry at the end. * If we are called with a single string, we split it on whitespace. 
* Also, arguments with a trailing ',' are joined to the next one. * The pointers (av[]) and data are in a single chunk of memory. * av[0] points to the original program name, all other entries * point into the allocated chunk. */ static int ipfw_main(int oldac, char **oldav) { int ch, ac; const char *errstr; char **av, **save_av; int do_acct = 0; /* Show packet/byte count */ int try_next = 0; /* set if pipe cmd not found */ int av_size; /* compute the av size */ char *av_p; /* used to build the av list */ #define WHITESP " \t\f\v\n\r" if (oldac < 2) return 1; /* need at least one argument */ if (oldac == 2) { /* * If we are called with one argument, try to split it into * words for subsequent parsing. Spaces after a ',' are * removed by copying the string in-place. */ char *arg = oldav[1]; /* The string is the first arg. */ int l = strlen(arg); int copy = 0; /* 1 if we need to copy, 0 otherwise */ int i, j; for (i = j = 0; i < l; i++) { if (arg[i] == '#') /* comment marker */ break; if (copy) { arg[j++] = arg[i]; copy = !strchr("," WHITESP, arg[i]); } else { copy = !strchr(WHITESP, arg[i]); if (copy) arg[j++] = arg[i]; } } if (!copy && j > 0) /* last char was a 'blank', remove it */ j--; l = j; /* the new argument length */ arg[j++] = '\0'; if (l == 0) /* empty string! */ return 1; /* * First, count number of arguments. Because of the previous * processing, this is just the number of blanks plus 1. */ for (i = 0, ac = 1; i < l; i++) if (strchr(WHITESP, arg[i]) != NULL) ac++; /* * Allocate the argument list structure as a single block * of memory, containing pointers and the argument * strings. We include one entry for the program name * because getopt expects it, and a NULL at the end * to simplify further parsing. */ ac++; /* add 1 for the program name */ av_size = (ac+1) * sizeof(char *) + l + 1; av = safe_calloc(av_size, 1); /* * Init the argument pointer to the end of the array * and copy arguments from arg[] to av[]. 
For each one, * j is the initial character, i is the one past the end. */ av_p = (char *)&av[ac+1]; for (ac = 1, i = j = 0; i < l; i++) { if (strchr(WHITESP, arg[i]) != NULL || i == l-1) { if (i == l-1) i++; bcopy(arg+j, av_p, i-j); av[ac] = av_p; av_p += i-j; /* the length of the string */ *av_p++ = '\0'; ac++; j = i + 1; } } } else { /* * If an argument ends with ',' join with the next one. */ int first, i, l=0; /* * Allocate the argument list structure as a single block * of memory, containing both pointers and the argument * strings. We include some space for the program name * because getopt expects it. * We add an extra pointer to the end of the array, * to make simpler further parsing. */ for (i=0; i= 2 && !strcmp(av[1], "sysctl")) { char *s; int i; if (ac != 3) { printf( "sysctl emulation usage:\n" " ipfw sysctl name[=value]\n" " ipfw sysctl -a\n"); return 0; } s = strchr(av[2], '='); if (s == NULL) { s = !strcmp(av[2], "-a") ? NULL : av[2]; sysctlbyname(s, NULL, NULL, NULL, 0); } else { /* ipfw sysctl x.y.z=value */ /* assume an INT value, will extend later */ if (s[1] == '\0') { printf("ipfw sysctl: missing value\n\n"); return 0; } *s = '\0'; i = strtol(s+1, NULL, 0); sysctlbyname(av[2], NULL, NULL, &i, sizeof(int)); } return 0; } #endif /* Save arguments for final freeing of memory. 
*/ save_av = av; optind = optreset = 1; /* restart getopt() */ while ((ch = getopt(ac, av, "abcdefhinNp:qs:STtv")) != -1) switch (ch) { case 'a': do_acct = 1; break; case 'b': co.comment_only = 1; co.do_compact = 1; break; case 'c': co.do_compact = 1; break; case 'd': co.do_dynamic = 1; break; case 'e': co.do_expired = 1; break; case 'f': co.do_force = 1; break; case 'h': /* help */ free(save_av); help(); break; /* NOTREACHED */ case 'i': co.do_value_as_ip = 1; break; case 'n': co.test_only = 1; break; case 'N': co.do_resolv = 1; break; case 'p': errx(EX_USAGE, "An absolute pathname must be used " "with -p option."); /* NOTREACHED */ case 'q': co.do_quiet = 1; break; case 's': /* sort */ co.do_sort = atoi(optarg); break; case 'S': co.show_sets = 1; break; case 't': co.do_time = 1; break; case 'T': co.do_time = 2; /* numeric timestamp */ break; case 'v': /* verbose */ co.verbose = 1; break; default: free(save_av); return 1; } ac -= optind; av += optind; NEED1("bad arguments, for usage summary ``ipfw''"); /* * An undocumented behaviour of ipfw1 was to allow rule numbers first, * e.g. "100 add allow ..." instead of "add 100 allow ...". * In case, swap first and second argument to get the normal form. */ if (ac > 1 && isdigit(*av[0])) { char *p = av[0]; av[0] = av[1]; av[1] = p; } /* * Optional: pipe, queue or nat. 
*/ co.do_nat = 0; co.do_pipe = 0; co.use_set = 0; if (!strncmp(*av, "nat", strlen(*av))) co.do_nat = 1; else if (!strncmp(*av, "pipe", strlen(*av))) co.do_pipe = 1; else if (_substrcmp(*av, "queue") == 0) co.do_pipe = 2; else if (_substrcmp(*av, "flowset") == 0) co.do_pipe = 2; else if (_substrcmp(*av, "sched") == 0) co.do_pipe = 3; else if (!strncmp(*av, "set", strlen(*av))) { if (ac > 1 && isdigit(av[1][0])) { co.use_set = strtonum(av[1], 0, resvd_set_number, &errstr); if (errstr) errx(EX_DATAERR, "invalid set number %s\n", av[1]); ac -= 2; av += 2; co.use_set++; } } if (co.do_pipe || co.do_nat) { ac--; av++; } NEED1("missing command"); /* * For pipes, queues and nats we normally say 'nat|pipe NN config' * but the code is easier to parse as 'nat|pipe config NN' * so we swap the two arguments. */ if ((co.do_pipe || co.do_nat) && ac > 1 && isdigit(*av[0])) { char *p = av[0]; av[0] = av[1]; av[1] = p; } if (co.use_set == 0) { if (_substrcmp(*av, "add") == 0) ipfw_add(av); else if (co.do_nat && _substrcmp(*av, "show") == 0) ipfw_show_nat(ac, av); else if (co.do_pipe && _substrcmp(*av, "config") == 0) ipfw_config_pipe(ac, av); else if (co.do_nat && _substrcmp(*av, "config") == 0) ipfw_config_nat(ac, av); else if (_substrcmp(*av, "set") == 0) ipfw_sets_handler(av); else if (_substrcmp(*av, "table") == 0) ipfw_table_handler(ac, av); else if (_substrcmp(*av, "enable") == 0) ipfw_sysctl_handler(av, 1); else if (_substrcmp(*av, "disable") == 0) ipfw_sysctl_handler(av, 0); else try_next = 1; } if (co.use_set || try_next) { if (_substrcmp(*av, "delete") == 0) ipfw_delete(av); else if (_substrcmp(*av, "flush") == 0) ipfw_flush(co.do_force); else if (_substrcmp(*av, "zero") == 0) ipfw_zero(ac, av, 0 /* IP_FW_ZERO */); else if (_substrcmp(*av, "resetlog") == 0) ipfw_zero(ac, av, 1 /* IP_FW_RESETLOG */); else if (_substrcmp(*av, "print") == 0 || _substrcmp(*av, "list") == 0) ipfw_list(ac, av, do_acct); else if (_substrcmp(*av, "show") == 0) ipfw_list(ac, av, 1 /* show counters 
*/); else errx(EX_USAGE, "bad command `%s'", *av); } /* Free memory allocated in the argument parsing. */ free(save_av); return 0; } static void ipfw_readfile(int ac, char *av[]) { #define MAX_ARGS 32 char buf[4096]; char *progname = av[0]; /* original program name */ const char *cmd = NULL; /* preprocessor name, if any */ const char *filename = av[ac-1]; /* file to read */ int c, lineno=0; FILE *f = NULL; pid_t preproc = 0; while ((c = getopt(ac, av, "cfNnp:qS")) != -1) { switch(c) { case 'c': co.do_compact = 1; break; case 'f': co.do_force = 1; break; case 'N': co.do_resolv = 1; break; case 'n': co.test_only = 1; break; case 'p': /* * ipfw -p cmd [args] filename * * We are done with getopt(). All arguments * except the filename go to the preprocessor, * so we need to do the following: * - check that a filename is actually present; * - advance av by optind-1 to skip arguments * already processed; * - decrease ac by optind, to remove the args * already processed and the final filename; * - set the last entry in av[] to NULL so * popen() can detect the end of the array; * - set optind=ac to let getopt() terminate. */ if (optind == ac) errx(EX_USAGE, "no filename argument"); cmd = optarg; av[ac-1] = NULL; av += optind - 1; ac -= optind; optind = ac; break; case 'q': co.do_quiet = 1; break; case 'S': co.show_sets = 1; break; default: errx(EX_USAGE, "bad arguments, for usage" " summary ``ipfw''"); } } if (cmd == NULL && ac != optind + 1) errx(EX_USAGE, "extraneous filename arguments %s", av[ac-1]); if ((f = fopen(filename, "r")) == NULL) err(EX_UNAVAILABLE, "fopen: %s", filename); if (cmd != NULL) { /* pipe through preprocessor */ int pipedes[2]; if (pipe(pipedes) == -1) err(EX_OSERR, "cannot create pipe"); preproc = fork(); if (preproc == -1) err(EX_OSERR, "cannot fork"); if (preproc == 0) { /* * Child, will run the preprocessor with the * file on stdin and the pipe on stdout. 
*/ if (dup2(fileno(f), 0) == -1 || dup2(pipedes[1], 1) == -1) err(EX_OSERR, "dup2()"); fclose(f); close(pipedes[1]); close(pipedes[0]); execvp(cmd, av); err(EX_OSERR, "execvp(%s) failed", cmd); } else { /* parent, will reopen f as the pipe */ fclose(f); close(pipedes[1]); if ((f = fdopen(pipedes[0], "r")) == NULL) { int savederrno = errno; (void)kill(preproc, SIGTERM); errno = savederrno; err(EX_OSERR, "fdopen()"); } } } while (fgets(buf, sizeof(buf), f)) { /* read commands */ char linename[20]; char *args[2]; lineno++; snprintf(linename, sizeof(linename), "Line %d", lineno); setprogname(linename); /* XXX */ args[0] = progname; args[1] = buf; ipfw_main(2, args); } fclose(f); if (cmd != NULL) { int status; if (waitpid(preproc, &status, 0) == -1) errx(EX_OSERR, "waitpid()"); if (WIFEXITED(status) && WEXITSTATUS(status) != EX_OK) errx(EX_UNAVAILABLE, "preprocessor exited with status %d", WEXITSTATUS(status)); else if (WIFSIGNALED(status)) errx(EX_UNAVAILABLE, "preprocessor exited with signal %d", WTERMSIG(status)); } } int main(int ac, char *av[]) { #if defined(_WIN32) && defined(TCC) { WSADATA wsaData; int ret=0; unsigned short wVersionRequested = MAKEWORD(2, 2); ret = WSAStartup(wVersionRequested, &wsaData); if (ret != 0) { /* Tell the user that we could not find a usable */ /* Winsock DLL. */ printf("WSAStartup failed with error: %d\n", ret); return 1; } } #endif /* * If the last argument is an absolute pathname, interpret it * as a file to be preprocessed. 
*/ if (ac > 1 && av[ac - 1][0] == '/') { if (access(av[ac - 1], R_OK) == 0) ipfw_readfile(ac, av); else err(EX_USAGE, "pathname: %s", av[ac - 1]); } else { if (ipfw_main(ac, av)) { errx(EX_USAGE, "usage: ipfw [options]\n" "do \"ipfw -h\" or \"man ipfw\" for details"); } } return EX_OK; } ipfw-user/ipfw/ipfw2.c000644 000423 000000 00000300017 12007565005 015376 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. 
* * NEW command line interface for IP firewall facility * * $FreeBSD: head/sbin/ipfw/ipfw2.c 238903 2012-07-30 11:02:22Z luigi $ */ #include #include #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include /* ctime */ #include /* _long_to_time */ #include #include #include /* offsetof */ #include #include /* only IFNAMSIZ */ #include #include /* only n_short, n_long */ #include #include #include #include #include struct cmdline_opts co; /* global options */ int resvd_set_number = RESVD_SET; int ipfw_socket = -1; #ifndef s6_addr32 #define s6_addr32 __u6_addr.__u6_addr32 #endif #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TABLEARG; \ break; \ } \ \ { \ long _xval; \ char *end; \ \ _xval = strtol(*av, &end, 10); \ \ if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ if (_xval == IP_FW_TABLEARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ arg = _xval; \ } \ } while (0) static void PRINT_UINT_ARG(const char *str, uint32_t arg) { if (str != NULL) printf("%s",str); if (arg == IP_FW_TABLEARG) printf("tablearg"); else printf("%u", arg); } static struct _s_x f_tcpflags[] = { { "syn", TH_SYN }, { "fin", TH_FIN }, { "ack", TH_ACK }, { "psh", TH_PUSH }, { "rst", TH_RST }, { "urg", TH_URG }, { "tcp flag", 0 }, { NULL, 0 } }; static struct _s_x f_tcpopts[] = { { "mss", IP_FW_TCPOPT_MSS }, { "maxseg", IP_FW_TCPOPT_MSS }, { "window", IP_FW_TCPOPT_WINDOW }, { "sack", IP_FW_TCPOPT_SACK }, { "ts", IP_FW_TCPOPT_TS }, { "timestamp", IP_FW_TCPOPT_TS }, { "cc", 
IP_FW_TCPOPT_CC }, { "tcp option", 0 }, { NULL, 0 } }; /* * IP options span the range 0 to 255 so we need to remap them * (though in fact only the low 5 bits are significant). */ static struct _s_x f_ipopts[] = { { "ssrr", IP_FW_IPOPT_SSRR}, { "lsrr", IP_FW_IPOPT_LSRR}, { "rr", IP_FW_IPOPT_RR}, { "ts", IP_FW_IPOPT_TS}, { "ip option", 0 }, { NULL, 0 } }; static struct _s_x f_iptos[] = { { "lowdelay", IPTOS_LOWDELAY}, { "throughput", IPTOS_THROUGHPUT}, { "reliability", IPTOS_RELIABILITY}, { "mincost", IPTOS_MINCOST}, { "congestion", IPTOS_ECN_CE}, { "ecntransport", IPTOS_ECN_ECT0}, { "ip tos option", 0}, { NULL, 0 } }; static struct _s_x limit_masks[] = { {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, {"src-addr", DYN_SRC_ADDR}, {"src-port", DYN_SRC_PORT}, {"dst-addr", DYN_DST_ADDR}, {"dst-port", DYN_DST_PORT}, {NULL, 0} }; /* * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines * This is only used in this code. */ #define IPPROTO_ETHERTYPE 0x1000 static struct _s_x ether_types[] = { /* * Note, we cannot use "-:&/" in the names because they are field * separators in the type specifications. Also, we use s = NULL as * end-delimiter, because a type of 0 can be legal. 
*/ { "ip", 0x0800 }, { "ipv4", 0x0800 }, { "ipv6", 0x86dd }, { "arp", 0x0806 }, { "rarp", 0x8035 }, { "vlan", 0x8100 }, { "loop", 0x9000 }, { "trail", 0x1000 }, { "at", 0x809b }, { "atalk", 0x809b }, { "aarp", 0x80f3 }, { "pppoe_disc", 0x8863 }, { "pppoe_sess", 0x8864 }, { "ipx_8022", 0x00E0 }, { "ipx_8023", 0x0000 }, { "ipx_ii", 0x8137 }, { "ipx_snap", 0x8137 }, { "ipx", 0x8137 }, { "ns", 0x0600 }, { NULL, 0 } }; static struct _s_x rule_actions[] = { { "accept", TOK_ACCEPT }, { "pass", TOK_ACCEPT }, { "allow", TOK_ACCEPT }, { "permit", TOK_ACCEPT }, { "count", TOK_COUNT }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, { "divert", TOK_DIVERT }, { "tee", TOK_TEE }, { "netgraph", TOK_NETGRAPH }, { "ngtee", TOK_NGTEE }, { "fwd", TOK_FORWARD }, { "forward", TOK_FORWARD }, { "skipto", TOK_SKIPTO }, { "deny", TOK_DENY }, { "drop", TOK_DENY }, { "reject", TOK_REJECT }, { "reset6", TOK_RESET6 }, { "reset", TOK_RESET }, { "unreach6", TOK_UNREACH6 }, { "unreach", TOK_UNREACH }, { "check-state", TOK_CHECKSTATE }, { "//", TOK_COMMENT }, { "nat", TOK_NAT }, { "reass", TOK_REASS }, { "setfib", TOK_SETFIB }, { "call", TOK_CALL }, { "return", TOK_RETURN }, { NULL, 0 } /* terminator */ }; static struct _s_x rule_action_params[] = { { "altq", TOK_ALTQ }, { "log", TOK_LOG }, { "tag", TOK_TAG }, { "untag", TOK_UNTAG }, { NULL, 0 } /* terminator */ }; /* * The 'lookup' instruction accepts one of the following arguments. * -1 is a terminator for the list. * Arguments are passed as v[1] in O_DST_LOOKUP options. 
*/ static int lookup_key[] = { TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; static struct _s_x rule_options[] = { { "tagged", TOK_TAGGED }, { "uid", TOK_UID }, { "gid", TOK_GID }, { "jail", TOK_JAIL }, { "in", TOK_IN }, { "limit", TOK_LIMIT }, { "keep-state", TOK_KEEPSTATE }, { "bridged", TOK_LAYER2 }, { "layer2", TOK_LAYER2 }, { "out", TOK_OUT }, { "diverted", TOK_DIVERTED }, { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, { "diverted-output", TOK_DIVERTEDOUTPUT }, { "xmit", TOK_XMIT }, { "recv", TOK_RECV }, { "via", TOK_VIA }, { "fragment", TOK_FRAG }, { "frag", TOK_FRAG }, { "fib", TOK_FIB }, { "ipoptions", TOK_IPOPTS }, { "ipopts", TOK_IPOPTS }, { "iplen", TOK_IPLEN }, { "ipid", TOK_IPID }, { "ipprecedence", TOK_IPPRECEDENCE }, { "dscp", TOK_DSCP }, { "iptos", TOK_IPTOS }, { "ipttl", TOK_IPTTL }, { "ipversion", TOK_IPVER }, { "ipver", TOK_IPVER }, { "estab", TOK_ESTAB }, { "established", TOK_ESTAB }, { "setup", TOK_SETUP }, { "sockarg", TOK_SOCKARG }, { "tcpdatalen", TOK_TCPDATALEN }, { "tcpflags", TOK_TCPFLAGS }, { "tcpflgs", TOK_TCPFLAGS }, { "tcpoptions", TOK_TCPOPTS }, { "tcpopts", TOK_TCPOPTS }, { "tcpseq", TOK_TCPSEQ }, { "tcpack", TOK_TCPACK }, { "tcpwin", TOK_TCPWIN }, { "icmptype", TOK_ICMPTYPES }, { "icmptypes", TOK_ICMPTYPES }, { "dst-ip", TOK_DSTIP }, { "src-ip", TOK_SRCIP }, { "dst-port", TOK_DSTPORT }, { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "MAC", TOK_MAC }, { "mac", TOK_MAC }, { "mac-type", TOK_MACTYPE }, { "verrevpath", TOK_VERREVPATH }, { "versrcreach", TOK_VERSRCREACH }, { "antispoof", TOK_ANTISPOOF }, { "ipsec", TOK_IPSEC }, { "icmp6type", TOK_ICMP6TYPES }, { "icmp6types", TOK_ICMP6TYPES }, { "ext6hdr", TOK_EXT6HDR}, { "flow-id", TOK_FLOWID}, { "ipv6", TOK_IPV6}, { "ip6", TOK_IPV6}, { "ipv4", TOK_IPV4}, { "ip4", TOK_IPV4}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, { "lookup", TOK_LOOKUP}, { "//", TOK_COMMENT }, { "not", TOK_NOT 
}, /* pseudo option */ { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ { "or", TOK_OR }, /* pseudo option */ { "|", /* escape */ TOK_OR }, /* pseudo option */ { "{", TOK_STARTBRACE }, /* pseudo option */ { "(", TOK_STARTBRACE }, /* pseudo option */ { "}", TOK_ENDBRACE }, /* pseudo option */ { ")", TOK_ENDBRACE }, /* pseudo option */ { NULL, 0 } /* terminator */ }; /* * Helper routine to print a possibly unaligned uint64_t on * various platform. If width > 0, print the value with * the desired width, followed by a space; * otherwise, return the required width. */ int pr_u64(uint64_t *pd, int width) { #ifdef TCC #define U64_FMT "I64" #else #define U64_FMT "llu" #endif uint64_t u; unsigned long long d; bcopy (pd, &u, sizeof(u)); d = u; return (width > 0) ? printf("%*" U64_FMT " ", width, d) : snprintf(NULL, 0, "%" U64_FMT, d) ; #undef U64_FMT } void * safe_calloc(size_t number, size_t size) { void *ret = calloc(number, size); if (ret == NULL) err(EX_OSERR, "calloc"); return ret; } void * safe_realloc(void *ptr, size_t size) { void *ret = realloc(ptr, size); if (ret == NULL) err(EX_OSERR, "realloc"); return ret; } /* * conditionally runs the command. 
* Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) { int i; if (co.test_only) return 0; if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || optname == IP_FW_ADD || optname == IP_FW3 || optname == IP_FW_NAT_GET_CONFIG || optname < 0 || optname == IP_FW_NAT_GET_LOG) { if (optname < 0) optname = -optname; i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, (socklen_t *)optlen); } else { i = setsockopt(ipfw_socket, IPPROTO_IP, optname, optval, optlen); } return i; } /* * do_setcmd3 - pass ipfw control cmd to kernel * @optname: option name * @optval: pointer to option data * @optlen: option length * * Function encapsulates option value in IP_FW3 socket option * and calls setsockopt(). * Function returns 0 on success or -1 otherwise. */ static int do_setcmd3(int optname, void *optval, socklen_t optlen) { socklen_t len; ip_fw3_opheader *op3; if (co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); len = sizeof(ip_fw3_opheader) + optlen; op3 = alloca(len); /* Zero reserved fields */ memset(op3, 0, sizeof(ip_fw3_opheader)); memcpy(op3 + 1, optval, optlen); op3->opcode = optname; return setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, len); } /** * match_token takes a table and a string, returns the value associated * with the string (-1 in case of failure). */ int match_token(struct _s_x *table, char *string) { struct _s_x *pt; uint i = strlen(string); for (pt = table ; i && pt->s != NULL ; pt++) if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) return pt->x; return -1; } /** * match_value takes a table and a value, returns the string associated * with the value (NULL in case of failure). 
*/ char const * match_value(struct _s_x *p, int value) { for (; p->s != NULL; p++) if (p->x == value) return p->s; return NULL; } /* * _substrcmp takes two strings and returns 1 if they do not match, * and 0 if they match exactly or the first string is a sub-string * of the second. A warning is printed to stderr in the case that the * first string is a sub-string of the second. * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp(const char *str1, const char* str2) { if (strncmp(str1, str2, strlen(str1)) != 0) return 1; if (strlen(str1) != strlen(str2)) warnx("DEPRECATED: '%s' matched '%s' as a sub-string", str1, str2); return 0; } /* * _substrcmp2 takes three strings and returns 1 if the first two do not match, * and 0 if they match exactly or the second string is a sub-string * of the first. A warning is printed to stderr in the case that the * first string does not match the third. * * This function exists to warn about the bizarre construction * strncmp(str, "by", 2) which is used to allow people to use a shortcut * for "bytes". The problem is that in addition to accepting "by", * "byt", "byte", and "bytes", it also excepts "by_rabid_dogs" and any * other string beginning with "by". * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp2(const char *str1, const char* str2, const char* str3) { if (strncmp(str1, str2, strlen(str2)) != 0) return 1; if (strcmp(str1, str3) != 0) warnx("DEPRECATED: '%s' matched '%s'", str1, str3); return 0; } /* * prints one port, symbolic or numeric */ static void print_port(int proto, uint16_t port) { if (proto == IPPROTO_ETHERTYPE) { char const *s; if (co.do_resolv && (s = match_value(ether_types, port)) ) printf("%s", s); else printf("0x%04x", port); } else { struct servent *se = NULL; if (co.do_resolv) { struct protoent *pe = getprotobynumber(proto); se = getservbyport(htons(port), pe ? 
pe->p_name : NULL); } if (se) printf("%s", se->s_name); else printf("%d", port); } } static struct _s_x _port_name[] = { {"dst-port", O_IP_DSTPORT}, {"src-port", O_IP_SRCPORT}, {"ipid", O_IPID}, {"iplen", O_IPLEN}, {"ipttl", O_IPTTL}, {"mac-type", O_MAC_TYPE}, {"tcpdatalen", O_TCPDATALEN}, {"tcpwin", O_TCPWIN}, {"tagged", O_TAGGED}, {NULL, 0} }; /* * Print the values in a list 16-bit items of the types above. * XXX todo: add support for mask. */ static void print_newports(ipfw_insn_u16 *cmd, int proto, int opcode) { uint16_t *p = cmd->ports; int i; char const *sep; if (opcode != 0) { sep = match_value(_port_name, opcode); if (sep == NULL) sep = "???"; printf (" %s", sep); } sep = " "; for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { printf("%s", sep); print_port(proto, p[0]); if (p[0] != p[1]) { printf("-"); print_port(proto, p[1]); } sep = ","; } } /* * Like strtol, but also translates service names into port numbers * for some protocols. * In particular: * proto == -1 disables the protocol check; * proto == IPPROTO_ETHERTYPE looks up an internal table * proto == matches the values there. * Returns *end == s in case the parameter is not found. */ static int strtoport(char *s, char **end, int base, int proto) { char *p, *buf; char *s1; int i; *end = s; /* default - not found */ if (*s == '\0') return 0; /* not found */ if (isdigit(*s)) return strtol(s, end, base); /* * find separator. '\\' escapes the next char. */ for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) if (*s1 == '\\' && s1[1] != '\0') s1++; buf = safe_calloc(s1 - s + 1, 1); /* * copy into a buffer skipping backslashes */ for (p = s, i = 0; p != s1 ; p++) if (*p != '\\') buf[i++] = *p; buf[i++] = '\0'; if (proto == IPPROTO_ETHERTYPE) { i = match_token(ether_types, buf); free(buf); if (i != -1) { /* found */ *end = s1; return i; } } else { struct protoent *pe = NULL; struct servent *se; if (proto != 0) pe = getprotobynumber(proto); setservent(1); se = getservbyname(buf, pe ? 
pe->p_name : NULL); free(buf); if (se != NULL) { *end = s1; return ntohs(se->s_port); } } return 0; /* not found */ } /* * Fill the body of the command with the list of port ranges. */ static int fill_newports(ipfw_insn_u16 *cmd, char *av, int proto) { uint16_t a, b, *p = cmd->ports; int i = 0; char *s = av; while (*s) { a = strtoport(av, &s, 0, proto); if (s == av) /* empty or invalid argument */ return (0); switch (*s) { case '-': /* a range */ av = s + 1; b = strtoport(av, &s, 0, proto); /* Reject expressions like '1-abc' or '1-2-3'. */ if (s == av || (*s != ',' && *s != '\0')) return (0); p[0] = a; p[1] = b; break; case ',': /* comma separated list */ case '\0': p[0] = p[1] = a; break; default: warnx("port list: invalid separator <%c> in <%s>", *s, av); return (0); } i++; p += 2; av = s + 1; } if (i > 0) { if (i + 1 > F_LEN_MASK) errx(EX_DATAERR, "too many ports/ranges\n"); cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ } return (i); } static struct _s_x icmpcodes[] = { { "net", ICMP_UNREACH_NET }, { "host", ICMP_UNREACH_HOST }, { "protocol", ICMP_UNREACH_PROTOCOL }, { "port", ICMP_UNREACH_PORT }, { "needfrag", ICMP_UNREACH_NEEDFRAG }, { "srcfail", ICMP_UNREACH_SRCFAIL }, { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, { "isolated", ICMP_UNREACH_ISOLATED }, { "net-prohib", ICMP_UNREACH_NET_PROHIB }, { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, { "tosnet", ICMP_UNREACH_TOSNET }, { "toshost", ICMP_UNREACH_TOSHOST }, { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, { NULL, 0 } }; static void fill_reject_code(u_short *codep, char *str) { int val; char *s; val = strtoul(str, &s, 0); if (s == str || *s != '\0' || val >= 0x100) val = match_token(icmpcodes, str); if (val < 0) errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); *codep = val; return; } static void print_reject_code(uint16_t code) { 
char const *s = match_value(icmpcodes, code); if (s != NULL) printf("unreach %s", s); else printf("unreach %u", code); } /* * Returns the number of bits set (from left) in a contiguous bitmask, * or -1 if the mask is not contiguous. * XXX this needs a proper fix. * This effectively works on masks in big-endian (network) format. * when compiled on little endian architectures. * * First bit is bit 7 of the first byte -- note, for MAC addresses, * the first bit on the wire is bit 0 of the first byte. * len is the max length in bits. */ int contigmask(uint8_t *p, int len) { int i, n; for (i=0; iarg1 & 0xff; uint8_t clear = (cmd->arg1 >> 8) & 0xff; if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { printf(" setup"); return; } printf(" %s ", name); for (i=0; list[i].x != 0; i++) { if (set & list[i].x) { set &= ~list[i].x; printf("%s%s", comma, list[i].s); comma = ","; } if (clear & list[i].x) { clear &= ~list[i].x; printf("%s!%s", comma, list[i].s); comma = ","; } } } /* * Print the ip address contained in a command. */ static void print_ip(ipfw_insn_ip *cmd, char const *s) { struct hostent *he = NULL; uint32_t len = F_LEN((ipfw_insn *)cmd); uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { uint32_t d = a[1]; const char *arg = ""; if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) arg = match_value(rule_options, lookup_key[d]); printf("%s lookup %s %d", cmd->o.len & F_NOT ? " not": "", arg, cmd->o.arg1); return; } printf("%s%s ", cmd->o.len & F_NOT ? 
" not": "", s); if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { printf("me"); return; } if (cmd->o.opcode == O_IP_SRC_LOOKUP || cmd->o.opcode == O_IP_DST_LOOKUP) { printf("table(%u", ((ipfw_insn *)cmd)->arg1); if (len == F_INSN_SIZE(ipfw_insn_u32)) printf(",%u", *a); printf(")"); return; } if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { uint32_t x, *map = (uint32_t *)&(cmd->mask); int i, j; char comma = '{'; x = cmd->o.arg1 - 1; x = htonl( ~x ); cmd->addr.s_addr = htonl(cmd->addr.s_addr); printf("%s/%d", inet_ntoa(cmd->addr), contigmask((uint8_t *)&x, 32)); x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); x &= 0xff; /* base */ /* * Print bits and ranges. * Locate first bit set (i), then locate first bit unset (j). * If we have 3+ consecutive bits set, then print them as a * range, otherwise only print the initial bit and rescan. */ for (i=0; i < cmd->o.arg1; i++) if (map[i/32] & (1<<(i & 31))) { for (j=i+1; j < cmd->o.arg1; j++) if (!(map[ j/32] & (1<<(j & 31)))) break; printf("%c%d", comma, i+x); if (j>i+2) { /* range has at least 3 elements */ printf("-%d", j-1+x); i = j-1; } comma = ','; } printf("}"); return; } /* * len == 2 indicates a single IP, whereas lists of 1 or more * addr/mask pairs have len = (2n+1). We convert len to n so we * use that to count the number of entries. */ for (len = len / 2; len > 0; len--, a += 2) { int mb = /* mask length */ (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 
32 : contigmask((uint8_t *)&(a[1]), 32); if (mb == 32 && co.do_resolv) he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); if (he != NULL) /* resolved to name */ printf("%s", he->h_name); else if (mb == 0) /* any */ printf("any"); else { /* numeric IP followed by some kind of mask */ printf("%s", inet_ntoa( *((struct in_addr *)&a[0]) ) ); if (mb < 0) printf(":%s", inet_ntoa( *((struct in_addr *)&a[1]) ) ); else if (mb < 32) printf("/%d", mb); } if (len > 1) printf(","); } } /* * prints a MAC address/mask pair */ static void print_mac(uint8_t *addr, uint8_t *mask) { int l = contigmask(mask, 48); if (l == 0) printf(" any"); else { printf(" %02x:%02x:%02x:%02x:%02x:%02x", addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); if (l == -1) printf("&%02x:%02x:%02x:%02x:%02x:%02x", mask[0], mask[1], mask[2], mask[3], mask[4], mask[5]); else if (l < 48) printf("/%d", l); } } static void fill_icmptypes(ipfw_insn_u32 *cmd, char *av) { uint8_t type; cmd->d[0] = 0; while (*av) { if (*av == ',') av++; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ICMP type"); if (type > 31) errx(EX_DATAERR, "ICMP type out of range"); cmd->d[0] |= 1 << type; } cmd->o.opcode = O_ICMPTYPE; cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); } static void print_icmptypes(ipfw_insn_u32 *cmd) { int i; char sep= ' '; printf(" icmptypes"); for (i = 0; i < 32; i++) { if ( (cmd->d[0] & (1 << (i))) == 0) continue; printf("%c%d", sep, i); sep = ','; } } /* * show_ipfw() prints the body of an ipfw rule. * Because the standard rule has at least proto src_ip dst_ip, we use * a helper function to produce these entries if not provided explicitly. * The first argument is the list of fields we have, the second is * the list of fields we want to be printed. 
* * Special cases if we have provided a MAC header: * + if the rule does not contain IP addresses/ports, do not print them; * + if the rule does not contain an IP proto, print "all" instead of "ip"; * * Once we have 'have_options', IP header fields are printed as options. */ #define HAVE_PROTO 0x0001 #define HAVE_SRCIP 0x0002 #define HAVE_DSTIP 0x0004 #define HAVE_PROTO4 0x0008 #define HAVE_PROTO6 0x0010 #define HAVE_IP 0x0100 #define HAVE_OPTIONS 0x8000 static void show_prerequisites(int *flags, int want, int cmd) { (void)cmd; /* UNUSED */ if (co.comment_only) return; if ( (*flags & HAVE_IP) == HAVE_IP) *flags |= HAVE_OPTIONS; if ( !(*flags & HAVE_OPTIONS)) { if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { if ( (*flags & HAVE_PROTO4)) printf(" ip4"); else if ( (*flags & HAVE_PROTO6)) printf(" ip6"); else printf(" ip"); } if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) printf(" from any"); if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) printf(" to any"); } *flags |= want; } static void show_ipfw(struct ip_fw *rule, int pcwidth, int bcwidth) { static int twidth = 0; int l; ipfw_insn *cmd, *tagptr = NULL; const char *comment = NULL; /* ptr to comment if we have one */ int proto = 0; /* default */ int flags = 0; /* prerequisites */ ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */ int or_block = 0; /* we are in an or block */ uint32_t set_disable; bcopy(&rule->next_rule, &set_disable, sizeof(set_disable)); if (set_disable & (1 << rule->set)) { /* disabled */ if (!co.show_sets) return; else printf("# DISABLED "); } printf("%05u ", rule->rulenum); if (pcwidth > 0 || bcwidth > 0) { pr_u64(&rule->pcnt, pcwidth); pr_u64(&rule->bcnt, bcwidth); } if (co.do_time == 2) printf("%10u ", rule->timestamp); else if (co.do_time == 1) { char timestr[30]; time_t t = (time_t)0; if (twidth == 0) { strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; twidth = strlen(timestr); } if 
(rule->timestamp) { t = _long_to_time(rule->timestamp); strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; printf("%s ", timestr); } else { printf("%*s", twidth, " "); } } if (co.show_sets) printf("set %d ", rule->set); /* * print the optional "match probability" */ if (rule->cmd_len > 0) { cmd = rule->cmd ; if (cmd->opcode == O_PROB) { ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; double d = 1.0 * p->d[0]; d = (d / 0x7fffffff); printf("prob %f ", d); } } /* * first print actions */ for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { switch(cmd->opcode) { case O_CHECK_STATE: printf("check-state"); /* avoid printing anything else */ flags = HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP; break; case O_ACCEPT: printf("allow"); break; case O_COUNT: printf("count"); break; case O_DENY: printf("deny"); break; case O_REJECT: if (cmd->arg1 == ICMP_REJECT_RST) printf("reset"); else if (cmd->arg1 == ICMP_UNREACH_HOST) printf("reject"); else print_reject_code(cmd->arg1); break; case O_UNREACH6: if (cmd->arg1 == ICMP6_UNREACH_RST) printf("reset6"); else print_unreach6_code(cmd->arg1); break; case O_SKIPTO: PRINT_UINT_ARG("skipto ", cmd->arg1); break; case O_PIPE: PRINT_UINT_ARG("pipe ", cmd->arg1); break; case O_QUEUE: PRINT_UINT_ARG("queue ", cmd->arg1); break; case O_DIVERT: PRINT_UINT_ARG("divert ", cmd->arg1); break; case O_TEE: PRINT_UINT_ARG("tee ", cmd->arg1); break; case O_NETGRAPH: PRINT_UINT_ARG("netgraph ", cmd->arg1); break; case O_NGTEE: PRINT_UINT_ARG("ngtee ", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; if (s->sa.sin_addr.s_addr == INADDR_ANY) { printf("fwd tablearg"); } else { printf("fwd %s", inet_ntoa(s->sa.sin_addr)); } if (s->sa.sin_port) printf(",%d", s->sa.sin_port); } break; case O_FORWARD_IP6: { char buf[4 + INET6_ADDRSTRLEN + 1]; ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd; printf("fwd %s", inet_ntop(AF_INET6, &s->sa.sin6_addr, buf, sizeof(buf))); if 
(s->sa.sin6_port) printf(",%d", s->sa.sin6_port); } break; case O_LOG: /* O_LOG is printed last */ logptr = (ipfw_insn_log *)cmd; break; case O_ALTQ: /* O_ALTQ is printed after O_LOG */ altqptr = (ipfw_insn_altq *)cmd; break; case O_TAG: tagptr = cmd; break; case O_NAT: if (cmd->arg1 != 0) PRINT_UINT_ARG("nat ", cmd->arg1); else printf("nat global"); break; case O_SETFIB: PRINT_UINT_ARG("setfib ", cmd->arg1); break; case O_REASS: printf("reass"); break; case O_CALLRETURN: if (cmd->len & F_NOT) printf("return"); else PRINT_UINT_ARG("call ", cmd->arg1); break; default: printf("** unrecognized action %d len %d ", cmd->opcode, cmd->len); } } if (logptr) { if (logptr->max_log > 0) printf(" log logamount %d", logptr->max_log); else printf(" log"); } #ifndef NO_ALTQ if (altqptr) { print_altq_cmd(altqptr); } #endif if (tagptr) { if (tagptr->len & F_NOT) PRINT_UINT_ARG(" untag ", tagptr->arg1); else PRINT_UINT_ARG(" tag ", tagptr->arg1); } /* * then print the body. */ for (l = rule->act_ofs, cmd = rule->cmd ; l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { if ((cmd->len & F_OR) || (cmd->len & F_NOT)) continue; if (cmd->opcode == O_IP4) { flags |= HAVE_PROTO4; break; } else if (cmd->opcode == O_IP6) { flags |= HAVE_PROTO6; break; } } if (rule->_pad & 1) { /* empty rules before options */ if (!co.do_compact) { show_prerequisites(&flags, HAVE_PROTO, 0); printf(" from any to any"); } flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP; } if (co.comment_only) comment = "..."; for (l = rule->act_ofs, cmd = rule->cmd ; l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { /* useful alias */ ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; if (co.comment_only) { if (cmd->opcode != O_NOP) continue; printf(" // %s\n", (char *)(cmd + 1)); return; } show_prerequisites(&flags, 0, cmd->opcode); switch(cmd->opcode) { case O_PROB: break; /* done already */ case O_PROBE_STATE: break; /* no need to print anything here */ case O_IP_SRC: case O_IP_SRC_LOOKUP: case O_IP_SRC_MASK: case 
O_IP_SRC_ME: case O_IP_SRC_SET: show_prerequisites(&flags, HAVE_PROTO, 0); if (!(flags & HAVE_SRCIP)) printf(" from"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip((ipfw_insn_ip *)cmd, (flags & HAVE_OPTIONS) ? " src-ip" : ""); flags |= HAVE_SRCIP; break; case O_IP_DST: case O_IP_DST_LOOKUP: case O_IP_DST_MASK: case O_IP_DST_ME: case O_IP_DST_SET: show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); if (!(flags & HAVE_DSTIP)) printf(" to"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip((ipfw_insn_ip *)cmd, (flags & HAVE_OPTIONS) ? " dst-ip" : ""); flags |= HAVE_DSTIP; break; case O_IP6_SRC: case O_IP6_SRC_MASK: case O_IP6_SRC_ME: show_prerequisites(&flags, HAVE_PROTO, 0); if (!(flags & HAVE_SRCIP)) printf(" from"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip6((ipfw_insn_ip6 *)cmd, (flags & HAVE_OPTIONS) ? " src-ip6" : ""); flags |= HAVE_SRCIP | HAVE_PROTO; break; case O_IP6_DST: case O_IP6_DST_MASK: case O_IP6_DST_ME: show_prerequisites(&flags, HAVE_PROTO|HAVE_SRCIP, 0); if (!(flags & HAVE_DSTIP)) printf(" to"); if ((cmd->len & F_OR) && !or_block) printf(" {"); print_ip6((ipfw_insn_ip6 *)cmd, (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); flags |= HAVE_DSTIP; break; case O_FLOW6ID: print_flow6id( (ipfw_insn_u32 *) cmd ); flags |= HAVE_OPTIONS; break; case O_IP_DSTPORT: show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP, 0); case O_IP_SRCPORT: if (flags & HAVE_DSTIP) flags |= HAVE_IP; show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP, 0); if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT) printf(" not"); print_newports((ipfw_insn_u16 *)cmd, proto, (flags & HAVE_OPTIONS) ? 
cmd->opcode : 0); break; case O_PROTO: { struct protoent *pe = NULL; if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT) printf(" not"); proto = cmd->arg1; pe = getprotobynumber(cmd->arg1); if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && !(flags & HAVE_PROTO)) show_prerequisites(&flags, HAVE_PROTO | HAVE_IP | HAVE_SRCIP | HAVE_DSTIP | HAVE_OPTIONS, 0); if (flags & HAVE_OPTIONS) printf(" proto"); if (pe) printf(" %s", pe->p_name); else printf(" %u", cmd->arg1); } flags |= HAVE_PROTO; break; default: /*options ... */ if (!(cmd->len & (F_OR|F_NOT))) if (((cmd->opcode == O_IP6) && (flags & HAVE_PROTO6)) || ((cmd->opcode == O_IP4) && (flags & HAVE_PROTO4))) break; show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); if ((cmd->len & F_OR) && !or_block) printf(" {"); if (cmd->len & F_NOT && cmd->opcode != O_IN) printf(" not"); switch(cmd->opcode) { case O_MACADDR2: { ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; printf(" MAC"); print_mac(m->addr, m->mask); print_mac(m->addr + 6, m->mask + 6); } break; case O_MAC_TYPE: print_newports((ipfw_insn_u16 *)cmd, IPPROTO_ETHERTYPE, cmd->opcode); break; case O_FRAG: printf(" frag"); break; case O_FIB: printf(" fib %u", cmd->arg1 ); break; case O_SOCKARG: printf(" sockarg"); break; case O_IN: printf(cmd->len & F_NOT ? 
" out" : " in"); break; case O_DIVERTED: switch (cmd->arg1) { case 3: printf(" diverted"); break; case 1: printf(" diverted-loopback"); break; case 2: printf(" diverted-output"); break; default: printf(" diverted-?<%u>", cmd->arg1); break; } break; case O_LAYER2: printf(" layer2"); break; case O_XMIT: case O_RECV: case O_VIA: { char const *s; ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; if (cmd->opcode == O_XMIT) s = "xmit"; else if (cmd->opcode == O_RECV) s = "recv"; else /* if (cmd->opcode == O_VIA) */ s = "via"; if (cmdif->name[0] == '\0') printf(" %s %s", s, inet_ntoa(cmdif->p.ip)); else if (cmdif->name[0] == '\1') /* interface table */ printf(" %s table(%d)", s, cmdif->p.glob); else printf(" %s %s", s, cmdif->name); break; } case O_IPID: if (F_LEN(cmd) == 1) printf(" ipid %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_IPID); break; case O_IPTTL: if (F_LEN(cmd) == 1) printf(" ipttl %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_IPTTL); break; case O_IPVER: printf(" ipver %u", cmd->arg1 ); break; case O_IPPRECEDENCE: printf(" ipprecedence %u", (cmd->arg1) >> 5 ); break; case O_IPLEN: if (F_LEN(cmd) == 1) printf(" iplen %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_IPLEN); break; case O_IPOPT: print_flags("ipoptions", cmd, f_ipopts); break; case O_IPTOS: print_flags("iptos", cmd, f_iptos); break; case O_ICMPTYPE: print_icmptypes((ipfw_insn_u32 *)cmd); break; case O_ESTAB: printf(" established"); break; case O_TCPDATALEN: if (F_LEN(cmd) == 1) printf(" tcpdatalen %u", cmd->arg1 ); else print_newports((ipfw_insn_u16 *)cmd, 0, O_TCPDATALEN); break; case O_TCPFLAGS: print_flags("tcpflags", cmd, f_tcpflags); break; case O_TCPOPTS: print_flags("tcpoptions", cmd, f_tcpopts); break; case O_TCPWIN: if (F_LEN(cmd) == 1) printf(" tcpwin %u", cmd->arg1); else print_newports((ipfw_insn_u16 *)cmd, 0, O_TCPWIN); break; case O_TCPACK: printf(" tcpack %d", ntohl(cmd32->d[0])); break; case O_TCPSEQ: printf(" tcpseq %d", 
ntohl(cmd32->d[0])); break; case O_UID: { struct passwd *pwd = getpwuid(cmd32->d[0]); if (pwd) printf(" uid %s", pwd->pw_name); else printf(" uid %u", cmd32->d[0]); } break; case O_GID: { struct group *grp = getgrgid(cmd32->d[0]); if (grp) printf(" gid %s", grp->gr_name); else printf(" gid %u", cmd32->d[0]); } break; case O_JAIL: printf(" jail %d", cmd32->d[0]); break; case O_VERREVPATH: printf(" verrevpath"); break; case O_VERSRCREACH: printf(" versrcreach"); break; case O_ANTISPOOF: printf(" antispoof"); break; case O_IPSEC: printf(" ipsec"); break; case O_NOP: comment = (char *)(cmd + 1); break; case O_KEEP_STATE: printf(" keep-state"); break; case O_LIMIT: { struct _s_x *p = limit_masks; ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; uint8_t x = c->limit_mask; char const *comma = " "; printf(" limit"); for (; p->x != 0 ; p++) if ((x & p->x) == p->x) { x &= ~p->x; printf("%s%s", comma, p->s); comma = ","; } PRINT_UINT_ARG(" ", c->conn_limit); break; } case O_IP6: printf(" ip6"); break; case O_IP4: printf(" ip4"); break; case O_ICMP6TYPE: print_icmp6types((ipfw_insn_u32 *)cmd); break; case O_EXT_HDR: print_ext6hdr( (ipfw_insn *) cmd ); break; case O_TAGGED: if (F_LEN(cmd) == 1) PRINT_UINT_ARG(" tagged ", cmd->arg1); else print_newports((ipfw_insn_u16 *)cmd, 0, O_TAGGED); break; default: printf(" [opcode %d len %d]", cmd->opcode, cmd->len); } } if (cmd->len & F_OR) { printf(" or"); or_block = 1; } else if (or_block) { printf(" }"); or_block = 0; } } show_prerequisites(&flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP, 0); if (comment) printf(" // %s", comment); printf("\n"); } static void show_dyn_ipfw(ipfw_dyn_rule *d, int pcwidth, int bcwidth) { struct protoent *pe; struct in_addr a; uint16_t rulenum; char buf[INET6_ADDRSTRLEN]; if (!co.do_expired) { if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) return; } bcopy(&d->rule, &rulenum, sizeof(rulenum)); printf("%05d", rulenum); if (pcwidth > 0 || bcwidth > 0) { printf(" "); pr_u64(&d->pcnt, pcwidth); 
pr_u64(&d->bcnt, bcwidth); printf("(%ds)", d->expire); } switch (d->dyn_type) { case O_LIMIT_PARENT: printf(" PARENT %d", d->count); break; case O_LIMIT: printf(" LIMIT"); break; case O_KEEP_STATE: /* bidir, no mask */ printf(" STATE"); break; } if ((pe = getprotobynumber(d->id.proto)) != NULL) printf(" %s", pe->p_name); else printf(" proto %u", d->id.proto); if (d->id.addr_type == 4) { a.s_addr = htonl(d->id.src_ip); printf(" %s %d", inet_ntoa(a), d->id.src_port); a.s_addr = htonl(d->id.dst_ip); printf(" <-> %s %d", inet_ntoa(a), d->id.dst_port); } else if (d->id.addr_type == 6) { printf(" %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, sizeof(buf)), d->id.src_port); printf(" <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf, sizeof(buf)), d->id.dst_port); } else printf(" UNKNOWN <-> UNKNOWN\n"); printf("\n"); } /* * This one handles all set-related commands * ipfw set { show | enable | disable } * ipfw set swap X Y * ipfw set move X to Y * ipfw set move rule X to Y */ void ipfw_sets_handler(char *av[]) { uint32_t set_disable, masks[2]; int i, nbytes; uint16_t rulenum; uint8_t cmd, new_set; av++; if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { void *data = NULL; char const *msg; int nalloc; nalloc = nbytes = sizeof(struct ip_fw); while (nbytes >= nalloc) { if (data) free(data); nalloc = nalloc * 2 + 200; nbytes = nalloc; data = safe_calloc(1, nbytes); if (do_cmd(IP_FW_GET, data, (uintptr_t)&nbytes) < 0) err(EX_OSERR, "getsockopt(IP_FW_GET)"); } bcopy(&((struct ip_fw *)data)->next_rule, &set_disable, sizeof(set_disable)); for (i = 0, msg = "disable" ; i < RESVD_SET; i++) if ((set_disable & (1< RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[0]); if (!isdigit(*(av[1])) || new_set > RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[1]); masks[0] = (4 << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "move") == 0) { av++; if (av[0] && 
_substrcmp(*av, "rule") == 0) { cmd = 2; av++; } else cmd = 3; if (av[0] == NULL || av[1] == NULL || av[2] == NULL || av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); rulenum = atoi(av[0]); new_set = atoi(av[2]); if (!isdigit(*(av[0])) || (cmd == 3 && rulenum > RESVD_SET) || (cmd == 2 && rulenum == IPFW_DEFAULT_RULE) ) errx(EX_DATAERR, "invalid source number %s\n", av[0]); if (!isdigit(*(av[2])) || new_set > RESVD_SET) errx(EX_DATAERR, "invalid dest. set %s\n", av[1]); masks[0] = (cmd << 24) | (new_set << 16) | (rulenum); i = do_cmd(IP_FW_DEL, masks, sizeof(uint32_t)); } else if (_substrcmp(*av, "disable") == 0 || _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; av++; masks[0] = masks[1] = 0; while (av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) errx(EX_DATAERR, "invalid set number %d\n", i); masks[which] |= (1<= nalloc) { nalloc = nalloc * 2 + 200; nbytes = nalloc; data = safe_realloc(data, nbytes); if (do_cmd(ocmd, data, (uintptr_t)&nbytes) < 0) err(EX_OSERR, "getsockopt(IP_%s_GET)", co.do_pipe ? "DUMMYNET" : "FW"); } /* * Count static rules. They have variable size so we * need to scan the list to count them. */ for (nstat = 1, r = data, lim = (char *)data + nbytes; r->rulenum < IPFW_DEFAULT_RULE && (char *)r < lim; ++nstat, r = NEXT(r) ) ; /* nothing */ /* * Count dynamic rules. This is easier as they have * fixed size. 
*/ r = NEXT(r); dynrules = (ipfw_dyn_rule *)r ; n = (char *)r - (char *)data; ndyn = (nbytes - n) / sizeof *dynrules; /* if showing stats, figure out column widths ahead of time */ bcwidth = pcwidth = 0; if (show_counters) { for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { /* skip rules from another set */ if (co.use_set && r->set != co.use_set - 1) continue; /* packet counter */ width = pr_u64(&r->pcnt, 0); if (width > pcwidth) pcwidth = width; /* byte counter */ width = pr_u64(&r->bcnt, 0); if (width > bcwidth) bcwidth = width; } } if (co.do_dynamic && ndyn) { for (n = 0, d = dynrules; n < ndyn; n++, d++) { if (co.use_set) { /* skip rules from another set */ bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co.use_set - 1) continue; } width = pr_u64(&d->pcnt, 0); if (width > pcwidth) pcwidth = width; width = pr_u64(&d->bcnt, 0); if (width > bcwidth) bcwidth = width; } } /* if no rule numbers were specified, list all rules */ if (ac == 0) { for (n = 0, r = data; n < nstat; n++, r = NEXT(r)) { if (co.use_set && r->set != co.use_set - 1) continue; show_ipfw(r, pcwidth, bcwidth); } if (co.do_dynamic && ndyn) { printf("## Dynamic rules (%d):\n", ndyn); for (n = 0, d = dynrules; n < ndyn; n++, d++) { if (co.use_set) { bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co.use_set - 1) continue; } show_dyn_ipfw(d, pcwidth, bcwidth); } } goto done; } /* display specific rules requested on command line */ for (lac = ac, lav = av; lac != 0; lac--) { /* convert command line rule # */ last = rnum = strtoul(*lav++, &endptr, 10); if (*endptr == '-') last = strtoul(endptr+1, &endptr, 10); if (*endptr) { exitval = EX_USAGE; warnx("invalid rule number: %s", *(lav - 1)); continue; } for (n = seen = 0, r = data; n < nstat; n++, r = NEXT(r) ) { if (r->rulenum > last) break; if (co.use_set && r->set != co.use_set - 1) continue; if (r->rulenum >= rnum && r->rulenum <= last) { show_ipfw(r, pcwidth, bcwidth); seen = 1; } } if 
(!seen) { /* give precedence to other error(s) */ if (exitval == EX_OK) exitval = EX_UNAVAILABLE; warnx("rule %lu does not exist", rnum); } } if (co.do_dynamic && ndyn) { printf("## Dynamic rules:\n"); for (lac = ac, lav = av; lac != 0; lac--) { last = rnum = strtoul(*lav++, &endptr, 10); if (*endptr == '-') last = strtoul(endptr+1, &endptr, 10); if (*endptr) /* already warned */ continue; for (n = 0, d = dynrules; n < ndyn; n++, d++) { uint16_t rulenum; bcopy(&d->rule, &rulenum, sizeof(rulenum)); if (rulenum > rnum) break; if (co.use_set) { bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co.use_set - 1) continue; } if (r->rulenum >= rnum && r->rulenum <= last) show_dyn_ipfw(d, pcwidth, bcwidth); } } } ac = 0; done: free(data); if (exitval != EX_OK) exit(exitval); #undef NEXT } static int lookup_host (char *host, struct in_addr *ipaddr) { struct hostent *he; if (!inet_aton(host, ipaddr)) { if ((he = gethostbyname(host)) == NULL) return(-1); *ipaddr = *(struct in_addr *)he->h_addr_list[0]; } return(0); } /* * fills the addr and mask fields in the instruction as appropriate from av. * Update length as appropriate. * The following formats are allowed: * me returns O_IP_*_ME * 1.2.3.4 single IP address * 1.2.3.4:5.6.7.8 address:mask * 1.2.3.4/24 address/mask * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet * We can have multiple comma-separated address/mask entries. 
*/ static void fill_ip(ipfw_insn_ip *cmd, char *av) { int len = 0; uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; cmd->o.len &= ~F_LEN_MASK; /* zero len */ if (_substrcmp(av, "any") == 0) return; if (_substrcmp(av, "me") == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn); return; } if (strncmp(av, "table(", 6) == 0) { char *p = strchr(av + 6, ','); if (p) *p++ = '\0'; cmd->o.opcode = O_IP_DST_LOOKUP; cmd->o.arg1 = strtoul(av + 6, NULL, 0); if (p) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); d[0] = strtoul(p, NULL, 0); } else cmd->o.len |= F_INSN_SIZE(ipfw_insn); return; } while (av) { /* * After the address we can have '/' or ':' indicating a mask, * ',' indicating another address follows, '{' indicating a * set of addresses of unspecified size. */ char *t = NULL, *p = strpbrk(av, "/:,{"); int masklen; char md, nd = '\0'; if (p) { md = *p; *p++ = '\0'; if ((t = strpbrk(p, ",{")) != NULL) { nd = *t; *t = '\0'; } } else md = '\0'; if (lookup_host(av, (struct in_addr *)&d[0]) != 0) errx(EX_NOHOST, "hostname ``%s'' unknown", av); switch (md) { case ':': if (!inet_aton(p, (struct in_addr *)&d[1])) errx(EX_DATAERR, "bad netmask ``%s''", p); break; case '/': masklen = atoi(p); if (masklen == 0) d[1] = htonl(0); /* mask */ else if (masklen > 32) errx(EX_DATAERR, "bad width ``%s''", p); else d[1] = htonl(~0 << (32 - masklen)); break; case '{': /* no mask, assume /24 and put back the '{' */ d[1] = htonl(~0 << (32 - 24)); *(--p) = md; break; case ',': /* single address plus continuation */ *(--p) = md; /* FALLTHROUGH */ case 0: /* initialization value */ default: d[1] = htonl(~0); /* force /32 */ break; } d[0] &= d[1]; /* mask base address with mask */ if (t) *t = nd; /* find next separator */ if (p) p = strpbrk(p, ",{"); if (p && *p == '{') { /* * We have a set of addresses. They are stored as follows: * arg1 is the set size (powers of 2, 2..256) * addr is the base address IN HOST FORMAT * mask.. 
is an array of arg1 bits (rounded up to * the next multiple of 32) with bits set * for each host in the map. */ uint32_t *map = (uint32_t *)&cmd->mask; int low, high; int i = contigmask((uint8_t *)&(d[1]), 32); if (len > 0) errx(EX_DATAERR, "address set cannot be in a list"); if (i < 24 || i > 31) errx(EX_DATAERR, "invalid set with mask %d\n", i); cmd->o.arg1 = 1<<(32-i); /* map length */ d[0] = ntohl(d[0]); /* base addr in host format */ cmd->o.opcode = O_IP_DST_SET; /* default */ cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) map[i] = 0; /* clear map */ av = p + 1; low = d[0] & 0xff; high = low + cmd->o.arg1 - 1; /* * Here, i stores the previous value when we specify a range * of addresses within a mask, e.g. 45-63. i = -1 means we * have no previous value. */ i = -1; /* previous value in a range */ while (isdigit(*av)) { char *s; int a = strtol(av, &s, 0); if (s == av) { /* no parameter */ if (*av != '}') errx(EX_DATAERR, "set not closed\n"); if (i != -1) errx(EX_DATAERR, "incomplete range %d-", i); break; } if (a < low || a > high) errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", a, low, high); a -= low; if (i == -1) /* no previous in range */ i = a; else { /* check that range is valid */ if (i > a) errx(EX_DATAERR, "invalid range %d-%d", i+low, a+low); if (*s == '-') errx(EX_DATAERR, "double '-' in range"); } for (; i <= a; i++) map[i/32] |= 1<<(i & 31); i = -1; if (*s == '-') i = a; else if (*s == '}') break; av = s+1; } return; } av = p; if (av) /* then *av must be a ',' */ av++; /* Check this entry */ if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ /* * 'any' turns the entire list into a NOP. * 'not any' never matches, so it is removed from the * list unless it is the only item, in which case we * report an error. 
*/ if (cmd->o.len & F_NOT) { /* "not any" never matches */ if (av == NULL && len == 0) /* only this entry */ errx(EX_DATAERR, "not any never matches"); } /* else do nothing and skip this entry */ return; } /* A single IP can be stored in an optimized format */ if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } len += 2; /* two words... */ d += 2; } /* end while */ if (len + 1 > F_LEN_MASK) errx(EX_DATAERR, "address list too long"); cmd->o.len |= len+1; } /* n2mask sets n bits of the mask */ void n2mask(struct in6_addr *mask, int n) { static int minimask[9] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; u_char *p; memset(mask, 0, sizeof(struct in6_addr)); p = (u_char *) mask; for (; n > 0; p++, n -= 8) { if (n >= 8) *p = 0xff; else *p = minimask[n]; } return; } /* * helper function to process a set of flags and set bits in the * appropriate masks. */ static void fill_flags(ipfw_insn *cmd, enum ipfw_opcodes opcode, struct _s_x *flags, char *p) { uint8_t set=0, clear=0; while (p && *p) { char *q; /* points to the separator */ int val; uint8_t *which; /* mask we are working on */ if (*p == '!') { p++; which = &clear; } else which = &set; q = strchr(p, ','); if (q) *q++ = '\0'; val = match_token(flags, p); if (val <= 0) errx(EX_DATAERR, "invalid flag %s", p); *which |= (uint8_t)val; p = q; } cmd->opcode = opcode; cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); } void ipfw_delete(char *av[]) { uint32_t rulenum; int i; int exitval = EX_OK; int do_set = 0; av++; NEED1("missing rule specification"); if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ av++; } /* Rule number */ while (*av && isdigit(**av)) { i = atoi(*av); av++; if (co.do_nat) { exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); if (exitval) { exitval = 
EX_UNAVAILABLE; warn("rule %u not available", i); } } else if (co.do_pipe) { exitval = ipfw_delete_pipe(co.do_pipe, i); } else { if (co.use_set) rulenum = (i & 0xffff) | (5 << 24) | ((co.use_set - 1) << 16); else rulenum = (i & 0xffff) | (do_set << 24); i = do_cmd(IP_FW_DEL, &rulenum, sizeof rulenum); if (i) { exitval = EX_UNAVAILABLE; warn("rule %u: setsockopt(IP_FW_DEL)", rulenum); } } } if (exitval != EX_OK) exit(exitval); } /* * fill the interface structure. We do not check the name as we can * create interfaces dynamically, so checking them at insert time * makes relatively little sense. * Interface names containing '*', '?', or '[' are assumed to be shell * patterns which match interfaces. */ static void fill_iface(ipfw_insn_if *cmd, char *arg) { cmd->name[0] = '\0'; cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); /* Parse the interface or address */ if (strcmp(arg, "any") == 0) cmd->o.len = 0; /* effectively ignore this command */ else if (strncmp(arg, "table(", 6) == 0) { char *p = strchr(arg + 6, ','); if (p) *p++ = '\0'; cmd->name[0] = '\1'; /* Special value indicating table */ cmd->p.glob = strtoul(arg + 6, NULL, 0); } else if (!isdigit(*arg)) { strlcpy(cmd->name, arg, sizeof(cmd->name)); cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0; } else if (!inet_aton(arg, &cmd->p.ip)) errx(EX_DATAERR, "bad ip address ``%s''", arg); } static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { int i; size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; if (strcmp(p, "any") == 0) { for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] = mask[i] = 0; return; } optr = ptr = strdup(p); if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { l = strlen(ap); if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) errx(EX_DATAERR, "Incorrect MAC address"); bcopy(mac, addr, ETHER_ADDR_LEN); } else errx(EX_DATAERR, "Incorrect MAC address"); if (ptr != NULL) { /* we have mask? 
*/ if (p[ptr - optr - 1] == '/') { /* mask len */ long ml = strtol(ptr, &ap, 10); if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || (mac = ether_aton(ptr)) == NULL) errx(EX_DATAERR, "Incorrect mask"); bcopy(mac, mask, ETHER_ADDR_LEN); } } else { /* default mask: ff:ff:ff:ff:ff:ff */ for (i = 0; i < ETHER_ADDR_LEN; i++) mask[i] = 0xff; } for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] &= mask[i]; free(optr); } /* * helper function, updates the pointer to cmd with the length * of the current command, and also cleans up the first word of * the new command in case it has been clobbered before. */ static ipfw_insn * next_cmd(ipfw_insn *cmd) { cmd += F_LEN(cmd); bzero(cmd, sizeof(*cmd)); return cmd; } /* * Takes arguments and copies them into a comment */ static void fill_comment(ipfw_insn *cmd, char **av) { int i, l; char *p = (char *)(cmd + 1); cmd->opcode = O_NOP; cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. */ for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; if (l > 84) errx(EX_DATAERR, "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; } *(--p) = '\0'; } /* * A function to fill simple commands of size 1. * Existing flags are preserved. */ static void fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) { cmd->opcode = opcode; cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; cmd->arg1 = arg; } /* * Fetch and add the MAC address and type, with masks. This generates one or * two microinstructions, and returns the pointer to the last one. 
*/ static ipfw_insn * add_mac(ipfw_insn *cmd, char *av[]) { ipfw_insn_mac *mac; if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); mac = (ipfw_insn_mac *)cmd; get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), &(mac->mask[ETHER_ADDR_LEN])); /* src */ return cmd; } static ipfw_insn * add_mactype(ipfw_insn *cmd, char *av) { if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE); cmd->opcode = O_MAC_TYPE; return cmd; } else return NULL; } static ipfw_insn * add_proto0(ipfw_insn *cmd, char *av, u_char *protop) { struct protoent *pe; char *ep; int proto; proto = strtol(av, &ep, 10); if (*ep != '\0' || proto <= 0) { if ((pe = getprotobyname(av)) == NULL) return NULL; proto = pe->p_proto; } fill_cmd(cmd, O_PROTO, 0, proto); *protop = proto; return cmd; } static ipfw_insn * add_proto(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else 
return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_srcip(ipfw_insn *cmd, char *av) { fill_ip((ipfw_insn_ip *)cmd, av); if (cmd->opcode == O_IP_DST_SET) /* set */ cmd->opcode = O_IP_SRC_SET; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ cmd->opcode = O_IP_SRC_LOOKUP; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_SRC_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_SRC; else /* addr/mask */ cmd->opcode = O_IP_SRC_MASK; return cmd; } static ipfw_insn * add_dstip(ipfw_insn *cmd, char *av) { fill_ip((ipfw_insn_ip *)cmd, av); if (cmd->opcode == O_IP_DST_SET) /* set */ ; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ ; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_DST_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_DST; else /* addr/mask */ cmd->opcode = O_IP_DST_MASK; return cmd; } static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode) { /* XXX "any" is trapped before. 
Perhaps "to" */ if (_substrcmp(av, "any") == 0) { return NULL; } else if (fill_newports((ipfw_insn_u16 *)cmd, av, proto)) { /* XXX todo: check that we have a protocol with ports */ cmd->opcode = opcode; return cmd; } return NULL; } static ipfw_insn * add_src(ipfw_insn *cmd, char *av, u_char proto) { struct in6_addr a; char *host, *ch; ipfw_insn *ret = NULL; if ((host = strdup(av)) == NULL) return NULL; if ((ch = strrchr(host, '/')) != NULL) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; free(host); return ret; } static ipfw_insn * add_dst(ipfw_insn *cmd, char *av, u_char proto) { struct in6_addr a; char *host, *ch; ipfw_insn *ret = NULL; if ((host = strdup(av)) == NULL) return NULL; if ((ch = strrchr(host, '/')) != NULL) *ch = '\0'; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; free(host); return ret; } /* * Parse arguments and assemble the microinstructions which make up a rule. * Rules are added into the 'rulebuf' and then copied in the correct order * into the actual rule. * * The syntax for a rule starts with the action, followed by * optional action parameters, and the various match patterns. * In the assembled microcode, the first opcode must be an O_PROBE_STATE * (generated if the rule includes a keep-state option), then the * various match patterns, log/altq actions, and the actual action. 
* */ void ipfw_add(char *av[]) { /* * rules are added into the 'rulebuf' and then copied in * the correct order into the actual rule. * Some things that need to go out of order (prob, action etc.) * go into actbuf[]. */ static uint32_t rulebuf[255], actbuf[255], cmdbuf[255]; ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; ipfw_insn *first_cmd; /* first match pattern */ struct ip_fw *rule; /* * various flags used to record that we entered some fields. */ ipfw_insn *have_state = NULL; /* check-state or keep-state */ ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; size_t len; int i; int open_par = 0; /* open parenthesis ( */ /* proto is here because it is used to fetch ports */ u_char proto = IPPROTO_IP; /* default protocol */ double match_prob = 1; /* match probability, default is always match */ bzero(actbuf, sizeof(actbuf)); /* actions go here */ bzero(cmdbuf, sizeof(cmdbuf)); bzero(rulebuf, sizeof(rulebuf)); rule = (struct ip_fw *)rulebuf; cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; av++; /* [rule N] -- Rule number optional */ if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; } /* [set N] -- set number (0..RESVD_SET), optional */ if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; av += 2; } /* [prob D] -- match probability, optional */ if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. 
%s", av[1]); av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); av++; action->len = 1; /* default */ switch(i) { case TOK_CHECKSTATE: have_state = action; action->opcode = O_CHECK_STATE; break; case TOK_ACCEPT: action->opcode = O_ACCEPT; break; case TOK_DENY: action->opcode = O_DENY; action->arg1 = 0; break; case TOK_REJECT: action->opcode = O_REJECT; action->arg1 = ICMP_UNREACH_HOST; break; case TOK_RESET: action->opcode = O_REJECT; action->arg1 = ICMP_REJECT_RST; break; case TOK_RESET6: action->opcode = O_UNREACH6; action->arg1 = ICMP6_UNREACH_RST; break; case TOK_UNREACH: action->opcode = O_REJECT; NEED1("missing reject code"); fill_reject_code(&action->arg1, *av); av++; break; case TOK_UNREACH6: action->opcode = O_UNREACH6; NEED1("missing unreach code"); fill_unreach6_code(&action->arg1, *av); av++; break; case TOK_COUNT: action->opcode = O_COUNT; break; case TOK_NAT: action->opcode = O_NAT; action->len = F_INSN_SIZE(ipfw_insn_nat); if (_substrcmp(*av, "global") == 0) { action->arg1 = 0; av++; break; } else goto chkarg; case TOK_QUEUE: action->opcode = O_QUEUE; goto chkarg; case TOK_PIPE: action->opcode = O_PIPE; goto chkarg; case TOK_SKIPTO: action->opcode = O_SKIPTO; goto chkarg; case TOK_NETGRAPH: action->opcode = O_NETGRAPH; goto chkarg; case TOK_NGTEE: action->opcode = O_NGTEE; goto chkarg; case TOK_DIVERT: action->opcode = O_DIVERT; goto chkarg; case TOK_TEE: action->opcode = O_TEE; goto chkarg; case TOK_CALL: action->opcode = O_CALLRETURN; chkarg: if (!av[0]) errx(EX_USAGE, "missing argument for %s", *(av - 1)); if (isdigit(**av)) { action->arg1 = strtoul(*av, NULL, 10); if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG) errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); } else if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TABLEARG; } else if (i == TOK_DIVERT || i == TOK_TEE) { struct servent *s; setservent(1); s = getservbyname(av[0], "divert"); if (s != NULL) action->arg1 = 
ntohs(s->s_port); else errx(EX_DATAERR, "illegal divert/tee port"); } else errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); av++; break; case TOK_FORWARD: { /* * Locate the address-port separator (':' or ','). * Could be one of the following: * hostname:port * IPv4 a.b.c.d,port * IPv4 a.b.c.d:port * IPv6 w:x:y::z,port * The ':' can only be used with hostname and IPv4 address. * XXX-BZ Should we also support [w:x:y::z]:port? */ struct sockaddr_storage result; struct addrinfo *res; char *s, *end; int family; u_short port_number; NEED1("missing forward address[:port]"); /* * locate the address-port separator (':' or ',') */ s = strchr(*av, ','); if (s == NULL) { /* Distinguish between IPv4:port and IPv6 cases. */ s = strchr(*av, ':'); if (s && strchr(s+1, ':')) s = NULL; /* no port */ } port_number = 0; if (s != NULL) { /* Terminate host portion and set s to start of port. */ *(s++) = '\0'; i = strtoport(s, &end, 0 /* base */, 0 /* proto */); if (s == end) errx(EX_DATAERR, "illegal forwarding port ``%s''", s); port_number = (u_short)i; } if (_substrcmp(*av, "tablearg") == 0) { family = PF_INET; ((struct sockaddr_in*)&result)->sin_addr.s_addr = INADDR_ANY; } else { /* * Resolve the host name or address to a family and a * network representation of the address. */ if (getaddrinfo(*av, NULL, NULL, &res)) errx(EX_DATAERR, NULL); /* Just use the first host in the answer. */ family = res->ai_family; memcpy(&result, res->ai_addr, res->ai_addrlen); freeaddrinfo(res); } if (family == PF_INET) { ipfw_insn_sa *p = (ipfw_insn_sa *)action; action->opcode = O_FORWARD_IP; action->len = F_INSN_SIZE(ipfw_insn_sa); /* * In the kernel we assume AF_INET and use only * sin_port and sin_addr. Remember to set sin_len as * the routing code seems to use it too. 
*/ p->sa.sin_len = sizeof(struct sockaddr_in); p->sa.sin_family = AF_INET; p->sa.sin_port = port_number; p->sa.sin_addr.s_addr = ((struct sockaddr_in *)&result)->sin_addr.s_addr; } else if (family == PF_INET6) { ipfw_insn_sa6 *p = (ipfw_insn_sa6 *)action; action->opcode = O_FORWARD_IP6; action->len = F_INSN_SIZE(ipfw_insn_sa6); p->sa.sin6_len = sizeof(struct sockaddr_in6); p->sa.sin6_family = AF_INET6; p->sa.sin6_port = port_number; p->sa.sin6_flowinfo = 0; p->sa.sin6_scope_id = 0; /* No table support for v6 yet. */ bcopy(&((struct sockaddr_in6*)&result)->sin6_addr, &p->sa.sin6_addr, sizeof(p->sa.sin6_addr)); } else { errx(EX_DATAERR, "Invalid address family in forward action"); } av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; av--; /* go back... */ break; case TOK_SETFIB: { int numfibs; size_t intsize = sizeof(int); action->opcode = O_SETFIB; NEED1("missing fib number"); if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TABLEARG; } else { action->arg1 = strtoul(*av, NULL, 10); if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) errx(EX_DATAERR, "fibs not suported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); } av++; break; } case TOK_REASS: action->opcode = O_REASS; break; case TOK_RETURN: fill_cmd(action, O_CALLRETURN, F_NOT, 0); break; default: errx(EX_DATAERR, "invalid action %s\n", av[-1]); } action = next_cmd(action); /* * [altq queuename] -- altq tag, optional * [log [logamount N]] -- log, optional * * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. 
*/ while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { av++; switch (i) { case TOK_LOG: { ipfw_insn_log *c = (ipfw_insn_log *)cmd; int l; if (have_log) errx(EX_DATAERR, "log cannot be specified more than once"); have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); cmd->opcode = O_LOG; if (av[0] && _substrcmp(*av, "logamount") == 0) { av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", &c->max_log, &len, NULL, 0) == -1) errx(1, "sysctlbyname(\"%s\")", "net.inet.ip.fw.verbose_limit"); } } break; #ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; NEED1("missing altq queue name"); if (have_altq) errx(EX_DATAERR, "altq cannot be specified more than once"); have_altq = (ipfw_insn *)a; cmd->len = F_INSN_SIZE(ipfw_insn_altq); cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); av++; } break; #endif case TOK_TAG: case TOK_UNTAG: { uint16_t tag; if (have_tag) errx(EX_USAGE, "tag and untag cannot be " "specified more than once"); GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 
0: F_NOT, tag); av++; break; } default: abort(); } cmd = next_cmd(cmd); } if (have_state) /* must be a check-state, we are done */ goto done; #define OR_START(target) \ if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ av++; \ } else \ (*av)++; \ } \ target: \ #define CLOSE_PAR \ if (open_par) { \ if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ av++; \ } #define OR_BLOCK(target) \ if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ av++; \ goto target; \ } \ CLOSE_PAR; first_cmd = cmd; #if 0 /* * MAC addresses, optional. * If we have this, we skip the part "proto from src to dst" * and jump straight to the option parsing. 
*/ NOT_BLOCK; NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { av++; /* the "MAC" keyword */ add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); av++; /* any or mac-type */ goto read_options; } #endif /* * protocol, mandatory */ OR_START(get_proto); NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd); } } else if (first_cmd != cmd) { errx(EX_DATAERR, "invalid protocol ``%s''", *av); } else goto read_options; OR_BLOCK(get_proto); /* * "from", mandatory */ if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); av++; /* * source IP, mandatory */ OR_START(source_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); } } else errx(EX_USAGE, "bad source address %s", *av); OR_BLOCK(source_ip); /* * source ports, optional */ NOT_BLOCK; /* optional "not" */ if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } /* * "to", mandatory */ if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); av++; /* * destination, mandatory */ OR_START(dest_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd); } } else errx( EX_USAGE, "bad destination address %s", *av); OR_BLOCK(dest_ip); /* * dest. 
ports, optional */ NOT_BLOCK; /* optional "not" */ if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd); } } read_options: if (av[0] && first_cmd == cmd) { /* * nothing specified so far, store in the rule to ease * printout later. */ rule->_pad = 1; } prev = NULL; while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ s = *av; cmd32 = (ipfw_insn_u32 *)cmd; if (*s == '!') { /* alternate syntax for NOT */ if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; s++; } i = match_token(rule_options, s); av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; break; case TOK_OR: if (open_par == 0 || prev == NULL) errx(EX_USAGE, "invalid \"or\" block\n"); prev->len |= F_OR; break; case TOK_STARTBRACE: if (open_par) errx(EX_USAGE, "+nested \"(\" not allowed\n"); open_par = 1; break; case TOK_ENDBRACE: if (!open_par) errx(EX_USAGE, "+missing \")\"\n"); open_par = 0; prev = NULL; break; case TOK_IN: fill_cmd(cmd, O_IN, 0, 0); break; case TOK_OUT: cmd->len ^= F_NOT; /* toggle F_NOT */ fill_cmd(cmd, O_IN, 0, 0); break; case TOK_DIVERTED: fill_cmd(cmd, O_DIVERTED, 0, 3); break; case TOK_DIVERTEDLOOPBACK: fill_cmd(cmd, O_DIVERTED, 0, 1); break; case TOK_DIVERTEDOUTPUT: fill_cmd(cmd, O_DIVERTED, 0, 2); break; case TOK_FRAG: fill_cmd(cmd, O_FRAG, 0, 0); break; case TOK_LAYER2: fill_cmd(cmd, O_LAYER2, 0, 0); break; case TOK_XMIT: case TOK_RECV: case TOK_VIA: NEED1("recv, xmit, via require interface name" " or address"); fill_iface((ipfw_insn_if *)cmd, av[0]); av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) cmd->opcode = O_XMIT; else if (i == TOK_RECV) cmd->opcode = O_RECV; else if (i == TOK_VIA) cmd->opcode = O_VIA; break; case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); av++; break; case 
TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av); av++; break; case TOK_IPTTL: NEED1("ipttl requires TTL"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPTTL)) errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPID: NEED1("ipid requires id"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPID)) errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPLEN: NEED1("iplen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPLEN)) errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags(cmd, O_IPOPT, f_ipopts, *av); av++; break; case TOK_IPTOS: NEED1("missing argument for iptos"); fill_flags(cmd, O_IPTOS, f_iptos, *av); av++; break; case TOK_UID: NEED1("uid requires argument"); { char *end; uid_t uid; struct passwd *pwd; cmd->opcode = O_UID; uid = strtoul(*av, &end, 0); pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av); if (pwd == NULL) errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_GID: NEED1("gid requires argument"); { char *end; gid_t gid; struct group *grp; cmd->opcode = O_GID; gid = strtoul(*av, &end, 0); grp = (*end == '\0') ? 
getgrgid(gid) : getgrnam(*av); if (grp == NULL) errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_JAIL: NEED1("jail requires argument"); { char *end; int jid; cmd->opcode = O_JAIL; jid = (int)strtol(*av, &end, 0); if (jid < 0 || *end != '\0') errx(EX_DATAERR, "jail requires prison ID"); cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_ESTAB: fill_cmd(cmd, O_ESTAB, 0, 0); break; case TOK_SETUP: fill_cmd(cmd, O_TCPFLAGS, 0, (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); break; case TOK_TCPDATALEN: NEED1("tcpdatalen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TCPDATALEN)) errx(EX_DATAERR, "invalid tcpdata len %s", *av); } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags(cmd, O_TCPOPTS, f_tcpopts, *av); av++; break; case TOK_TCPSEQ: case TOK_TCPACK: NEED1("tcpseq/tcpack requires argument"); cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? 
O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); av++; break; case TOK_TCPWIN: NEED1("tcpwin requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TCPWIN)) errx(EX_DATAERR, "invalid tcpwin len %s", *av); } else fill_cmd(cmd, O_TCPWIN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags(cmd, O_TCPFLAGS, f_tcpflags, *av); av++; break; case TOK_KEEPSTATE: if (open_par) errx(EX_USAGE, "keep-state cannot be part " "of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state " "and limit is allowed"); have_state = cmd; fill_cmd(cmd, O_KEEP_STATE, 0, 0); break; case TOK_LIMIT: { ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; int val; if (open_par) errx(EX_USAGE, "limit cannot be part of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state and " "limit is allowed"); have_state = cmd; cmd->len = F_INSN_SIZE(ipfw_insn_limit); cmd->opcode = O_LIMIT; c->limit_mask = c->conn_limit = 0; while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; av++; } if (c->limit_mask == 0) errx(EX_USAGE, "limit: missing limit mask"); GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); av++; break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); break; case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av)) { av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av)) { av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av)) { av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av)) { av++; } break; case TOK_SRCPORT: NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT)) { av++; } else errx(EX_DATAERR, "invalid source port 
%s", *av); break; case TOK_DSTPORT: NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT)) { av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: if (add_mac(cmd, av)) av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); if (!add_mactype(cmd, *av)) errx(EX_DATAERR, "invalid mac type %s", *av); av++; break; case TOK_VERREVPATH: fill_cmd(cmd, O_VERREVPATH, 0, 0); break; case TOK_VERSRCREACH: fill_cmd(cmd, O_VERSRCREACH, 0, 0); break; case TOK_ANTISPOOF: fill_cmd(cmd, O_ANTISPOOF, 0, 0); break; case TOK_IPSEC: fill_cmd(cmd, O_IPSEC, 0, 0); break; case TOK_IPV6: fill_cmd(cmd, O_IP6, 0, 0); break; case TOK_IPV4: fill_cmd(cmd, O_IP4, 0, 0); break; case TOK_EXT6HDR: fill_ext6hdr( cmd, *av ); av++; break; case TOK_FLOWID: if (proto != IPPROTO_IPV6 ) errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av ); av++; break; case TOK_COMMENT: fill_comment(cmd, av); av[0]=NULL; break; case TOK_TAGGED: if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); } else { uint16_t tag; GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_SOCKARG: fill_cmd(cmd, O_SOCKARG, 0, 0); break; case TOK_LOOKUP: { ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; char *p; int j; if (!av[0] || !av[1]) errx(EX_USAGE, "format: lookup argument tablenum"); cmd->opcode = O_IP_DST_LOOKUP; cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; i = match_token(rule_options, *av); for (j = 0; lookup_key[j] >= 0 ; j++) { if (i == lookup_key[j]) break; } if (lookup_key[j] <= 0) errx(EX_USAGE, "format: cannot lookup on %s", *av); __PAST_END(c->d, 1) = j; // i converted to option av++; cmd->arg1 = strtoul(*av, &p, 0); if (p 
&& *p) errx(EX_USAGE, "format: lookup argument tablenum"); av++; } break; default: errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); } if (F_LEN(cmd) > 0) { /* prepare to advance */ prev = cmd; cmd = next_cmd(cmd); } } done: /* * Now copy stuff into the rule. * If we have a keep-state option, the first instruction * must be a PROBE_STATE (which is generated here). * If we have a LOG option, it was stored as the first command, * and now must be moved to the top of the action part. */ dst = (ipfw_insn *)rule->cmd; /* * First thing to write into the command stream is the match probability. */ if (match_prob != 1) { /* 1 means always match */ dst->opcode = O_PROB; dst->len = 2; *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); dst += dst->len; } /* * generate O_PROBE_STATE if necessary */ if (have_state && have_state->opcode != O_CHECK_STATE) { fill_cmd(dst, O_PROBE_STATE, 0, 0); dst = next_cmd(dst); } /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */ for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { i = F_LEN(src); switch (src->opcode) { case O_LOG: case O_KEEP_STATE: case O_LIMIT: case O_ALTQ: case O_TAG: break; default: bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } } /* * put back the have_state command as last opcode */ if (have_state && have_state->opcode != O_CHECK_STATE) { i = F_LEN(have_state); bcopy(have_state, dst, i * sizeof(uint32_t)); dst += i; } /* * start action section */ rule->act_ofs = dst - rule->cmd; /* put back O_LOG, O_ALTQ, O_TAG if necessary */ if (have_log) { i = F_LEN(have_log); bcopy(have_log, dst, i * sizeof(uint32_t)); dst += i; } if (have_altq) { i = F_LEN(have_altq); bcopy(have_altq, dst, i * sizeof(uint32_t)); dst += i; } if (have_tag) { i = F_LEN(have_tag); bcopy(have_tag, dst, i * sizeof(uint32_t)); dst += i; } /* * copy all other actions */ for (src = (ipfw_insn *)actbuf; src != action; src += i) { i = F_LEN(src); bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } rule->cmd_len = 
(uint32_t *)dst - (uint32_t *)(rule->cmd); i = (char *)dst - (char *)rule; if (do_cmd(IP_FW_ADD, rule, (uintptr_t)&i) == -1) err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_ADD"); if (!co.do_quiet) show_ipfw(rule, 0, 0); } /* * clear the counters or the log counters. */ void ipfw_zero(int ac, char *av[], int optname /* 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */) { uint32_t arg, saved_arg; int failed = EX_OK; char const *errstr; char const *name = optname ? "RESETLOG" : "ZERO"; optname = optname ? IP_FW_RESETLOG : IP_FW_ZERO; av++; ac--; if (!ac) { /* clear all entries */ if (do_cmd(optname, NULL, 0) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_%s)", name); if (!co.do_quiet) printf("%s.\n", optname == IP_FW_ZERO ? "Accounting cleared":"Logging counts reset"); return; } while (ac) { /* Rule number */ if (isdigit(**av)) { arg = strtonum(*av, 0, 0xffff, &errstr); if (errstr) errx(EX_DATAERR, "invalid rule number %s\n", *av); saved_arg = arg; if (co.use_set) arg |= (1 << 24) | ((co.use_set - 1) << 16); av++; ac--; if (do_cmd(optname, &arg, sizeof(arg))) { warn("rule %u: setsockopt(IP_FW_%s)", saved_arg, name); failed = EX_UNAVAILABLE; } else if (!co.do_quiet) printf("Entry %d %s.\n", saved_arg, optname == IP_FW_ZERO ? "cleared" : "logging count reset"); } else { errx(EX_USAGE, "invalid rule number ``%s''", *av); } } if (failed != EX_OK) exit(failed); } void ipfw_flush(int force) { int cmd = co.do_pipe ? IP_DUMMYNET_FLUSH : IP_FW_FLUSH; if (!force && !co.do_quiet) { /* need to ask user */ int c; printf("Are you sure? 
[yn] "); fflush(stdout); do { c = toupper(getc(stdin)); while (c != '\n' && getc(stdin) != '\n') if (feof(stdin)) return; /* and do not flush */ } while (c != 'Y' && c != 'N'); printf("\n"); if (c == 'N') /* user said no */ return; } if (co.do_pipe) { dummynet_flush(); return; } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ if (co.use_set) { uint32_t arg = ((co.use_set - 1) & 0xffff) | (1 << 24); if (do_cmd(IP_FW_DEL, &arg, sizeof(arg)) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_DEL)"); } else if (do_cmd(cmd, NULL, 0) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_%s_FLUSH)", co.do_pipe ? "DUMMYNET" : "FW"); if (!co.do_quiet) printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules"); } static void table_list(uint16_t num, int need_header); /* * This one handles all table-related commands * ipfw table N add addr[/masklen] [value] * ipfw table N delete addr[/masklen] * ipfw table {N | all} flush * ipfw table {N | all} list */ void ipfw_table_handler(int ac, char *av[]) { ipfw_table_xentry xent; int do_add; int is_all; size_t len; char *p; uint32_t a, type, mask, addrlen; uint32_t tables_max; mask = 0; // XXX uninitialized ? len = sizeof(tables_max); if (sysctlbyname("net.inet.ip.fw.tables_max", &tables_max, &len, NULL, 0) == -1) errx(1, "Can't determine maximum number of ipfw tables. 
" "Perhaps you forgot to load ipfw module?"); memset(&xent, 0, sizeof(xent)); ac--; av++; if (ac && isdigit(**av)) { xent.tbl = atoi(*av); is_all = 0; ac--; av++; } else if (ac && _substrcmp(*av, "all") == 0) { xent.tbl = 0; is_all = 1; ac--; av++; } else errx(EX_USAGE, "table number or 'all' keyword required"); if (xent.tbl >= tables_max) errx(EX_USAGE, "The table number exceeds the maximum allowed " "value (%d)", tables_max - 1); NEED1("table needs command"); if (is_all && _substrcmp(*av, "list") != 0 && _substrcmp(*av, "flush") != 0) errx(EX_USAGE, "table number required"); if (_substrcmp(*av, "add") == 0 || _substrcmp(*av, "delete") == 0) { do_add = **av == 'a'; ac--; av++; if (!ac) errx(EX_USAGE, "address required"); /* * Let's try to guess type by agrument. * Possible types: * 1) IPv4[/mask] * 2) IPv6[/mask] * 3) interface name * 4) port ? */ type = 0; if (ishexnumber(*av[0])) { /* Remove / if exists */ if ((p = strchr(*av, '/')) != NULL) { *p = '\0'; mask = atoi(p + 1); } if (inet_pton(AF_INET, *av, &xent.k.addr6) == 1) { type = IPFW_TABLE_CIDR; if ((p != NULL) && (mask > 32)) errx(EX_DATAERR, "bad IPv4 mask width: %s", p + 1); xent.masklen = p ? mask : 32; addrlen = sizeof(struct in_addr); } else if (inet_pton(AF_INET6, *av, &xent.k.addr6) == 1) { type = IPFW_TABLE_CIDR; if ((p != NULL) && (mask > 128)) errx(EX_DATAERR, "bad IPv6 mask width: %s", p + 1); xent.masklen = p ? mask : 128; addrlen = sizeof(struct in6_addr); } } if ((type == 0) && (strchr(*av, '.') == NULL)) { /* Assume interface name. 
Copy significant data only */ mask = MIN(strlen(*av), IF_NAMESIZE - 1); memcpy(xent.k.iface, *av, mask); /* Set mask to exact match */ xent.masklen = 8 * IF_NAMESIZE; type = IPFW_TABLE_INTERFACE; addrlen = IF_NAMESIZE; } if (type == 0) { if (lookup_host(*av, (struct in_addr *)&xent.k.addr6) != 0) errx(EX_NOHOST, "hostname ``%s'' unknown", *av); xent.masklen = 32; type = IPFW_TABLE_CIDR; addrlen = sizeof(struct in_addr); } xent.type = type; xent.len = offsetof(ipfw_table_xentry, k) + addrlen; ac--; av++; if (do_add && ac) { unsigned int tval; /* isdigit is a bit of a hack here.. */ if (strchr(*av, (int)'.') == NULL && isdigit(**av)) { xent.value = strtoul(*av, NULL, 0); } else { if (lookup_host(*av, (struct in_addr *)&tval) == 0) { /* The value must be stored in host order * * so that the values < 65k can be distinguished */ xent.value = ntohl(tval); } else { errx(EX_NOHOST, "hostname ``%s'' unknown", *av); } } } else xent.value = 0; if (do_setcmd3(do_add ? IP_FW_TABLE_XADD : IP_FW_TABLE_XDEL, &xent, xent.len) < 0) { /* If running silent, don't bomb out on these errors. */ if (!(co.do_quiet && (errno == (do_add ? EEXIST : ESRCH)))) err(EX_OSERR, "setsockopt(IP_FW_TABLE_%s)", do_add ? "XADD" : "XDEL"); /* In silent mode, react to a failed add by deleting */ if (do_add) { do_setcmd3(IP_FW_TABLE_XDEL, &xent, xent.len); if (do_setcmd3(IP_FW_TABLE_XADD, &xent, xent.len) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_XADD)"); } } } else if (_substrcmp(*av, "flush") == 0) { a = is_all ? tables_max : (uint32_t)(xent.tbl + 1); do { if (do_cmd(IP_FW_TABLE_FLUSH, &xent.tbl, sizeof(xent.tbl)) < 0) err(EX_OSERR, "setsockopt(IP_FW_TABLE_FLUSH)"); } while (++xent.tbl < a); } else if (_substrcmp(*av, "list") == 0) { a = is_all ? 
tables_max : (uint32_t)(xent.tbl + 1); do { table_list(xent.tbl, is_all); } while (++xent.tbl < a); } else errx(EX_USAGE, "invalid table command %s", *av); } static void table_list(uint16_t num, int need_header) { ipfw_xtable *tbl; ipfw_table_xentry *xent; socklen_t l; uint32_t *a, sz, tval; char tbuf[128]; struct in6_addr *addr6; ip_fw3_opheader *op3; /* Prepend value with IP_FW3 header */ l = sizeof(ip_fw3_opheader) + sizeof(uint32_t); op3 = alloca(l); /* Zero reserved fields */ memset(op3, 0, sizeof(ip_fw3_opheader)); a = (uint32_t *)(op3 + 1); *a = num; op3->opcode = IP_FW_TABLE_XGETSIZE; if (do_cmd(IP_FW3, op3, (uintptr_t)&l) < 0) err(EX_OSERR, "getsockopt(IP_FW_TABLE_XGETSIZE)"); /* If a is zero we have nothing to do, the table is empty. */ if (*a == 0) return; l = *a; tbl = safe_calloc(1, l); tbl->opheader.opcode = IP_FW_TABLE_XLIST; tbl->tbl = num; if (do_cmd(IP_FW3, tbl, (uintptr_t)&l) < 0) err(EX_OSERR, "getsockopt(IP_FW_TABLE_XLIST)"); if (tbl->cnt && need_header) printf("---table(%d)---\n", tbl->tbl); sz = tbl->size - sizeof(ipfw_xtable); xent = &tbl->xent[0]; while (sz > 0) { switch (tbl->type) { case IPFW_TABLE_CIDR: /* IPv4 or IPv6 prefixes */ tval = xent->value; addr6 = &xent->k.addr6; if ((addr6->s6_addr32[0] == 0) && (addr6->s6_addr32[1] == 0) && (addr6->s6_addr32[2] == 0)) { /* IPv4 address */ inet_ntop(AF_INET, &addr6->s6_addr32[3], tbuf, sizeof(tbuf)); } else { /* IPv6 address */ inet_ntop(AF_INET6, addr6, tbuf, sizeof(tbuf)); } if (co.do_value_as_ip) { tval = htonl(tval); printf("%s/%u %s\n", tbuf, xent->masklen, inet_ntoa(*(struct in_addr *)&tval)); } else printf("%s/%u %u\n", tbuf, xent->masklen, tval); break; case IPFW_TABLE_INTERFACE: /* Interface names */ tval = xent->value; if (co.do_value_as_ip) { tval = htonl(tval); printf("%s %s\n", xent->k.iface, inet_ntoa(*(struct in_addr *)&tval)); } else printf("%s %u\n", xent->k.iface, tval); } if (sz < xent->len) break; sz -= xent->len; xent = (ipfw_table_xentry *)((char *)xent + xent->len); } 
free(tbl);
}
ipfw-user/ipfw/Makefile000644 000423 000000 00000002352 12007740305 015642 0ustar00luigiwheel000000 000000 
# Makefile to build the userland part of ipfw
# and have it communicate with the kernel part through a socket.
#
# For portability, this Makefile needs gmake
# (the 'Makefile.bsd' is the original one, and extremely simple
# XXX we could reverse the logic, Makefile is the original, Makefile.linux
# is the gmake version)
#

include ../Makefile.inc
VPATH = .:../extra

CFLAGS += -include ../extra/glue.h
CFLAGS += -O2 -Wall -Werror
OBJS = ipfw2.o dummynet.o main.o ipv6.o altq.o
LDFLAGS=
# Libraries are kept apart from LDFLAGS so that they can be placed after
# the objects on the link line; linkers resolve symbols left to right.
# -lutil provides expand_number and humanize_number
LDLIBS += -lutil
CFLAGS += $(INCDIRS)
# communicate through userspace
CFLAGS += -DUSERSPACE

ifeq ($(OSARCH),Linux)
# XXX untested
CFLAGS += -D__BSD_VISIBLE
CFLAGS += -DNEED_STRTONUM -DNEED_SYSCTLBYNAME
CFLAGS += -Wno-unused-but-set-variable
OBJS += expand_number.o humanize_number.o
endif

ifeq ($(OSARCH),Darwin)
CFLAGS += -D__BSD_VISIBLE
CFLAGS += -DNEED_STRTONUM
OBJS += expand_number.o humanize_number.o
endif

OBJS += glue.o

# clean and diff are commands, not files
.PHONY: clean diff

ipfw: $(OBJS)
	$(MSG) " LD $@"
	$(HIDE)$(CC) $(LDFLAGS) -o $@ $^ $(LDLIBS)

# Built in $(OBJDIR) by Makefile.kipfw.  '&&' makes sure the sub-make is
# not started in the wrong directory when the cd fails.
../$(OBJDIR)/include_e:
	mkdir -p ../$(OBJDIR)
	cd ../$(OBJDIR) && $(MAKE) -f ../Makefile.kipfw include_e

# include_e is order-only (after '|'): it must exist before compiling,
# but its timestamp must not force every object to be rebuilt.
$(OBJS) : ipfw2.h ../extra/glue.h | ../$(OBJDIR)/include_e

clean:
	$(RM) *.o ipfw

# compare with the FreeBSD tree in BSD_HEAD
diff:
	(diff -ubwr $(BSD_HEAD)/sbin/ipfw .)
ipfw-user/ipfw/ipfw2.h000644 000423 000000 00000015537 11725221076 015420 0ustar00luigiwheel000000 000000 
/*
 * Copyright (c) 2002-2003 Luigi Rizzo
 * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp
 * Copyright (c) 1994 Ugen J.S.Antsilevich
 *
 * Idea and grammar partially left from:
 * Copyright (c) 1993 Daniel Boulet
 *
 * Redistribution and use in source forms, with and without modification,
 * are permitted provided that this entire comment appears intact.
 *
 * Redistribution in binary form may occur without any restrictions.
 * Obviously, it would be nice if you gave credit where credit is due
 * but requiring it would be too onerous.
*
 * This software is provided ``AS IS'' without any warranties of any kind.
 *
 * NEW command line interface for IP firewall facility
 *
 * $FreeBSD: head/sbin/ipfw/ipfw2.h 223666 2011-06-29 10:06:58Z ae $
 */

/*
 * Options that can be set on the command line.
 * When reading commands from a file, a subset of the options can also
 * be applied globally by specifying them before the file name.
 * After that, each line can contain its own option that changes
 * the global value.
 * XXX The context is not restored after each line.
 */

struct cmdline_opts {
	/* boolean options: */
	int	do_value_as_ip;	/* show table value as IP */
	int	do_resolv;	/* try to resolve all ip to names */
	int	do_time;	/* Show time stamps */
	int	do_quiet;	/* Be quiet in add and flush */
	int	do_pipe;	/* this cmd refers to a pipe/queue/sched */
	int	do_nat; 	/* this cmd refers to a nat config */
	int	do_dynamic;	/* display dynamic rules */
	int	do_expired;	/* display expired dynamic rules */
	int	do_compact;	/* show rules in compact mode */
	int	do_force;	/* do not ask for confirmation */
	int	show_sets;	/* display the set each rule belongs to */
	int	test_only;	/* only check syntax */
	int	comment_only;	/* only print action and comment */
	int	verbose;	/* be verbose on some commands */

	/* The options below can have multiple values. */

	int	do_sort;	/* field to sort results (0 = no) */
		/* valid fields are 1 and above */

	int	use_set;	/* work with specified set number */
		/* 0 means all sets, otherwise apply to set use_set - 1 */

};

/* the single option block shared by the command handlers */
extern struct cmdline_opts co;

/*
 * _s_x is a structure that stores string <-> token pairs, used in
 * various places in the parser. Entries are stored in arrays,
 * with an entry with s=NULL as terminator.
 * The search routines are match_token() and match_value().
 * Often, an element with x=0 contains an error string.
 *
 */
struct _s_x {
	char const *s;	/* keyword text; NULL in the terminating entry */
	int x;		/* token value paired with s */
};

/*
 * Token values stored in the 'x' field of struct _s_x tables.
 * TOK_NULL is explicitly 0; the others are numbered consecutively in
 * declaration order.
 */
enum tokens {
	TOK_NULL=0,

	TOK_OR,
	TOK_NOT,
	TOK_STARTBRACE,
	TOK_ENDBRACE,

	TOK_ACCEPT,
	TOK_COUNT,
	TOK_PIPE,
	TOK_LINK,
	TOK_QUEUE,
	TOK_FLOWSET,
	TOK_SCHED,
	TOK_DIVERT,
	TOK_TEE,
	TOK_NETGRAPH,
	TOK_NGTEE,
	TOK_FORWARD,
	TOK_SKIPTO,
	TOK_DENY,
	TOK_REJECT,
	TOK_RESET,
	TOK_UNREACH,
	TOK_CHECKSTATE,
	TOK_NAT,
	TOK_REASS,
	TOK_CALL,
	TOK_RETURN,

	TOK_ALTQ,
	TOK_LOG,
	TOK_TAG,
	TOK_UNTAG,

	TOK_TAGGED,
	TOK_UID,
	TOK_GID,
	TOK_JAIL,
	TOK_IN,
	TOK_LIMIT,
	TOK_KEEPSTATE,
	TOK_LAYER2,
	TOK_OUT,
	TOK_DIVERTED,
	TOK_DIVERTEDLOOPBACK,
	TOK_DIVERTEDOUTPUT,
	TOK_XMIT,
	TOK_RECV,
	TOK_VIA,
	TOK_FRAG,
	TOK_IPOPTS,
	TOK_IPLEN,
	TOK_IPID,
	TOK_IPPRECEDENCE,
	TOK_DSCP,
	TOK_IPTOS,
	TOK_IPTTL,
	TOK_IPVER,
	TOK_ESTAB,
	TOK_SETUP,
	TOK_TCPDATALEN,
	TOK_TCPFLAGS,
	TOK_TCPOPTS,
	TOK_TCPSEQ,
	TOK_TCPACK,
	TOK_TCPWIN,
	TOK_ICMPTYPES,
	TOK_MAC,
	TOK_MACTYPE,
	TOK_VERREVPATH,
	TOK_VERSRCREACH,
	TOK_ANTISPOOF,
	TOK_IPSEC,
	TOK_COMMENT,

	TOK_PLR,
	TOK_NOERROR,
	TOK_BUCKETS,
	TOK_DSTIP,
	TOK_SRCIP,
	TOK_DSTPORT,
	TOK_SRCPORT,
	TOK_ALL,
	TOK_MASK,
	TOK_FLOW_MASK,
	TOK_SCHED_MASK,
	TOK_BW,
	TOK_DELAY,
	TOK_PROFILE,
	TOK_BURST,
	TOK_RED,
	TOK_GRED,
	TOK_DROPTAIL,
	TOK_PROTO,
	/* dummynet tokens */
	TOK_WEIGHT,
	TOK_LMAX,
	TOK_PRI,
	TOK_TYPE,
	TOK_SLOTSIZE,

	TOK_IP,
	TOK_IF,
	TOK_ALOG,
	TOK_DENY_INC,
	TOK_SAME_PORTS,
	TOK_UNREG_ONLY,
	TOK_SKIP_GLOBAL,
	TOK_RESET_ADDR,
	TOK_ALIAS_REV,
	TOK_PROXY_ONLY,
	TOK_REDIR_ADDR,
	TOK_REDIR_PORT,
	TOK_REDIR_PROTO,

	TOK_IPV6,
	TOK_FLOWID,
	TOK_ICMP6TYPES,
	TOK_EXT6HDR,
	TOK_DSTIP6,
	TOK_SRCIP6,

	TOK_IPV4,
	TOK_UNREACH6,
	TOK_RESET6,

	TOK_FIB,
	TOK_SETFIB,
	TOK_LOOKUP,
	TOK_SOCKARG,
};
/*
 * the following macro returns an error message if we run out of
 * arguments.
*/ #define NEED(_p, msg) {if (!_p) errx(EX_USAGE, msg);} #define NEED1(msg) {if (!(*av)) errx(EX_USAGE, msg);} int pr_u64(uint64_t *pd, int width); /* memory allocation support */ void *safe_calloc(size_t number, size_t size); void *safe_realloc(void *ptr, size_t size); /* string comparison functions used for historical compatibility */ int _substrcmp(const char *str1, const char* str2); int _substrcmp2(const char *str1, const char* str2, const char* str3); /* utility functions */ int match_token(struct _s_x *table, char *string); char const *match_value(struct _s_x *p, int value); int do_cmd(int optname, void *optval, uintptr_t optlen); struct in6_addr; void n2mask(struct in6_addr *mask, int n); int contigmask(uint8_t *p, int len); /* * Forward declarations to avoid include way too many headers. * C does not allow duplicated typedefs, so we use the base struct * that the typedef points to. * Should the typedefs use a different type, the compiler will * still detect the change when compiling the body of the * functions involved, so we do not lose error checking. */ struct _ipfw_insn; struct _ipfw_insn_altq; struct _ipfw_insn_u32; struct _ipfw_insn_ip6; struct _ipfw_insn_icmp6; /* * The reserved set numer. This is a constant in ip_fw.h * but we store it in a variable so other files do not depend * in that header just for one constant. 
*/
extern int resvd_set_number;

/* first-level command handlers */
void ipfw_add(char *av[]);
void ipfw_show_nat(int ac, char **av);
void ipfw_config_pipe(int ac, char **av);
void ipfw_config_nat(int ac, char **av);
void ipfw_sets_handler(char *av[]);
/* table add/delete/flush/list sub-commands */
void ipfw_table_handler(int ac, char *av[]);
void ipfw_sysctl_handler(char *av[], int which);
void ipfw_delete(char *av[]);
/* force != 0 skips the interactive confirmation */
void ipfw_flush(int force);
/* optname: 0 = IP_FW_ZERO, 1 = IP_FW_RESETLOG */
void ipfw_zero(int ac, char *av[], int optname);
void ipfw_list(int ac, char *av[], int show_counters);

/* altq.c */
void altq_set_enabled(int enabled);
u_int32_t altq_name_to_qid(const char *name);

void print_altq_cmd(struct _ipfw_insn_altq *altqptr);

/* dummynet.c */
void dummynet_list(int ac, char *av[], int show_counters);
void dummynet_flush(void);
int ipfw_delete_pipe(int pipe_or_queue, int n);

/* ipv6.c */
void print_unreach6_code(uint16_t code);
void print_ip6(struct _ipfw_insn_ip6 *cmd, char const *s);
void print_flow6id(struct _ipfw_insn_u32 *cmd);
void print_icmp6types(struct _ipfw_insn_u32 *cmd);
void print_ext6hdr(struct _ipfw_insn *cmd );

struct _ipfw_insn *add_srcip6(struct _ipfw_insn *cmd, char *av);
struct _ipfw_insn *add_dstip6(struct _ipfw_insn *cmd, char *av);

void fill_flow6(struct _ipfw_insn_u32 *cmd, char *av );
void fill_unreach6_code(u_short *codep, char *str);
void fill_icmp6types(struct _ipfw_insn_icmp6 *cmd, char *av);
int fill_ext6hdr(struct _ipfw_insn *cmd, char *av);
ipfw-user/extra/missing.h000644 000423 000000 00000052775 12007760447 016231 0ustar00luigiwheel000000 000000 
/*
 * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: missing.h 8377 2011-04-04 16:08:27Z marta $ * * Header for kernel variables and functions that are not available in * userland. */ #ifndef _MISSING_H_ #define _MISSING_H_ #define KLD_MODULE /* disable kernel dependencies */ /* defined as assert */ void panic(const char *fmt, ...); #define KASSERT(exp,msg) do { \ if (__predict_false(!(exp))) \ panic msg; \ } while (0) /* don't bother to optimize */ #ifndef __predict_false #define __predict_false(x) (x) /* __builtin_expect((exp), 0) */ #endif // XXX #ifdef _KERNEL #define NEED_KERNEL #undef _KERNEL #endif #include // printf #include // IFNAMSIZ ? #include // strncmp #ifdef NEED_KERNEL #define _KERNEL #include #include #define __user // not defined here ? 
#define __init #define __exit /* portability features, to be set before the rest: */ #define HAVE_NET_IPLEN /* iplen/ipoff in net format */ #define WITHOUT_BPF /* do not use bpf logging */ #define MALLOC_DECLARE(x) /* nothing */ // XXX kernel malloc/free extern void *kern_malloc(int); extern void kern_free(void *); #define malloc(_size, type, flags) kern_malloc(_size) #define free(_var, type) kern_free(_var) /* inet_ntoa_r() differs in userspace and kernel. * We load netinet/in.h so we get the kernel prototype ? * but we also need to put #defines in the two places where * it is used XXX fixme */ #include /* log() conflicts with the math function. * Revise, modifying the first argument. */ #define LOG_ERR 0x100 #define LOG_INFO 0x200 #ifndef LOG_SECURITY #define LOG_SECURITY 0x400 #endif #define log(_level, fmt, arg...) do { \ int __attribute__((unused)) _querty = _level; \ printf("kernel: " fmt, ##arg); } while (0) #endif /* _KERNEL */ /* * Kernel locking support. * FreeBSD uses mtx in dummynet.c and struct rwlock ip_fw2.c * * In linux we use spinlock_bh to implement both. * For 'struct rwlock' we need an #ifdef to change it to spinlock_t */ #ifndef DEFINE_SPINLOCK /* this is for linux 2.4 */ #if defined(__APPLE__) #define DEFINE_SPINLOCK(x) struct mtx x; #else /* linux ? */ #define DEFINE_SPINLOCK(x) spinlock_t x // = SPIN_LOCK_UNLOCKED #endif #endif /* 20111031 * redefine mutex in terms of threads. 
*/ #undef _KERNEL // #include #include #ifdef NEED_KERNEL #define _KERNEL #endif struct mtx { pthread_mutex_t p0; }; struct rwlock { pthread_mutex_t p0; }; #ifndef __FreeBSD__ struct rmlock { pthread_mutex_t p0; }; #endif extern pthread_mutex_t dummynet_mtx_p; extern pthread_mutex_t ipfw_dyn_mtx_p; extern pthread_mutex_t pfil_global_lock_p; #define mtx_assert(a, b) /* * the first argument to mtx_init is often a static variable, * so use (void)m to prevent a compiler warning */ #define mtx_init(m, a,b,c) do { \ (void)m; pthread_mutex_init(&((m)->p0), NULL); } while (0) #define MTX_SYSINIT(a, m, c, d) // pthread_mutex_init(m##_p, NULL) #define mtx_lock(m) pthread_mutex_lock(m.p0) #define mtx_unlock(m) pthread_mutex_unlock(m.p0) #define mtx_destroy(m) pthread_mutex_destroy(m.p0) #if 1 //------------------ #define rw_assert(a, b) #define rw_destroy(_l) #define rw_init(_l, msg) // XXX mtx_init((_l), 0, 0, 0) #define rw_rlock(_l) mtx_lock(_l) #define rw_runlock(_l) mtx_unlock(_l) #define rw_wlock(_l) mtx_lock(_l) #define rw_wunlock(_l) mtx_unlock(_l) #define rw_init_flags(_l, s, v) #endif // locking on linux ? /* end of locking support */ /* * Reference to an ipfw rule that can be carried outside critical sections. * A rule is identified by rulenum:rule_id which is ordered. * In version chain_id the rule can be found in slot 'slot', so * we don't need a lookup if chain_id == chain->id. * * On exit from the firewall this structure refers to the rule after * the matching one (slot points to the new rule; rulenum:rule_id-1 * is the matching rule), and additional info (e.g. info often contains * the insn argument or tablearg in the low 16 bits, in host format). * On entry, the structure is valid if slot>0, and refers to the starting * rules. 'info' contains the reason for reinject, e.g. divert port, * divert direction, and so on. 
*/ struct ipfw_rule_ref { uint32_t slot; /* slot for matching rule */ uint32_t rulenum; /* matching rule number */ uint32_t rule_id; /* matching rule id */ uint32_t chain_id; /* ruleset id */ uint32_t info; /* see below */ }; enum { IPFW_INFO_MASK = 0x0000ffff, IPFW_INFO_OUT = 0x00000000, /* outgoing, just for convenience */ IPFW_INFO_IN = 0x80000000, /* incoming, overloads dir */ IPFW_ONEPASS = 0x40000000, /* One-pass, do not reinject */ IPFW_IS_MASK = 0x30000000, /* which source ? */ IPFW_IS_DIVERT = 0x20000000, IPFW_IS_DUMMYNET =0x10000000, IPFW_IS_PIPE = 0x08000000, /* pipe=1, queue = 0 */ }; /* in netinet/in.h */ #define in_nullhost(x) ((x).s_addr == INADDR_ANY) /* * Historically, BSD keeps ip_len and ip_off in host format * when doing layer 3 processing, and this often requires * to translate the format back and forth. * To make the process explicit, we define a couple of macros * that also take into account the fact that at some point * we may want to keep those fields always in net format. 
*/ #if (BYTE_ORDER == BIG_ENDIAN) || defined(HAVE_NET_IPLEN) #define SET_NET_IPLEN(p) do {} while (0) #define SET_HOST_IPLEN(p) do {} while (0) #else /* never on linux */ #define SET_NET_IPLEN(p) do { \ struct ip *h_ip = (p); \ h_ip->ip_len = htons(h_ip->ip_len); \ h_ip->ip_off = htons(h_ip->ip_off); \ } while (0) #define SET_HOST_IPLEN(p) do { \ struct ip *h_ip = (p); \ h_ip->ip_len = ntohs(h_ip->ip_len); \ h_ip->ip_off = ntohs(h_ip->ip_off); \ } while (0) #endif /* !HAVE_NET_IPLEN */ /* ip_dummynet.c */ #ifndef __FreeBSD_version #define __FreeBSD_version 500035 #endif /* define some macro for ip_dummynet */ struct malloc_type { }; #define MALLOC_DEFINE(type, shortdesc, longdesc) \ struct malloc_type type[1]; void *md_dummy_ ## type = type #define CTASSERT(x) /* * gettimeofday would be in sys/time.h but it is not * visible if _KERNEL is defined */ //int gettimeofday(struct timeval *, struct timezone *); extern int hz; extern long tick; /* exists in 2.4 but not in 2.6 */ extern int bootverbose; extern struct timeval boottime; /* time_uptime is a FreeBSD variable increased each second */ extern time_t time_uptime; extern int max_linkhdr; extern int ip_defttl; extern u_long in_ifaddrhmask; /* mask for hash table */ extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ /*-------------------------------------------------*/ /* define, includes and functions missing in linux */ /* include and define */ #include /* inet_ntoa */ struct mbuf; #define M_MCAST 0x04 /* send/received as link-level multicast */ /* used by ip_dummynet.c */ void reinject_drop(struct mbuf* m); #include /* for ETHERTYPE_IP */ #ifdef _KERNEL #include /* IFNAMESIZ */ #endif void rn_init(int); /* * some network structure can be defined in the bsd way * by using the _FAVOR_BSD definition. This is not true * for icmp structure. 
* XXX struct icmp contains bsd names in * /usr/include/netinet/ip_icmp.h */ /* missing definition */ #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_ACK 0x10 #define RTF_CLONING 0x100 /* generate new routes on use */ #define IPPROTO_OSPFIGP 89 /* OSPFIGP */ #define IPPROTO_CARP 112 /* CARP */ #define CARP_VERSION 2 #define CARP_ADVERTISEMENT 0x01 #define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define NETISR_IP 2 /* same as AF_INET */ #define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ extern int securelevel; #define if_xname name #define if_snd XXX // XXX we could use this to point to the incoming peer struct ifnet { char if_xname[IFNAMSIZ]; /* external name (name + unit) */ }; struct ifaltq { void *ifq_head; }; int ffs(int); // XXX where int fls(int); // XXX where struct ip; /* machine/in_cksum.h */ int in_cksum(struct mbuf *m, int len); #ifndef __FreeBSD__ u_short in_cksum_hdr(struct ip *); #endif /* * ifnet->if_snd is used in ip_dummynet.c to take the transmission * clock. 
*/ #if defined( __linux__) #define if_xname name #define if_snd XXX struct route_in6 { }; #elif defined( _WIN32 ) /* used in ip_dummynet.c */ struct ifnet { char if_xname[IFNAMSIZ]; /* external name (name + unit) */ // struct ifaltq if_snd; /* output queue (includes altq) */ }; struct net_device { char if_xname[IFNAMSIZ]; /* external name (name + unit) */ }; #elif defined(__APPLE__) typedef u_int32_t tcp_cc; #ifndef s6_addr32 // XXX #define s6_addr32 __u6_addr.__u6_addr32 #endif #include struct route_in6 { }; struct icmphdr { u_char icmp_type; /* type of message, see below */ u_char icmp_code; /* type sub code */ u_short icmp_cksum; /* ones complement cksum of struct */ }; #define IPPROTO_SCTP 132 /* SCTP */ /* defined in linux/sctp.h with no bsd definition */ struct sctphdr { uint16_t src_port; /* source port */ uint16_t dest_port; /* destination port */ uint32_t v_tag; /* verification tag of packet */ uint32_t checksum; /* Adler32 C-Sum */ /* chunks follow... */ }; struct carp_header { #if BYTE_ORDER == LITTLE_ENDIAN u_int8_t carp_type:4, carp_version:4; #endif #if BYTE_ORDER == BIG_ENDIAN u_int8_t carp_version:4, carp_type:4; #endif }; struct pim { int dummy; /* windows compiler does not like empty definition */ }; #endif /* involves mbufs */ //int in_cksum(struct mbuf *m, int len); #define divert_cookie(mtag) 0 #define divert_info(mtag) 0 #define pf_find_mtag(a) NULL #define pf_get_mtag(a) NULL #if !defined(_WIN32) && !defined(AF_LINK) #define AF_LINK AF_ASH /* ? our sys/socket.h */ #endif /* search local the ip addresses, used for the "me" keyword */ #define INADDR_TO_IFP(ip, b) b = NULL /* we don't pullup, either success or free and fail */ #define m_pullup(m, x) \ ((m)->m_len >= x ? 
(m) : (FREE_PKT(m), NULL)) struct pf_mtag { void *hdr; /* saved hdr pos in mbuf, for ECN */ sa_family_t af; /* for ECN */ u_int32_t qid; /* queue id */ }; /* missing kernel functions */ char *inet_ntoa(struct in_addr ina); long random(void); /* * Return the risult of a/b * * this is used in linux kernel space, * since the 64bit division needs to * be done using a macro */ //int64_t div64(int64_t a, int64_t b); /* from bsd sys/queue.h */ #define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = TAILQ_FIRST((head)); \ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define SLIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = SLIST_FIRST((head)); \ (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ (var) = (tvar)) /*-------------------------------------------------*/ #define RT_NUMFIBS 1 extern u_int rt_numfibs; /* involves kernel locking function */ #ifdef RTFREE #undef RTFREE #define RTFREE(a) fprintf(stderr, "RTFREE: commented out locks\n"); #endif void getmicrouptime(struct timeval *tv); /* from sys/netinet/ip_output.c */ struct ip_moptions; struct route; struct ip; struct inpcb; struct mbuf *ip_reass(struct mbuf *); int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp); /* from net/netisr.c */ int netisr_dispatch(u_int proto, struct mbuf *m); /* definition moved in missing.c */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); /* defined in session.c */ int priv_check(struct thread *td, int priv); /* struct ucred is in linux/socket.h and has pid, uid, gid. 
* We need a 'bsd_ucred' to store also the extra info */ struct bsd_ucred { uid_t uid; gid_t gid; uint32_t xid; uint32_t nid; }; #ifdef _KERNEL #if 0 // XXX int cred_check(void *insn, int proto, struct ifnet *oif, struct in_addr dst_ip, u_int16_t dst_port, struct in_addr src_ip, u_int16_t src_port, struct bsd_ucred *u, int *ugid_lookupp, struct sk_buff *skb); #endif struct ucred; int securelevel_ge(struct ucred *cr, int level); /* * stripped down version of the sysctl api */ struct sysctl_oid; struct sysctl_req { void *oldptr; /* store here the original value */ int oldlen; void *newptr; /* NULL on reads */ int newlen; }; #ifdef _WIN32 #define module_param_named(_name, _var, _ty, _perm) #else /* !_WIN32 */ #endif /* !_WIN32 so maybe __linux__ */ #if 0 // XXX disable sysctl defined (__linux__) && !defined (EMULATE_SYSCTL) #define SYSCTL_DECL(_1) #define SYSCTL_OID(_1, _2, _3, _4, _5, _6, _7, _8) #define SYSCTL_NODE(_1, _2, _3, _4, _5, _6) #define _SYSCTL_BASE(_name, _var, _ty, _perm) \ module_param_named(_name, *(_var), _ty, \ ( (_perm) == CTLFLAG_RD) ? 
0444: 0644 ) #define SYSCTL_PROC(_base, _oid, _name, _mode, _var, _val, _desc, _a, _b) #define SYSCTL_INT(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, int, _mode) #define SYSCTL_LONG(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, long, _mode) #define SYSCTL_ULONG(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, ulong, _mode) #define SYSCTL_UINT(_base, _oid, _name, _mode, _var, _val, _desc) \ _SYSCTL_BASE(_name, _var, uint, _mode) #define TUNABLE_INT(_name, _ptr) #define SYSCTL_VNET_PROC SYSCTL_PROC #define SYSCTL_VNET_INT SYSCTL_INT #define SYSCTL_VNET_UINT SYSCTL_UINT #endif #define SYSCTL_HANDLER_ARGS \ struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req int sysctl_handle_int(SYSCTL_HANDLER_ARGS); int sysctl_handle_long(SYSCTL_HANDLER_ARGS); #ifdef EMULATE_SYSCTL /* mandatory here */ #define STRINGIFY(x) #x #ifdef SYSCTL_NODE #undef SYSCTL_NODE #endif #define SYSCTL_NODE(a,b,c,d,e,f) int a; (void)a #define SYSCTL_DECL(a) #define SYSCTL_PROC(a,b,c,d,e,f,g,h,i) (void)g #define GST_HARD_LIMIT 100 /* In the module, GST is implemented as an array of * sysctlentry, but while passing data to the userland * pointers are useless, the buffer is actually made of: * - sysctlhead (fixed size, containing lengths) * - data (typically 32 bit) * - name (zero-terminated and padded to mod4) */ struct sysctlentry { struct sysctlhead head; char* name; void* data; }; struct sysctltable { int count; //number of valid tables int totalsize; //total size of valid entries of al the valid tables void* namebuffer; //a buffer for all chained names struct sysctlentry entry[GST_HARD_LIMIT]; }; #ifdef SYSBEGIN #undef SYSBEGIN #endif #define SYSBEGIN(x) void sysctl_addgroup_##x() { #ifdef SYSEND #undef SYSEND #endif #define SYSEND } /* XXX remove duplication */ #define SYSCTL_INT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." 
STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) #define SYSCTL_VNET_INT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_INT << 2), sizeof(*e), e) #define SYSCTL_UINT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) #define SYSCTL_VNET_UINT(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_UINT << 2), sizeof(*e), e) #define SYSCTL_LONG(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_LONG << 2), sizeof(*e), e) #define SYSCTL_ULONG(a,b,c,d,e,f,g) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d) | (SYSCTLTYPE_ULONG << 2), sizeof(*e), e) #define TUNABLE_INT(a,b) /* * SYSCTL_VNET_PROC: We should call the function (g) * arguments are SYSCTL_HANDLER_ARGS */ #define SYSCTL_VNET_PROC(a,b,c,d,e,f,g,h,i) \ sysctl_pushback(STRINGIFY(a) "." STRINGIFY(c) + 1, \ (d), 4 /* XXX large */, g) void keinit_GST(void); void keexit_GST(void); int kesysctl_emu_set(void* p, int l); int kesysctl_emu_get(struct sockopt* sopt); void sysctl_pushback(char* name, int flags, int datalen, void* data); #endif /* EMULATE_SYSCTL */ struct ifnet; void ether_demux(struct ifnet *ifp, struct mbuf *m); int ether_output_frame(struct ifnet *ifp, struct mbuf *m); void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum); void icmp_error(struct mbuf *n, int type, int code, uint32_t dest, int mtu); #ifndef __FreeBSD__ struct rtentry; #endif void rtfree(struct rtentry *rt); u_short in_cksum_skip(struct mbuf *m, int len, int skip); #ifdef INP_LOCK_ASSERT #undef INP_LOCK_ASSERT #define INP_LOCK_ASSERT(a) #endif int jailed(struct ucred *cred); /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). If subnetsarelocal * is true, this includes other subnets of the local net. * Otherwise, it includes only the directly-connected (sub)nets. 
*/ int in_localaddr(struct in_addr in); int fnmatch(const char *pattern, const char *string, int flags); /* vnet wrappers, in vnet.h and ip_var.h */ //int ipfw_init(void); //void ipfw_destroy(void); #define MTAG_IPFW 1148380143 /* IPFW-tagged cookie */ #define MTAG_IPFW_RULE 1262273568 /* rule reference */ #define MTAG_IPFW_CALL 1308397630 /* call stack */ #ifdef __APPLE__ #define offsetof(type, field) __builtin_offsetof(type, field) #endif struct ip_fw_args; extern int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); #if 1 /* include vnet.h */ #define curvnet NULL #define CURVNET_SET(_v) #define CURVNET_RESTORE() #define VNET_ASSERT(condition) #define VNET_NAME(n) n #define VNET_DECLARE(t, n) extern t n #define VNET_DEFINE(t, n) t n #define _VNET_PTR(b, n) &VNET_NAME(n) /* * Virtualized global variable accessor macros. */ #define VNET_VNET_PTR(vnet, n) (&(n)) #define VNET_VNET(vnet, n) (n) #define VNET_PTR(n) (&(n)) #define VNET(n) (n) #endif VNET_DECLARE(int, ip_defttl); #define V_ip_defttl VNET(ip_defttl); int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); /* hooks for divert */ extern void (*ip_divert_ptr)(struct mbuf *m, int incoming); extern int (*ip_dn_ctl_ptr)(struct sockopt *); typedef int ip_fw_ctl_t(struct sockopt *); extern ip_fw_ctl_t *ip_fw_ctl_ptr; /* netgraph prototypes */ typedef int ng_ipfw_input_t(struct mbuf **, int, struct ip_fw_args *, int); extern ng_ipfw_input_t *ng_ipfw_input_p; /* For kernel ipfw_ether and ipfw_bridge. 
*/ struct ip_fw_args; typedef int ip_fw_chk_t(struct ip_fw_args *args); extern ip_fw_chk_t *ip_fw_chk_ptr; #define V_ip_fw_chk_ptr VNET(ip_fw_chk_ptr) #define V_ip_fw_ctl_ptr VNET(ip_fw_ctl_ptr) #define V_tcbinfo VNET(tcbinfo) #define V_udbinfo VNET(udbinfo) #endif /* _KERNEL */ // sys/eventhandler.h #define EVENTHANDLER_DECLARE(a, b) /* application specific */ struct sess; typedef int (handler_t)(struct sess *sess, void *arg); /* * flags to control the callback * WANT_READ select on read * WANT_WRITE select on write * WANT_RUN run unconditionally * WANT_DELETE session is exiting */ enum flags_t { WANT_READ=1, WANT_WRITE=2, WANT_RUN=4, WANT_DELETE=0x8000 }; struct sess { struct sess *next; int fd; handler_t *func; void *arg; enum flags_t flags; void *private; /* pointer managed by the session code */ }; struct sess * new_session(int fd, handler_t *func, void *arg, enum flags_t flags); void netmap_add_port(const char *dev); #endif /* !_MISSING_H_ */ ipfw-user/extra/sys/000755 000423 000000 00000000000 12006744005 015174 5ustar00luigiwheel000000 000000 ipfw-user/extra/glue.h000644 000423 000000 00000027075 12007761111 015474 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * $Id: glue.h 8327 2011-03-22 17:01:35Z marta $
 *
 * glue code to adapt the FreeBSD version to linux and windows,
 * userland and kernel.
 * This is included before any other headers, so we do not have
 * a chance to override any #define that should appear in other
 * headers.
 * First handle headers for userland and kernel. Then common code
 * (including headers that require a specific order of inclusion),
 * then the user- and kernel- specific parts.
 */

#ifndef _GLUE_H
#define _GLUE_H

/*
 * common definitions to allow portability
 */
#ifndef __FBSDID
#define __FBSDID(x)
#endif  /* FBSDID */

#include	/* linux needs it in addition to sys/types.h */
#include	/* for size_t */

#ifdef _KERNEL
/* prevent a warning */
#undef _KERNEL
#include
#include
#define _KERNEL
#else
#include
#endif

#include
#include
#ifndef USERSPACE
#include
#endif

/* ipfw2.c - from timeconv.h */
/*
 * Convert a long to time_t. When long is 32 bits wide the value is
 * first narrowed through a signed 32-bit type; otherwise it is cast
 * directly.
 */
static __inline time_t
_long_to_time(long tlong)
{
	if (sizeof(long) == sizeof(__int32_t))
		return((time_t)(__int32_t)(tlong));
	return((time_t)tlong);
}

/* NOTE: function-like macro, both arguments may be evaluated twice */
#define min(a, b) ((a) < (b) ? (a) : (b) )	// radix.c
/*
 * debugging macros from ip_dn_private.h
 */
#include
#include
extern char *strrchr(const char *, int);

/*
 * Build the prefix used by the D() debug macro:
 * "[sec.usec] name", where name is the part of 's' after the last '/'
 * (or all of 's' if it has no '/') and sec is the current time of day
 * in seconds modulo 1000.
 * The result lives in a function-local static buffer, so it is
 * overwritten by the next call.
 */
static inline const char *xyz(const char *s) {
	static char buf[128];
	struct timeval t;
	const char *ret = strrchr(s, '/');
	if (ret) s = ret + 1;	/* keep only the basename */
	gettimeofday(&t, NULL);
	buf[sizeof(buf) - 1] = '\0';
	snprintf(buf, sizeof(buf), "[%4d.%06d] %s",
		(int)(t.tv_sec % 1000), (int)(t.tv_usec), s);
	return buf;
}

/* ND() and D1() compile to nothing; D() always prints to stderr */
#define ND(fmt, ...) do {} while (0)
#define D1(fmt, ...) do {} while (0)
#define D(fmt, ...) fprintf(stderr, "%s:%-10s [%d] " fmt "\n",      	\
        	xyz(__FILE__), __FUNCTION__, __LINE__,  ## __VA_ARGS__)
/* DX() prints only when dn_cfg.debug is greater than 'lev' */
#define DX(lev, fmt, ...) do {              \
        if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
/* end debugging macros */

/*
 * sbin/ipfw on non-freebsd platform
 */
#ifdef NEED_STRTONUM
/* prototypes from libutil */
/* humanize_number(3) */
#define HN_DECIMAL              0x01
#define HN_NOSPACE              0x02
#define HN_B                    0x04
#define HN_DIVISOR_1000         0x08
#define HN_IEC_PREFIXES         0x10

#define HN_GETSCALE             0x10
#define HN_AUTOSCALE            0x20

int     humanize_number(char *_buf, size_t _len, int64_t _number,
            const char *_suffix, int _scale, int _flags);
int     expand_number(const char *buf, uint64_t *num);

long long
strtonum(const char *nptr, long long minval, long long maxval,
	const char **errstr);
#ifndef __APPLE__
int ishexnumber(int c);
#endif
#endif /* NEED_STRTONUM */

#ifdef NEED_SYSCTLBYNAME /* and other linux calls */
int sysctlbyname(const char *name, void *oldp, size_t *oldlenp,
	void *newp, size_t newlen);
#define setprogname(x)  /* not present in linux */

extern int optreset;    /* not present in linux */

long long int strtonum(const char *nptr, long long minval,
	long long maxval, const char **errstr);

/* no sin_len in sockaddr, we only remap in userland */
#define	sin_len	sin_zero[0]
#define	sin6_len	sin6_flowinfo

struct ether_addr;
struct ether_addr * ether_aton(const char *a);

#define ICMP6_MAXTYPE   201
#define __u6_addr       in6_u
#define in6_u __in6_u   /* missing type for ipv6 (linux
2.6.28) */ #define __u6_addr32 u6_addr32 /* on freebsd sys/socket.h pf specific */ #define NET_RT_IFLIST 3 /* survey interface list */ #define RTM_VERSION 5 /* Up the ante and ignore older versions */ // #define __unused // conflicts with linux/sysctl.h #endif // NEED_SYSCTLBYNAME /* possibly redundant, does not harm */ size_t strlcpy(char * dst, const char * src, size_t siz); /* * Part 2: common userland and kernel definitions */ #define ICMP6_DST_UNREACH_NOROUTE 0 /* no route to destination */ #define ICMP6_DST_UNREACH_ADMIN 1 /* administratively prohibited */ #define ICMP6_DST_UNREACH_ADDR 3 /* address unreachable */ #define ICMP6_DST_UNREACH_NOPORT 4 /* port unreachable */ /* * linux: sysctl are mapped into /sys/module/ipfw_mod parameters * windows: they are emulated via get/setsockopt */ #define CTLFLAG_RD 1 #define CTLFLAG_RDTUN 1 #define CTLFLAG_RW 2 #define CTLFLAG_SECURE3 0 /* unsupported */ #define CTLFLAG_VNET 0 /* unsupported */ /* if needed, queue.h must be included here after list.h */ /* * our own struct thread */ struct thread { /* ip_fw_sockopt */ void *sopt_td; void *td_ucred; }; enum sopt_dir { SOPT_GET, SOPT_SET }; struct sockopt { enum sopt_dir sopt_dir; /* is this a get or a set? */ int sopt_level; /* second arg of [gs]etsockopt */ int sopt_name; /* third arg of [gs]etsockopt */ void *sopt_val; /* fourth arg of [gs]etsockopt */ size_t sopt_valsize; /* (almost) fifth arg of [gs]etsockopt */ struct thread *sopt_td; /* calling thread or null if kernel */ }; /* * List of values used for set/getsockopt options. * The base value on FreeBSD is defined as a macro, * if not available we will use our own enum. * The TABLE_BASE value is used in the kernel. 
*/ #define _IPFW_SOCKOPT_BASE 100 /* 40 on freebsd */ #define IP_FW_TABLE_ADD (_IPFW_SOCKOPT_BASE + 0) #define IP_FW_TABLE_DEL (_IPFW_SOCKOPT_BASE + 1) #define IP_FW_TABLE_FLUSH (_IPFW_SOCKOPT_BASE + 2) #define IP_FW_TABLE_GETSIZE (_IPFW_SOCKOPT_BASE + 3) #define IP_FW_TABLE_LIST (_IPFW_SOCKOPT_BASE + 4) #define IP_FW_DYN_GET (_IPFW_SOCKOPT_BASE + 5) #define IP_FW3 (_IPFW_SOCKOPT_BASE + 8) #define IP_DUMMYNET3 (_IPFW_SOCKOPT_BASE + 9) #define IP_FW_ADD (_IPFW_SOCKOPT_BASE + 10) #define IP_FW_DEL (_IPFW_SOCKOPT_BASE + 11) #define IP_FW_FLUSH (_IPFW_SOCKOPT_BASE + 12) #define IP_FW_ZERO (_IPFW_SOCKOPT_BASE + 13) #define IP_FW_GET (_IPFW_SOCKOPT_BASE + 14) #define IP_FW_RESETLOG (_IPFW_SOCKOPT_BASE + 15) #define IP_FW_NAT_CFG (_IPFW_SOCKOPT_BASE + 16) #define IP_FW_NAT_DEL (_IPFW_SOCKOPT_BASE + 17) #define IP_FW_NAT_GET_CONFIG (_IPFW_SOCKOPT_BASE + 18) #define IP_FW_NAT_GET_LOG (_IPFW_SOCKOPT_BASE + 19) #define IP_DUMMYNET_CONFIGURE (_IPFW_SOCKOPT_BASE + 20) #define IP_DUMMYNET_DEL (_IPFW_SOCKOPT_BASE + 21) #define IP_DUMMYNET_FLUSH (_IPFW_SOCKOPT_BASE + 22) /* 63 is missing */ #define IP_DUMMYNET_GET (_IPFW_SOCKOPT_BASE + 24) #define _IPFW_SOCKOPT_END (_IPFW_SOCKOPT_BASE + 25) /* * Part 3: userland stuff for linux/windows */ /* * now remap functions for userland or linux kernel etc. */ #ifdef USERSPACE /* * definitions used when the programs communicate through userspace. * We need to define the socket and addresses used to talk, and * the userland side must also remap socket() and [gs]etsockopt() * to appropriate wrappers. 
*/ #define LOCALADDR "127.0.0.1" #define IPFW_PORT 5555 #ifndef KERNEL_SIDE #ifdef _KERNEL #error _KERNEL defined in user space #endif int do_connect(const char *addr, int port); #include /* for socklen_t */ #define socket(a, b, c) do_connect(LOCALADDR, IPFW_PORT) #define setsockopt setsockopt2 #define getsockopt getsockopt2 int getsockopt2(int s, int lev, int optname, void *optval, socklen_t *optlen); int setsockopt2(int s, int lev, int optname, void *optval, socklen_t optlen); #endif /* KERNEL_SIDE */ #endif /* USERSPACE */ /* * Part 5: windows specific stuff and sysctl emulation */ /******************* * SYSCTL emulation * ********************/ #ifdef EMULATE_SYSCTL /* this needs to be here, as it is part of the user-kernel messages */ /* flag is set with the last 2 bits for access, as defined in glue.h * and the rest for type */ enum { SYSCTLTYPE_INT = 0, SYSCTLTYPE_UINT = 1, SYSCTLTYPE_SHORT = 2, SYSCTLTYPE_USHORT = 3, SYSCTLTYPE_LONG = 4, SYSCTLTYPE_ULONG = 5, SYSCTLTYPE_STRING = 6, /* the following are SYSCTL_PROC equivalents of the above, * where the SYSCTLTYPE is shifted 2 bits, * and SYSCTLTYPE_PROC is set */ SYSCTLTYPE_PROC = 0x100, CTLTYPE_INT = (0x100 | (0<<2)), CTLTYPE_UINT = (0x100 | (1<<2)) }; struct sysctlhead { uint32_t blocklen; //total size of the entry uint32_t namelen; //strlen(name) + '\0' uint32_t flags; //type and access uint32_t datalen; }; #endif /* EMULATE_SYSCTL */ int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen); #ifndef __FreeBSD__ #define test_bit(ix, pData) ((*pData) & (1<<(ix))) #define __set_bit(ix, pData) (*pData) |= (1<<(ix)) #define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) static inline int fls(int _n) { unsigned int n = _n; int i = 0; for (i = 0; n > 0; n >>= 1, i++) ; return i; } static inline unsigned long __fls(unsigned long word) { return fls(word) - 1; } #endif /* !FreeBSD */ #ifdef _KERNEL /* XXX kernel support */ /* on freebsd net/if.h XXX used */ #ifdef linux #define 
div64(a,b) (((int64_t)a)/((int64_t)b)) #define LINUX_VERSION_CODE 30003 #define KERNEL_VERSION(a,b,c) (a*10000+b*100 + c) #define __printflike(a,b) #endif /* linux */ #endif /* _KERNEL */ #ifndef __FreeBSD__ struct if_data { /* ... */ u_long ifi_mtu; /* maximum transmission unit */ }; #endif #ifdef __APPLE__ #include // need in kernel /* needed both in kernel and userspace */ struct if_data64 { // XXX Darwin version /* ... */ u_long ifi_mtu; /* maximum transmission unit */ }; struct net_event_data { }; struct in_addr; #endif /* __APPLE__ */ #define __PAST_END(v, idx) v[idx] /* * a fast copy routine */ #include // XXX only for multiples of 64 bytes, non overlapped. static inline void pkt_copy(const void *_src, void *_dst, int l) { const uint64_t *src = _src; uint64_t *dst = _dst; #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) if (unlikely(l >= 1024)) { bcopy(src, dst, l); return; } for (; l > 0; l-=64) { *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; } } #endif /* !_GLUE_H */ ipfw-user/extra/session.c000644 000423 000000 00000031700 12012143254 016202 0ustar00luigiwheel000000 000000 /* * Session handler to simulate soopt* and network communication * over a TCP socket, and also run the callbacks. */ #ifdef _KERNEL #undef _KERNEL #endif /* these headers need to be compiled without _KERNEL */ #include #include #include #include #include // PFIL_IN #include extern int errno; #ifdef free /* we are built in a pseudo-kernel env so malloc and free are redefined */ #undef free #undef malloc #endif /* free */ #include #include #include /* timersub */ #include #include #include /* read() */ #include /* mbuf */ #define _KERNEL /* args for ipfw */ #include #include /* * Global variables need to be somewhere... 
*/ void ip_dn_init(void); int ipfw_init(void); void ipfw_destroy(void); extern int (*ip_fw_ctl_ptr)(struct sockopt *); extern int (*ip_dn_ctl_ptr)(struct sockopt *); extern struct ip_fw *ip_fw_default_rule; extern int (*ip_fw_chk_ptr)(struct ip_fw_args *args); extern int ticks; /* kernel ticks counter */ int callout_startup(void); int callout_run(void); /* * generic handler for sockopt functions */ static int ctl_handler(struct sockopt *sopt) { int error = EINVAL; ND("called, level %d", sopt->sopt_level); if (sopt->sopt_level != IPPROTO_IP) return (EINVAL); switch (sopt->sopt_name) { default: D("command not recognised %d", sopt->sopt_name); break; case IP_FW3: // XXX untested case IP_FW_ADD: /* ADD actually returns the body... */ case IP_FW_GET: case IP_FW_DEL: case IP_FW_TABLE_GETSIZE: case IP_FW_TABLE_LIST: case IP_FW_NAT_GET_CONFIG: case IP_FW_NAT_GET_LOG: case IP_FW_FLUSH: case IP_FW_ZERO: case IP_FW_RESETLOG: case IP_FW_TABLE_ADD: case IP_FW_TABLE_DEL: case IP_FW_TABLE_FLUSH: case IP_FW_NAT_CFG: case IP_FW_NAT_DEL: if (ip_fw_ctl_ptr != NULL) error = ip_fw_ctl_ptr(sopt); else { D("ipfw not enabled"); error = ENOPROTOOPT; } break; case IP_DUMMYNET_GET: case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: case IP_DUMMYNET_FLUSH: case IP_DUMMYNET3: if (ip_dn_ctl_ptr != NULL) error = ip_dn_ctl_ptr(sopt); else error = ENOPROTOOPT; break ; } ND("returning error %d", error); return error; } /* * copy data back to userland */ int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len) { size_t valsize = sopt->sopt_valsize; ND("data len %d sopt_len %d", len, valsize); if (len < valsize) sopt->sopt_valsize = valsize = len; bcopy(buf, sopt->sopt_val, valsize); return 0; } /* * copy data from userland to kernel */ int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) { size_t valsize = sopt->sopt_valsize; ND("have %d len %d minlen %d", valsize, len, minlen); if (valsize < minlen) return EINVAL; if (valsize > len) sopt->sopt_valsize = valsize = 
len; bcopy(sopt->sopt_val, buf, valsize); return 0; } /* * session description for event-based programming */ /* event-based session support */ #define SOCK_QLEN 5 /* listen lenght for incoming connection */ static struct sess *all_sessions, *new_sessions; struct sess * new_session(int fd, handler_t *func, void *arg, enum flags_t flags) { struct sess *desc; desc = calloc(1, sizeof(*desc)); if (desc == NULL) return NULL; desc->fd = fd; desc->func = func; desc->arg = arg; desc->flags = flags; desc->next = new_sessions; new_sessions = desc; return desc; } /* remove deleted sessions, merge with new ones */ static void merge_sessions(void) { struct sess *cur, *prev, *tmp; for (prev = NULL, cur = all_sessions; cur; prev = cur, cur = tmp) { tmp = cur->next; if ( (cur->flags & WANT_DELETE) == 0) continue; if (prev) prev->next = cur->next; else all_sessions = cur->next; memset(cur, 0, sizeof(*cur)); free(cur); if (prev == NULL) cur = NULL; } if (prev) prev->next = new_sessions; else all_sessions = new_sessions; new_sessions = NULL; } /* set the fdset, return the fdmax+1 for select() */ int set_sessions(fd_set *r, fd_set *w) { struct sess *cur; int fd_max = -1; int count = 0,ready = 0; FD_ZERO(r); FD_ZERO(w); merge_sessions(); for (cur = all_sessions; cur; cur = cur->next) { count++; if (cur->flags & WANT_RUN) { ND("WANT_RUN on session %p", cur); cur->flags &= ~WANT_RUN; cur->func(cur, cur->arg); } if (cur->flags & WANT_READ) FD_SET(cur->fd, r); if (cur->flags & WANT_WRITE) FD_SET(cur->fd, w); if (cur->flags & (WANT_WRITE|WANT_READ)) { ready ++; if (cur->fd > fd_max) fd_max = cur->fd; } } ND("%d session %d waiting", count, ready); return fd_max + 1; } int run_sessions(fd_set *r, fd_set *w) { struct sess *cur; for (cur = all_sessions; cur; cur = cur->next) { int fd = cur->fd; // fprintf(stderr, "%s sess %p\n", __FUNCTION__, cur); if (FD_ISSET(fd, r) || FD_ISSET(fd, w)) cur->func(cur, cur->arg); } return 0; } struct sess_buf { int len; /* allocation length */ int used; /* 
bytes used */ int start; /* start position for next write */ char data[0]; }; struct sess_buf * get_buf(int size, struct sess_buf *old) { struct sess_buf *p = old; if (!p) { ND("new buffer size %d", size); p = calloc(1, sizeof(*p) + size); } else if (p->len >= size) { return p; } else { ND("calling realloc %p %d", old, size); p = realloc(old, sizeof(*p) + size); } if (!p) { if (old) free(old); } else { p->len = size; } return p; } /* * do a non-blocking read into the buffer, reallocating if space * is needed. */ static struct sess_buf * get_data(int fd, struct sess_buf *buf, int want) { int l; buf = get_buf(want, buf); if (buf == NULL) return buf; l = read(fd, buf->data + buf->used, want - buf->used); if (l > 0) buf->used += l; return buf; } /* * Handler for a request coming from the control socket. */ enum sockopt_state { READING = 0, WRITING = 1 }; struct sockopt_desc { int state; /* internal state */ struct sess_buf *rd; struct sess_buf *wr; }; /* header prepended to data in all transactions */ struct rx_hdr { uint32_t optlen; /* data len */ uint32_t level; /* or error ? */ uint32_t optname; /* or desired len ? */ uint32_t dir; /* in or out */ }; /* * Return the number of remainig bytes from the buffer. * The meessage is int optname; [int optlen; int data] * where the second part is present or not depending on the * message type. */ int get_want(struct sess_buf *rd, struct rx_hdr *r) { struct rx_hdr _r; int l = sizeof(_r); if (r == NULL) r = &_r; if (!rd || rd->used < l) { ND("short buffer (%d), return %d to bootstrap", rd ? 
rd->used : -1, l); return l; } bcopy(rd->data, r, l); /* header fields are in network format, convert to host fmt */ r->optlen = ntohl(r->optlen); r->level = ntohl(r->level); r->optname = ntohl(r->optname); r->dir = ntohl(r->dir); l += r->optlen; return l; } /* * The sockopt commands are sent in network format (at least the header) */ int sockopt_handler(struct sess *sess, void *arg) { struct sockopt_desc *d; int error = 1; ND("sess %p arg %p", sess, arg); if (sess->private == NULL) sess->private = calloc(1, sizeof(struct sockopt_desc)); d = sess->private; if (d == NULL) goto done; if (sess->flags & WANT_READ) { int l, want, prev; struct rx_hdr r; struct sockopt sopt; struct thread dummy; want = get_want(d->rd, &r); prev = d->rd ? d->rd->used : 0; ND("total message size is %d (prev %d)", want, prev); d->rd = get_data(sess->fd, d->rd, want); l = d->rd ? d->rd->used : 0; ND("read %d prev %d want %d", l, prev, want); if (l == prev) /* no data -> error */ goto done; want = get_want(d->rd, &r); ND("again, want %d l %d", want, l); if (l < want) /* must read more data */ return 0; sopt.sopt_dir = r.dir; sopt.sopt_level = r.level; sopt.sopt_name = r.optname; sopt.sopt_val = (l <= sizeof(r)) ? 
NULL : d->rd->data + sizeof(r); sopt.sopt_valsize = r.optlen; sopt.sopt_td = &dummy; ND("dir 0x%x lev %d opt %d optval %p optlen %d", sopt.sopt_dir, sopt.sopt_level, sopt.sopt_name, sopt.sopt_val, (int)sopt.sopt_valsize); r.level = htonl(ctl_handler(&sopt)); r.optlen = htonl(0); /* default len */ r.dir = htonl(sopt.sopt_dir); /* prepare the buffer for writing */ d->wr = d->rd; d->rd = NULL; d->wr->used = sopt.sopt_valsize + sizeof(r); d->wr->start = 0; /* now update the header */ if (sopt.sopt_dir == SOPT_GET) r.optlen = htonl(sopt.sopt_valsize); bcopy(&r, d->wr->data, sizeof(r)); sess->flags = WANT_WRITE; return 0; } if (sess->flags & WANT_WRITE) { struct sess_buf *wr = d->wr; int l = write(sess->fd, wr->data + wr->start, wr->used - wr->start); ND("written %d bytes out of %d", l, wr->used - wr->start); if (l <= 0) { if (errno == EAGAIN) return 0; goto done; /* error */ } wr->start += l; if (wr->start < wr->used) return 0; // prepare for another rpc sess->flags = WANT_READ; return 0; //goto done; } done: ND("closing session"); if (d) { if (d->rd) free(d->rd); if (d->wr) free(d->wr); d->rd = d->wr = NULL; sess->flags = WANT_DELETE; } return error; } /* * testing code when reading fake packets from socket 5556. 
* Turns out that ipfw_check_hook() is a lot slower than ipfw_chk() */ int packet_handler(struct sess *sess, void *arg) { char fake_buf[2048]; struct mbuf dm; int i; bzero(&dm, sizeof(dm)); dm.m_data = fake_buf + 14; /* skip mac hdr */ dm.m_len = dm.m_pkthdr.len = 128; fake_buf[14] = 0x45; // ip *(uint16_t *)(fake_buf+16) = htons(64); // bytes *(uint32_t *)(fake_buf+26) = htonl(0x01020304); // src *(uint32_t *)(fake_buf+30) = htonl(0x05060708); // dst { #if 0 struct ip_fw_args args; bzero(&args, sizeof(args)); args.m = &dm; for (i = 0; i < 1000; i++) ipfw_chk(&args); #else struct ifnet *ifp = NULL; struct inpcb *inp = NULL; struct mbuf *m = &dm; ND("sess %p arg %p", sess, arg); for (i = 0; i < 1000; i++) ipfw_check_hook(NULL, &m, ifp, PFIL_IN, inp); #endif } return 0; } /* * This task accepts a new connection and creates a new session. */ static int listener(struct sess *sess, void *arg) { int fd; ND("sess %p arg %p", sess, arg); fd = accept(sess->fd, NULL, NULL); if (fd < 0) return -1; new_session(fd, sess->arg ? sockopt_handler: packet_handler, sess->arg, WANT_READ); sess->flags = WANT_READ; return 0; } /* * listen on a socket, * return the listen fd or -1 on error. 
*/ static int do_server(const char *addr, int port) { int fd = -1, on; struct sockaddr_in server; /* open the listen socket */ fd = socket(AF_INET, SOCK_STREAM, 0); if (fd < 0) { perror( "socket" ); return -1; } on = 1; #ifdef SO_REUSEADDR if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) perror("SO_REUSEADDR failed(non fatal)"); #endif #ifdef SO_REUSEPORT on = 1; if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)) == -1) perror("SO_REUSEPORT failed(non fatal)"); #endif /* fill the server struct */ bzero(&server, sizeof(server)); server.sin_family = AF_INET; inet_aton(addr, &server.sin_addr); server.sin_port = htons(port); /* bind the local address */ if (bind(fd, (struct sockaddr*) &server, sizeof(server)) < 0) { perror( "bind" ); return -1; } D("+++ listening tcp %s:%d", inet_ntoa(server.sin_addr), ntohs(server.sin_port)); /* listen for incoming connection */ if (listen(fd, SOCK_QLEN) < 0) { perror( "listen" ); return -1; } return fd; } extern int ipfw_module_init(void); /* * main program for ipfw kernel side when running an userspace emulation: * open a socket on which we receive requests from userland, * another socket for calls from the 'kernel' (simulating packet * arrivals etc), and then periodically run the tick handler. 
*/ int mainloop(int argc, char *argv[]) { int listen_fd; struct timeval t0; const char *s, *addr = LOCALADDR; int port = IPFW_PORT; int i; gettimeofday(&t0, NULL); ticks = 0; callout_startup(); ipfw_module_init(); /* override the host if set in the environment */ s = getenv("IPFW_HOST"); if (s) addr = s; s = getenv("IPFW_PORT"); if (s && atoi(s) > 0) port = atoi(s); /* start the server */ listen_fd = do_server(addr, port); if (listen_fd < 0) { printf("Error starting server\n"); return -1; } new_session(listen_fd, listener, (void *)1, WANT_READ); #ifdef WITH_NETMAP for (i = 1; i < argc; i++) { netmap_add_port(argv[i]); } #endif /* WITH_NETMAP */ #if 1 // test code: a telnet on 5556 becomes an infinite source { int net_fd = do_server(addr, port+1); if (net_fd >= 0) new_session(net_fd, listener, NULL, WANT_READ); } #endif for (;;) { struct timeval now, delta = { 0, 1000000/hz} ; int n; fd_set r, w; n = set_sessions(&r, &w); select(n, &r, &w, NULL, &delta); run_sessions(&r, &w); gettimeofday(&now, 0); timersub(&now, &t0, &delta); /* compute absolute ticks. */ ticks = (delta.tv_sec * hz) + (delta.tv_usec * hz) / 1000000; callout_run(); } ipfw_destroy(); return 0; } ipfw-user/extra/linux_defs.h000644 000423 000000 00000007463 12006744005 016701 0ustar00luigiwheel000000 000000 #ifndef __LINUX_DEFS_ #define __LINUX_DEFS_ /* define, includes and functions missing in linux */ #ifdef __linux__ /* include and define */ #include /* inet_ntoa */ #include #include /* error define */ #include /* u_int32_t */ #include /* snprintf */ typedef struct mtx spinlock_t; typedef struct mtx rwlock_t; /* * some network structure can be defined in the bsd way * by using the _FAVOR_BSD definition. This is not true * for icmp structure. * XXX struct icmp contains bsd names in * /usr/include/netinet/ip_icmp.h */ #define icmp_code code #define icmp_type type /* linux in6_addr has no member __u6_addr * replace the whole structure ? 
*/ #define __u6_addr __in6_u // #define __u6_addr32 u6_addr32 /* defined in linux/sctp.h with no bsd definition */ struct sctphdr { uint16_t src_port; /* source port */ uint16_t dest_port; /* destination port */ uint32_t v_tag; /* verification tag of packet */ uint32_t checksum; /* Adler32 C-Sum */ /* chunks follow... */ } SCTP_PACKED; /* missing definition */ #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_ACK 0x10 #define RTF_CLONING 0x100 /* generate new routes on use */ #define IPPROTO_OSPFIGP 89 /* OSPFIGP */ #define IPPROTO_CARP 112 /* CARP */ #define IPPROTO_IPV4 IPPROTO_IPIP /* for compatibility */ #define CARP_VERSION 2 #define CARP_ADVERTISEMENT 0x01 #define PRIV_NETINET_IPFW 491 /* Administer IPFW firewall. */ #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) /* linux/stdlib */ #define IP_FORWARDING 0x1 /* most of ip header exists */ #define NETISR_IP 2 /* same as AF_INET */ #define PRIV_NETINET_DUMMYNET 494 /* Administer DUMMYNET. */ extern int securelevel; struct carp_header { #if BYTE_ORDER == LITTLE_ENDIAN u_int8_t carp_type:4, carp_version:4; #endif #if BYTE_ORDER == BIG_ENDIAN u_int8_t carp_version:4, carp_type:4; #endif }; struct pim { }; struct route { struct rtentry *ro_rt; struct sockaddr ro_dst; }; #if 0 // already in main header struct ifaltq { void *ifq_head; }; struct ifnet { char if_xname[IFNAMSIZ]; /* external name (name + unit) */ struct ifaltq if_snd; /* output queue (includes altq) */ }; /* involves mbufs */ int in_cksum(struct mbuf *m, int len); #define divert_cookie(mtag) 0 #define divert_info(mtag) 0 #define INADDR_TO_IFP(a, b) b = NULL #define pf_find_mtag(a) NULL #define pf_get_mtag(a) NULL #define AF_LINK AF_ASH /* ? 
linux/socket.h */ struct pf_mtag { void *hdr; /* saved hdr pos in mbuf, for ECN */ sa_family_t af; /* for ECN */ u_int32_t qid; /* queue id */ }; #endif /* radix related */ #if 0 struct radix_node { caddr_t rn_key; /* object of search */ caddr_t rn_mask; /* netmask, if present */ }; #endif /* missing functions */ /* from bsd sys/queue.h */ #define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = TAILQ_FIRST((head)); \ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ (var) = (tvar)) #define SLIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = SLIST_FIRST((head)); \ (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ (var) = (tvar)) /* depending of linux version */ #ifndef ETHERTYPE_IPV6 #define ETHERTYPE_IPV6 0x86dd /* IP protocol version 6 */ #endif #endif /* __linux__ */ #endif /* !__LINUX_DEFS_ */ ipfw-user/extra/ipfw2_mod.c000644 000423 000000 00000020744 12006744005 016417 0ustar00luigiwheel000000 000000 /* * Copyright (C) 2009 Luigi Rizzo, Marta Carbone, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id: ipfw2_mod.c 7787 2010-11-19 21:15:50Z marta $ * * The main interface to build ipfw+dummynet as a linux module. * (and possibly as a windows module as well, though that part * is not complete yet). * * The control interface uses the sockopt mechanism * on a socket(AF_INET, SOCK_RAW, IPPROTO_RAW). * * The data interface uses the netfilter interface, at the moment * hooked to the PRE_ROUTING and POST_ROUTING hooks. * Unfortunately the netfilter interface is a moving target, * so we need a set of macros to adapt to the various cases. * * In the netfilter hook we just mark packet as 'QUEUE' and then * let the queue handler to do the whole work (filtering and * possibly emulation). * As we receive packets, we wrap them with an mbuf descriptor * so the existing ipfw+dummynet code runs unmodified. */ #include #include /* sizeof struct mbuf */ #include /* NGROUPS */ #include /* in_addr */ #include /* ip_fw_ctl_t, ip_fw_chk_t */ #include /* ip_fw_ctl_t, ip_fw_chk_t */ #include /* ip_dn_ctl_t, ip_dn_io_t */ #include /* PFIL_IN, PFIL_OUT */ #include /* inet_iif */ /* * Here we allocate some global variables used in the firewall. */ //ip_dn_ctl_t *ip_dn_ctl_ptr; int (*ip_dn_ctl_ptr)(struct sockopt *); ip_fw_ctl_t *ip_fw_ctl_ptr; int (*ip_dn_io_ptr)(struct mbuf **m, int dir, struct ip_fw_args *fwa); ip_fw_chk_t *ip_fw_chk_ptr; void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* Divert hooks. 
*/ void (*ip_divert_ptr)(struct mbuf *m, int incoming); /* ng_ipfw hooks. */ ng_ipfw_input_t *ng_ipfw_input_p = NULL; /*--- * Control hooks: * ipfw_ctl_h() is a wrapper for linux to FreeBSD sockopt call convention. * then call the ipfw handler in order to manage requests. * In turn this is called by the linux set/get handlers. */ static int ipfw_ctl_h(struct sockopt *s, int cmd, int dir, int len, void __user *user) { struct thread t; int ret = EINVAL; memset(s, 0, sizeof(s)); s->sopt_name = cmd; s->sopt_dir = dir; s->sopt_valsize = len; s->sopt_val = user; /* sopt_td is not used but it is referenced */ memset(&t, 0, sizeof(t)); s->sopt_td = &t; if (ip_fw_ctl_ptr && cmd != IP_DUMMYNET3 && (cmd == IP_FW3 || cmd < IP_DUMMYNET_CONFIGURE)) ret = ip_fw_ctl_ptr(s); else if (ip_dn_ctl_ptr && (cmd == IP_DUMMYNET3 || cmd >= IP_DUMMYNET_CONFIGURE)) ret = ip_dn_ctl_ptr(s); return -ret; /* errors are < 0 on linux */ } /* * setsockopt hook has no return value other than the error code. */ int do_ipfw_set_ctl(void *sk, int cmd, void __user *user, unsigned int len) { struct sockopt s; /* pass arguments */ return ipfw_ctl_h(&s, cmd, SOPT_SET, len, user); } /* * getsockopt can can return a block of data in response. */ int do_ipfw_get_ctl(void *sk, int cmd, void __user *user, int *len) { struct sockopt s; /* pass arguments */ int ret = ipfw_ctl_h(&s, cmd, SOPT_GET, *len, user); *len = s.sopt_valsize; /* return lenght back to the caller */ return ret; } /* * Module glue - init and exit function. 
*/ #include /* descriptors for the children, until i find a way for the * linker to produce them */ extern moduledata_t *moddesc_ipfw; extern moduledata_t *moddesc_dummynet; extern moduledata_t *moddesc_dn_fifo; extern moduledata_t *moddesc_dn_wf2qp; extern moduledata_t *moddesc_dn_rr; extern moduledata_t *moddesc_dn_qfq; extern moduledata_t *moddesc_dn_prio; extern void *sysinit_ipfw_init; extern void *sysuninit_ipfw_destroy; extern void *sysinit_vnet_ipfw_init; extern void *sysuninit_vnet_ipfw_uninit; /*--- * Glue code to implement the registration of children with the parent. * Each child should call my_mod_register() when linking, so that * module_init() and module_exit() can call init_children() and * fini_children() to provide the necessary initialization. * We use the same mechanism for MODULE_ and SYSINIT_. * The former only get a pointer to the moduledata, * the latter have two function pointers (init/uninit) */ #include struct mod_args { const char *name; int order; struct moduledata *mod; void (*init)(void), (*uninit)(void); }; static unsigned int mod_idx; static struct mod_args mods[10]; /* hard limit to 10 modules */ int my_mod_register(const char *name, int order, struct moduledata *mod, void *init, void *uninit); /* * my_mod_register should be called automatically as the init * functions in the submodules. Unfortunately this compiler/linker * trick is not supported yet so we call it manually. */ int my_mod_register(const char *name, int order, struct moduledata *mod, void *init, void *uninit) { struct mod_args m; m.name = name; m.order = order; m.mod = mod; m.init = init; m.uninit = uninit; ND("called for %s", name); if (mod_idx < sizeof(mods) / sizeof(mods[0])) mods[mod_idx++] = m; return 0; } static void init_children(void) { unsigned int i; /* Call the functions registered at init time. 
*/ printf("%s mod_idx value %d\n", __FUNCTION__, mod_idx); for (i = 0; i < mod_idx; i++) { struct mod_args *m = &mods[i]; printf("+++ start module %d %s %s at %p order 0x%x\n", i, m->name, m->mod ? m->mod->name : "SYSINIT", m->mod, m->order); if (m->mod && m->mod->evhand) m->mod->evhand(NULL, MOD_LOAD, m->mod->priv); else if (m->init) m->init(); } } static void fini_children(void) { int i; /* Call the functions registered at init time. */ for (i = mod_idx - 1; i >= 0; i--) { struct mod_args *m = &mods[i]; printf("+++ end module %d %s %s at %p order 0x%x\n", i, m->name, m->mod ? m->mod->name : "SYSINIT", m->mod, m->order); if (m->mod && m->mod->evhand) m->mod->evhand(NULL, MOD_UNLOAD, m->mod->priv); else if (m->uninit) m->uninit(); } } /*--- end of module binding helper functions ---*/ int ipfw_module_init(void) { int ret = 0; rn_init(64); my_mod_register("ipfw", 1, moddesc_ipfw, NULL, NULL); my_mod_register("sy_ipfw", 2, NULL, sysinit_ipfw_init, sysuninit_ipfw_destroy); my_mod_register("sy_Vnet_ipfw", 3, NULL, sysinit_vnet_ipfw_init, sysuninit_vnet_ipfw_uninit); my_mod_register("dummynet", 4, moddesc_dummynet, NULL, NULL); my_mod_register("dn_fifo", 5, moddesc_dn_fifo, NULL, NULL); my_mod_register("dn_wf2qp", 6, moddesc_dn_wf2qp, NULL, NULL); my_mod_register("dn_rr", 7, moddesc_dn_rr, NULL, NULL); my_mod_register("dn_qfq", 8, moddesc_dn_qfq, NULL, NULL); my_mod_register("dn_prio", 9, moddesc_dn_prio, NULL, NULL); init_children(); #ifdef EMULATE_SYSCTL keinit_GST(); #endif return ret; } /* module shutdown */ void ipfw_module_exit(void) { #ifdef EMULATE_SYSCTL keexit_GST(); #endif fini_children(); printf("%s unloaded\n", __FUNCTION__); } ipfw-user/extra/expand_number.c000644 000423 000000 00000005773 11725221076 017372 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2007 Eric Anderson * Copyright (c) 2007 Pawel Jakub Dawidek * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 211343 2010-08-15 18:32:06Z des $"); #include #include #include #include //#include #include /* * Convert an expression of the following forms to a uint64_t. * 1) A positive decimal number. * 2) A positive decimal number followed by a 'b' or 'B' (mult by 1). * 3) A positive decimal number followed by a 'k' or 'K' (mult by 1 << 10). * 4) A positive decimal number followed by a 'm' or 'M' (mult by 1 << 20). * 5) A positive decimal number followed by a 'g' or 'G' (mult by 1 << 30). * 6) A positive decimal number followed by a 't' or 'T' (mult by 1 << 40). * 7) A positive decimal number followed by a 'p' or 'P' (mult by 1 << 50). 
* 8) A positive decimal number followed by a 'e' or 'E' (mult by 1 << 60). */ int expand_number(const char *buf, uint64_t *num) { uint64_t number; unsigned shift; char *endptr; number = strtoumax(buf, &endptr, 0); if (endptr == buf) { /* No valid digits. */ errno = EINVAL; return (-1); } switch (tolower((unsigned char)*endptr)) { case 'e': shift = 60; break; case 'p': shift = 50; break; case 't': shift = 40; break; case 'g': shift = 30; break; case 'm': shift = 20; break; case 'k': shift = 10; break; case 'b': case '\0': /* No unit. */ *num = number; return (0); default: /* Unrecognized unit. */ errno = EINVAL; return (-1); } if ((number << shift) >> shift != number) { /* Overflow */ errno = ERANGE; return (-1); } *num = number << shift; return (0); } ipfw-user/extra/humanize_number.c000644 000423 000000 00000011567 11725221076 017731 0ustar00luigiwheel000000 000000 /* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $ */ /* * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/lib/libutil/humanize_number.c 220582 2011-04-12 22:48:03Z delphij $"); #include #include #include #include #include #include #include //#include static const int maxscale = 7; int humanize_number(char *buf, size_t len, int64_t quotient, const char *suffix, int scale, int flags) { const char *prefixes, *sep; int i, r, remainder, s1, s2, sign; int64_t divisor, max; size_t baselen; assert(buf != NULL); assert(suffix != NULL); assert(scale >= 0); assert(scale < maxscale || (((scale & (HN_AUTOSCALE | HN_GETSCALE)) != 0))); assert(!((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES))); remainder = 0; if (flags & HN_IEC_PREFIXES) { baselen = 2; /* * Use the prefixes for power of two recommended by * the International Electrotechnical Commission * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...). * * HN_IEC_PREFIXES implies a divisor of 1024 here * (use of HN_DIVISOR_1000 would have triggered * an assertion earlier). 
*/ divisor = 1024; if (flags & HN_B) prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; else prefixes = "\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei"; } else { baselen = 1; if (flags & HN_DIVISOR_1000) divisor = 1000; else divisor = 1024; if (flags & HN_B) prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; else prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E"; } #define SCALE2PREFIX(scale) (&prefixes[(scale) * 3]) if (scale < 0 || (scale >= maxscale && (scale & (HN_AUTOSCALE | HN_GETSCALE)) == 0)) return (-1); if (buf == NULL || suffix == NULL) return (-1); if (len > 0) buf[0] = '\0'; if (quotient < 0) { sign = -1; quotient = -quotient; baselen += 2; /* sign, digit */ } else { sign = 1; baselen += 1; /* digit */ } if (flags & HN_NOSPACE) sep = ""; else { sep = " "; baselen++; } baselen += strlen(suffix); /* Check if enough room for `x y' + suffix + `\0' */ if (len < baselen + 1) return (-1); if (scale & (HN_AUTOSCALE | HN_GETSCALE)) { /* See if there is additional columns can be used. */ for (max = 1, i = len - baselen; i-- > 0;) max *= 10; /* * Divide the number until it fits the given column. * If there will be an overflow by the rounding below, * divide once more. */ for (i = 0; (quotient >= max || (quotient == max - 1 && remainder >= 950)) && i < maxscale; i++) { remainder = quotient % divisor; quotient /= divisor; } if (scale & HN_GETSCALE) return (i); } else { for (i = 0; i < scale && i < maxscale; i++) { remainder = quotient % divisor; quotient /= divisor; } } /* If a value <= 9.9 after rounding and ... 
*/ if (quotient <= 9 && remainder < 950 && i > 0 && flags & HN_DECIMAL) { /* baselen + \0 + .N */ if (len < baselen + 1 + 2) return (-1); s1 = (int)quotient + ((remainder + 50) / 1000); s2 = ((remainder + 50) / 100) % 10; r = snprintf(buf, len, "%d%s%d%s%s%s", sign * s1, localeconv()->decimal_point, s2, sep, SCALE2PREFIX(i), suffix); } else r = snprintf(buf, len, "%" PRId64 "%s%s%s", sign * (quotient + (remainder + 50) / 1000), sep, SCALE2PREFIX(i), suffix); return (r); } ipfw-user/extra/missing.c000644 000423 000000 00000036606 12007760741 016214 0ustar00luigiwheel000000 000000 /* * $Id$ * * Support to compile the kernel side of ipfw/dummynet in userland. * This file contains variables and functions that are not available in * userland. It is compiled in a kernel-like environment, so * it has _KERNEL defined, together with malloc() and free(). * They must be redefined here as we build the real thing. */ #include "glue.h" /* normally comes from the command line */ #include "missing.h" /* normally comes from the command line */ #undef _KERNEL #include #include #include #include /* timersub */ #define _KERNEL #include #include #include #undef malloc #undef free #include // calloc #include /* struct sockaddr, route, sockopt... */ #include #define IF_NAMESIZE 16 /* ip_fw.h */ #define IFNAMSIZ IF_NAMESIZE /* ip_fw.h */ /* * Global bariables in the kernel */ int ticks; /* kernel ticks counter */ int hz = 1000; /* default clock time */ long tick = 1000; /* XXX is this 100000/hz ? */ int bootverbose = 0; time_t time_uptime = 0; struct timeval boottime; int max_linkhdr; int ip_defttl; u_long in_ifaddrhmask; /* mask for hash table */ struct in_ifaddrhashhead *in_ifaddrhashtbl; /* inet addr hash table */ u_int rt_numfibs = RT_NUMFIBS; void module_register_init(const void *foo) { D("start for %p", foo); } /* defined as assert */ #include void panic(const char *fmt, ...) { assert(1); } void getmicrouptime(struct timeval *tv) { gettimeofday(tv, NULL); } /* * pfil hook support. 
* We make pfil_head_get return a non-null pointer, which is then ignored * in our 'add-hook' routines. */ struct pfil_head; typedef int (pfil_hook_t) (void *, struct mbuf **, struct ifnet *, int, struct inpcb *); struct pfil_head * pfil_head_get(int proto, u_long flags) { static int dummy; D("called"); return (struct pfil_head *)(void *)&dummy; } int pfil_add_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) { D("called"); return 0; } int pfil_remove_hook(pfil_hook_t *func, void *arg, int dir, struct pfil_head *h) { D("called"); return 0; } /* from sys/netinet/ip_output.c */ int ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, struct ip_moptions *imo, struct inpcb *inp) { D("unimplemented"); return 0; } struct tags_freelist tags_freelist; int tags_minlen = 64; int tags_freelist_count = 0; static int tags_freelist_max = 0; struct mbuf *mbuf_freelist; void m_freem(struct mbuf *m) { struct m_tag *t; /* free the m_tag chain */ while ( (t = SLIST_FIRST(&m->m_pkthdr.tags) ) ) { ND("free tag %p", &m->m_pkthdr.tags); SLIST_REMOVE_HEAD(&m->m_pkthdr.tags, m_tag_link); SLIST_INSERT_HEAD(&tags_freelist, t, m_tag_link); tags_freelist_count++; if (tags_freelist_count > tags_freelist_max) { static int pr=0; if ((pr++ % 1000) == 0) D("new max %d", tags_freelist_count); tags_freelist_max = tags_freelist_count; } } if (m->m_flags & M_STACK) { ND("free invalid mbuf %p", m); return; } /* free the mbuf */ ND("free(m = %p, M_IPFW);", m); m->m_next = mbuf_freelist; mbuf_freelist = m; }; /* from net/netisr.c */ int netisr_dispatch(u_int proto, struct mbuf *m) { if (m->__m_callback) m->__m_callback(m, proto); else D("unimplemented proto %d mbuf %p", proto, m); return 0; } /* define empty body for kernel function */ int priv_check(struct thread *td, int priv) { /* once connected, always allow */ ND("called"); return 0; } int securelevel_ge(struct ucred *cr, int level) { /* we are always secure... 
*/ ND("called"); return 0; } int sysctl_handle_int(SYSCTL_HANDLER_ARGS) { int tmp; ND("called"); if (!req || !req->oldptr || req->oldlen != sizeof(int)) return EINVAL; tmp = arg1 ? *(int *)arg1 : arg2; bcopy(&tmp, req->oldptr, sizeof(int)); /* XXX check the SET routine */ if (req->newptr && arg1) bcopy(req->newptr, arg1, sizeof(int)); return 0; } int sysctl_handle_long(SYSCTL_HANDLER_ARGS) { D("called"); return 0; } void ether_demux(struct ifnet *ifp, struct mbuf *m) { D("incomplete"); return; } int ether_output_frame(struct ifnet *ifp, struct mbuf *m) { D("incomplete"); return 0; } void in_rtalloc_ign(struct route *ro, u_long ignflags, u_int fibnum) { D("called"); return; } void icmp_error(struct mbuf *n, int type, int code, n_long dest, int mtu) { D("called"); return; } void rtfree(struct rtentry *rt) { D("called"); return; } u_short in_cksum_skip(struct mbuf *m, int len, int skip) { D("called"); return 0; } u_short in_cksum_hdr(struct ip *ip) { D("called"); return 0; } struct mbuf * ip_reass(struct mbuf *clone) { D("called"); return clone; } #ifdef INP_LOCK_ASSERT #undef INP_LOCK_ASSERT #define INP_LOCK_ASSERT(a) #endif int jailed(struct ucred *cred) { D("called"); return 0; } /* * Return 1 if an internet address is for a ``local'' host * (one to which we have a connection). If subnetsarelocal * is true, this includes other subnets of the local net. * Otherwise, it includes only the directly-connected (sub)nets. */ int in_localaddr(struct in_addr in) { D("called"); return 1; } #if 0 int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { return 1; } #endif /* * Procedures for the callout interface * * callout_init() initializes a descriptor, * callout_reset() starts a timer * callout_stop() stops a timer * * Internally we hold a list of callout entries etc etc. 
*/ struct callout_tailq callout_head; #include void callout_init(struct callout *c, int mpsafe) { D("c %p mpsafe %d", c, mpsafe); bzero(c, sizeof(*c)); } int callout_reset_on(struct callout *c, int due_ticks, void (*func)(void *), void *arg, int p) { return callout_reset(c, due_ticks, func, arg); } int callout_reset(struct callout *c, int due_ticks, void (*func)(void *), void *arg) { struct callout *cur; ND("c %p ticks %d f %p(%p)", c, due_ticks, func, arg); if (c->c_flags & CALLOUT_ACTIVE) { D(" --- callout was already active"); return -1; } c->c_time = ticks + due_ticks; /* XXX not the original meaning */ c->c_func = func; c->c_arg = arg; c->c_flags |= CALLOUT_ACTIVE; TAILQ_FOREACH(cur, &callout_head, c_links.tqe) { if ( (c->c_time - cur->c_time) < 0) break; } if (cur) TAILQ_INSERT_BEFORE(cur, c, c_links.tqe); else TAILQ_INSERT_TAIL(&callout_head, c, c_links.tqe); return 0; /* no error */ } int _callout_stop_safe(struct callout *c, int safe) { D("c %p safe %d", c, safe); TAILQ_REMOVE(&callout_head, c, c_links.tqe); return 0; } int callout_drain(struct callout *c) { _callout_stop_safe(c, 1); return 0; } void callout_startup(void) { D("start"); TAILQ_INIT( &callout_head); } void callout_run(void) { struct callout *cur, *tmp; ND("Run pending callouts tick %d", ticks); TAILQ_FOREACH_SAFE(cur, &callout_head, c_links.tqe, tmp) { int delta = ticks - cur->c_time; if (delta < 0) { // early ? //fprintf(stderr, "c %p due at %d\n", cur, cur->c_time); continue; } if (delta > 100) D("running %p due at %d now %d", cur, cur->c_time, ticks); TAILQ_REMOVE(&callout_head, cur, c_links.tqe); cur->c_flags &= ~CALLOUT_ACTIVE; cur->c_func(cur->c_arg); } } /* * the taskqueue type is actually opaque */ struct taskqueue { STAILQ_ENTRY(taskqueue) tq_link; STAILQ_HEAD(, task) tq_queue; const char *tq_name; taskqueue_enqueue_fn tq_enqueue; void *tq_context; struct task *tq_running; int tq_pcount; int tq_spin; int tq_flags; }; #if 0 /* * instead of enqueueing, we run this immediately. 
*/ int taskqueue_enqueue(struct taskqueue *queue, struct task *task) { task->ta_func(task->ta_context, 1); return 0; } #endif void taskqueue_thread_enqueue(void *context) { D("ctx %p", context); } struct taskqueue * taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context) { struct taskqueue *tq; tq = calloc(1, sizeof(*tq)); if (tq == NULL) return NULL; D("start %s fn %p ctx %p", name, enqueue, context); return tq; } int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, const char *name, ...) { D("tqp %p count %d (dummy)", tqp, count); return 0; } void taskqueue_drain(struct taskqueue *queue, struct task *task) { D("q %p task %p", queue, task); } void taskqueue_free(struct taskqueue *queue) { D("q %p", queue); free(queue); } void * kern_malloc(int sz) { return malloc(sz); } void kern_free(void *p) { free(p); } #ifdef linux size_t strlcpy(char *dst, const char *src, size_t siz) { char *d = dst; const char *s = src; size_t n = siz; /* Copy as many bytes as will fit */ if (n != 0 && --n != 0) { do { if ((*d++ = *s++) == 0) break; } while (--n != 0); } /* Not enough room in dst, add NUL and traverse rest of src */ if (n == 0) { if (siz != 0) *d = '\0'; /* NUL-terminate dst */ while (*s++) ; } return(s - src - 1); /* count does not include NUL */ } #endif // linux #ifdef EMULATE_SYSCTL /* * Support for sysctl emulation. 
* We transfer options as part of the IP_DUMMYNET3 sockopt emulation, * so we need to include ip_fw.h and ip_dummynet.h */ #include /* struct ip_fw_args */ #include /* struct dn_id */ static struct sysctltable GST; int kesysctl_emu_get(struct sockopt* sopt) { struct dn_id* oid = sopt->sopt_val; struct sysctlhead* entry; int sizeneeded = sizeof(struct dn_id) + GST.totalsize + sizeof(struct sysctlhead); unsigned char* pstring; unsigned char* pdata; int i; if (sopt->sopt_valsize < sizeneeded) { // this is a probe to retrieve the space needed for // a dump of the sysctl table oid->id = sizeneeded; sopt->sopt_valsize = sizeof(struct dn_id); return 0; } entry = (struct sysctlhead*)(oid+1); /* [entry][data(datalen)][name(namelen)] */ ND("copying values"); for( i=0; iblocklen = GST.entry[i].head.blocklen; entry->namelen = GST.entry[i].head.namelen; entry->flags = GST.entry[i].head.flags; entry->datalen = GST.entry[i].head.datalen; pdata = (unsigned char*)(entry+1); pstring = pdata+GST.entry[i].head.datalen; if (entry->flags & SYSCTLTYPE_PROC) { int (*f)(SYSCTL_HANDLER_ARGS); int tmp = 0, ret; struct sysctl_req req; bzero(&req, sizeof(req)); req.oldlen = req.newlen = sizeof(int); req.oldptr = &tmp; f = (void *)GST.entry[i].data; ND("-- %s is a proc -- at %p", GST.entry[i].name, f); ret = f(NULL, NULL, 0, &req); ND("-- %s returns %d", GST.entry[i].name, ret); bcopy(&tmp, pdata, sizeof(tmp)); } else { bcopy(GST.entry[i].data, pdata, GST.entry[i].head.datalen); } bcopy(GST.entry[i].name, pstring, GST.entry[i].head.namelen); entry = (struct sysctlhead*) ((unsigned char*)(entry) + GST.entry[i].head.blocklen); } sopt->sopt_valsize = sizeneeded; return 0; } int kesysctl_emu_set(void* p, int l) { struct sysctlhead* entry; unsigned char* pdata; unsigned char* pstring; int i = 0; entry = (struct sysctlhead*)(((struct dn_id*)p)+1); pdata = (unsigned char*)(entry+1); pstring = pdata + entry->datalen; for (i=0; idatalen != GST.entry[i].head.datalen) { printf("%s: len mismatch, user %d vs 
kernel %d\n", __FUNCTION__, entry->datalen, GST.entry[i].head.datalen); return -1; } // check access (at the moment flags handles only the R/W rights //later on will be type + access if( (GST.entry[i].head.flags & 3) == CTLFLAG_RD) { printf("%s: the entry %s is read only\n", __FUNCTION__,GST.entry[i].name); return -1; } if (GST.entry[i].head.flags & SYSCTLTYPE_PROC) { int (*f)(SYSCTL_HANDLER_ARGS); int tmp = 0, ret; struct sysctl_req req; bzero(&req, sizeof(req)); req.oldlen = req.newlen = sizeof(int); req.oldptr = &tmp; req.newptr = pdata; f = (void *)GST.entry[i].data; ND("-- %s is a proc -- at %p", GST.entry[i].name, f); ret = f(NULL, NULL, 0, &req); ND("-- %s returns %d", GST.entry[i].name, ret); } else { bcopy(pdata, GST.entry[i].data, GST.entry[i].head.datalen); } return 0; } D("%s: match not found\n",__FUNCTION__); return 0; } /* convert all _ to . until the first . */ static void underscoretopoint(char* s) { for (; *s && *s != '.'; s++) if (*s == '_') *s = '.'; } static int formatnames(void) { int i; int size=0; char* name; for (i=0; i> 2, GST.entry[i].head.flags & 0x00000003); printf("data %i\n", *(int*)(GST.entry[i].data)); printf("datalen %i\n", GST.entry[i].head.datalen); printf("blocklen %i\n", GST.entry[i].head.blocklen); } } void sysctl_addgroup_f1(void); void sysctl_addgroup_f2(void); void sysctl_addgroup_f3(void); void sysctl_addgroup_f4(void); void keinit_GST(void) { int ret; sysctl_addgroup_f1(); sysctl_addgroup_f2(); sysctl_addgroup_f3(); sysctl_addgroup_f4(); ret = formatnames(); if (ret != 0) printf("conversion of names failed for some reason\n"); if (0) dumpGST(); // XXX debugging printf("*** Global Sysctl Table entries = %i, total size = %i ***\n", GST.count, GST.totalsize); } void keexit_GST(void) { if (GST.namebuffer != NULL) free(GST.namebuffer); bzero(&GST, sizeof(GST)); } void sysctl_pushback(char* name, int flags, int datalen, void* data) { if (GST.count >= GST_HARD_LIMIT) { printf("WARNING: global sysctl table full, this entry will 
not be added," "please recompile the module increasing the table size\n"); return; } GST.entry[GST.count].head.namelen = strlen(name)+1; //add space for '\0' GST.entry[GST.count].name = name; GST.entry[GST.count].head.flags = flags; GST.entry[GST.count].data = data; GST.entry[GST.count].head.datalen = datalen; GST.entry[GST.count].head.blocklen = ((sizeof(struct sysctlhead) + GST.entry[GST.count].head.namelen + GST.entry[GST.count].head.datalen)+3) & ~3; GST.totalsize += GST.entry[GST.count].head.blocklen; GST.count++; } #endif /* EMULATE_SYSCTL */ extern int mainloop(int argc, char *argv[]); /* * main program for ipfw kernel side when running an userspace emulation: * open a socket on which we receive requests from userland, * another socket for calls from the 'kernel' (simulating packet * arrivals etc), and then periodically run the tick handler. */ int main(int argc, char *argv[]) { return mainloop(argc, argv); } ipfw-user/extra/glue.c000644 000423 000000 00000031360 12007433754 015470 0ustar00luigiwheel000000 000000 /* * Userland functions missing in linux * taken from /usr/src/lib/libc/stdtime/time32.c */ #include #include #include #include #include /* sockaddr_in */ #include #include /* uint* types */ #include #include /* bzero */ #include /* htonl */ #ifndef HAVE_NAT /* dummy nat functions */ void ipfw_show_nat(int ac, char **av) { D("unsupported"); } void ipfw_config_nat(int ac, char **av) { D("unsupported"); } #endif /* HAVE_NAT */ #ifdef NEED_STRTONUM /* missing in linux and windows */ long long int strtonum(const char *nptr, long long minval, long long maxval, const char **errstr) { long long ret; int errno_c = errno; /* save actual errno */ errno = 0; #ifdef TCC ret = strtol(nptr, (char **)errstr, 0); #else ret = strtoll(nptr, (char **)errstr, 0); #endif /* We accept only a string that represent exactly a number (ie. start * and end with a digit). 
* FreeBSD version wants errstr==NULL if no error occurs, otherwise * errstr should point to an error string. * For our purspose, we implement only the invalid error, ranges * error aren't checked */ if (errno != 0 || nptr == *errstr || **errstr != '\0') *errstr = "invalid"; else { *errstr = NULL; errno = errno_c; } return ret; } int ishexnumber(int c) { return ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') ); } #endif /* NEED_STRTONUM */ #ifdef __linux__ int optreset; /* missing in linux */ /* * not implemented in linux. * taken from /usr/src/lib/libc/string/strlcpy.c */ size_t strlcpy(dst, src, siz) char *dst; const char *src; size_t siz; { char *d = dst; const char *s = src; size_t n = siz; /* Copy as many bytes as will fit */ if (n != 0 && --n != 0) { do { if ((*d++ = *s++) == 0) break; } while (--n != 0); } /* Not enough room in dst, add NUL and traverse rest of src */ if (n == 0) { if (siz != 0) *d = '\0'; /* NUL-terminate dst */ while (*s++) ; } return(s - src - 1); /* count does not include NUL */ } #endif /* __linux__ */ #if defined (EMULATE_SYSCTL) //XXX missing prerequisites #include //openwrt #include //openwrt #include #include int do_cmd(int optname, void *optval, uintptr_t optlen); #endif /* EMULATE_SYSCTL */ /* * set or get system information * XXX lock acquisition/serialize calls * * we export this as sys/module/ipfw_mod/parameters/___ * This function get or/and set the value of the sysctl passed by * the name parameter. If the old value is not desired, * oldp and oldlenp should be set to NULL. * * XXX * I do not know how this works in FreeBSD in the case * where there are no write permission on the sysctl var. * We read the value and set return variables in any way * but returns -1 on write failures, regardless the * read success. * * Since there is no information on types, in the following * code we assume a length of 4 is a int. * * Returns 0 on success, -1 on errors. 
*/ int sysctlbyname(const char *name, void *oldp, size_t *oldlenp, void *newp, size_t newlen) { #if defined (EMULATE_SYSCTL) /* * we embed the sysctl request in the usual sockopt mechanics. * the sockopt buffer il filled with a dn_id with IP_DUMMYNET3 * command, and the special DN_SYSCTL_GET and DN_SYSCTL_SET * subcommands. * the syntax of this function is fully compatible with * POSIX sysctlby name: * if newp and newlen are != 0 => this is a set * else if oldp and oldlen are != 0 => this is a get * to avoid too much overhead in the module, the whole * sysctltable is returned, and the parsing is done in userland, * a probe request is done to retrieve the size needed to * transfer the table, before the real request * if both old and new params = 0 => this is a print * this is a special request, done only by main() * to implement the extension './ipfw sysctl', * a command that bypasses the normal getopt, and that * is available on those platforms that use this * sysctl emulation. * in this case, a negative oldlen signals that *oldp * is actually a FILE* to print somewhere else than stdout */ int l; int ret; struct dn_id* oid; struct sysctlhead* entry; char* pstring; char* pdata; FILE* fp; if((oldlenp != NULL) && (*oldlenp < 0)) fp = (FILE*)oldp; else fp = stdout; if(newp != NULL && newlen != 0) { //this is a set l = sizeof(struct dn_id) + sizeof(struct sysctlhead) + strlen(name)+1 + newlen; oid = malloc(l); if (oid == NULL) return -1; oid->len = l; oid->type = DN_SYSCTL_SET; oid->id = DN_API_VERSION; entry = (struct sysctlhead*)(oid+1); pdata = (char*)(entry+1); pstring = pdata + newlen; entry->blocklen = ((sizeof(struct sysctlhead) + strlen(name)+1 + newlen) + 3) & ~3; entry->namelen = strlen(name)+1; entry->flags = 0; entry->datalen = newlen; bcopy(newp, pdata, newlen); bcopy(name, pstring, strlen(name)+1); ret = do_cmd(IP_DUMMYNET3, oid, (uintptr_t)l); if (ret != 0) return -1; } else { //this is a get or a print l = sizeof(struct dn_id); oid = malloc(l); if (oid 
== NULL) return -1; oid->len = l; oid->type = DN_SYSCTL_GET; oid->id = DN_API_VERSION; ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); if (ret != 0) return -1; l=oid->id; free(oid); oid = malloc(l); if (oid == NULL) return -1; oid->len = l; oid->type = DN_SYSCTL_GET; oid->id = DN_API_VERSION; ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); if (ret != 0) return -1; entry = (struct sysctlhead*)(oid+1); while(entry->blocklen != 0) { pdata = (char*)(entry+1); pstring = pdata+entry->datalen; //time to check if this is a get or a print if(name != NULL && oldp != NULL && *oldlenp > 0) { //this is a get if(strcmp(name,pstring) == 0) { //match found, sanity chech on len if(*oldlenp < entry->datalen) { printf("%s error: buffer too small\n",__FUNCTION__); return -1; } *oldlenp = entry->datalen; bcopy(pdata, oldp, *oldlenp); return 0; } } else { //this is a print if( name == NULL ) goto print; if ( (strncmp(pstring,name,strlen(name)) == 0) && ( pstring[strlen(name)]=='\0' || pstring[strlen(name)]=='.' 
) ) goto print; else goto skip; print: fprintf(fp, "%s: ",pstring); switch( entry->flags >> 2 ) { case SYSCTLTYPE_LONG: fprintf(fp, "%li ", *(long*)(pdata)); break; case SYSCTLTYPE_UINT: fprintf(fp, "%u ", *(unsigned int*)(pdata)); break; case SYSCTLTYPE_ULONG: fprintf(fp, "%lu ", *(unsigned long*)(pdata)); break; case SYSCTLTYPE_INT: default: fprintf(fp, "%i ", *(int*)(pdata)); } if( (entry->flags & 0x00000003) == CTLFLAG_RD ) fprintf(fp, "\t(read only)\n"); else fprintf(fp, "\n"); skip: ; } entry = (struct sysctlhead*)((unsigned char*)entry + entry->blocklen); } free(oid); return 0; } //fallback for invalid options return -1; #else /* __linux__ */ FILE *fp; char *basename = "/sys/module/ipfw_mod/parameters/"; char filename[256]; /* full filename */ char *varp; int ret = 0; /* return value */ int d; if (name == NULL) /* XXX set errno */ return -1; /* locate the filename */ varp = strrchr(name, '.'); if (varp == NULL) /* XXX set errno */ return -1; snprintf(filename, sizeof(filename), "%s%s", basename, varp+1); /* * XXX we could open the file here, in rw mode * but need to check if a file have write * permissions. */ /* check parameters */ if (oldp && oldlenp) { /* read mode */ fp = fopen(filename, "r"); if (fp == NULL) { fprintf(stderr, "%s fopen error reading filename %s\n", __FUNCTION__, filename); return -1; } if (*oldlenp == 4) { if (fscanf(fp, "%d", &d) == 1) memcpy(oldp, &d, *oldlenp); else ret = -1; } fclose(fp); } if (newp && newlen) { /* write */ fp = fopen(filename, "w"); if (fp == NULL) { fprintf(stderr, "%s fopen error writing filename %s\n", __FUNCTION__, filename); return -1; } if (newlen == 4) { if (fprintf(fp, "%d", *(int*)newp) < 1) ret = -1; } fclose(fp); } return ret; #endif /* __linux__ */ } /* * The following two functions implement getsockopt/setsockopt * replacements to talk over a TCP socket. * Because the calls are synchronous, we can run blocking code * and do not need to play special tricks to be selectable. 
* The wire protocol for the emulation is the following: * REQUEST: n32 req_size, level, optname; u8 data[req_size] * RESPONSE: n32 resp_size, ret_code; u8 data[resp_size] * data is only present if ret_code == 0 * * Return 0 if the message wan sent to the remote * endpoint, -1 on error. * * If the required lenght is greater then the * available buffer size, -1 is returned and * optlen is the required lenght. */ enum sock_type {GET_SOCKOPT, SET_SOCKOPT}; struct wire_hdr { uint32_t optlen; /* actual data len */ uint32_t level; /* or error */ uint32_t optname; /* or act len */ uint32_t dir; /* in or out */ }; /* do a complete write of the buffer */ static int writen(int fd, const char *buf, int len) { int i; for (; len > 0; buf += i, len -= i) { i = write(fd, buf, len); ND("have %d wrote %d", len, i); if (i < 0) { if (errno == EAGAIN) continue; return -1; } } return 0; } /* do a complete read */ static int readn(int fd, char *buf, int len) { int i, pos; for (pos = 0; pos < len; pos += i) { i = read(fd, buf + pos, len - pos); ND("have %d want %d got %d", pos, len, i); if (i < 0) { if (errno == EAGAIN) continue; return -1; } } ND("full read got %d", pos); return 0; } int __sockopt2(int s, int level, int optname, void *optval, socklen_t *optlen, enum sopt_dir dir) { struct wire_hdr r; int len = optlen && optval ? 
*optlen : 0; ND("dir %d optlen %d level %d optname %d", dir, len, level, optname); /* send request to the server */ r.optlen = htonl(len); r.level = htonl(level); r.optname = htonl(optname); r.dir = htonl(dir); if (writen(s, (const char *) &r, sizeof(r))) return -1; /* error writing */ /* send data, if present */ if (len < 0) { fprintf(stderr, "%s invalid args found\n", __FUNCTION__); return -1; } else if (len > 0) { if (writen(s, optval, len)) return -1; /* error writing */ } /* read response size and error code */ if (readn(s, (char *)&r, sizeof(r))) return -1; /* error reading */ len = ntohl(r.optlen); ND("got header, datalen %d", len); if (len > 0) { if (readn(s, optval, len)) return -1; /* error reading */ } if (optlen) *optlen = ntohl(r.optlen); /* actual len */ return 0; // XXX valid ntohl(r.level); } /* * getsockopt() replacement. */ int getsockopt2(int s, int level, int optname, void *optval, socklen_t *optlen) { return __sockopt2(s, level, optname, optval, optlen, SOPT_GET); } /* * setsockopt() replacement */ int setsockopt2(int s, int level, int optname, void *optval, socklen_t optlen) { /* optlen not changed, use the local address */ return __sockopt2(s, level, optname, optval, &optlen, SOPT_SET); } #ifdef socket #undef socket /* we want the real one */ #endif /* * This function replaces the socket() call to connect to * the ipfw control socket. * We actually ignore the paramerers if IPFW_HOST and IPFW_PORT * are defined. 
*/ int do_connect(const char *addr, int port) { int conn_fd; /* open the socket */ #ifdef NETLINK struct rtnl_handle rth; conn_fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE); #else struct sockaddr_in server; /* server address */ const char *s; conn_fd = socket(AF_INET, SOCK_STREAM, 0); if (conn_fd < 0) { perror("socket"); return -1; } #endif #ifndef NETLINK /* fill the sockaddr structure with server address */ bzero(&server, sizeof(server)); server.sin_family = AF_INET; /* override the host if set in the environment */ s = getenv("IPFW_HOST"); if (s) addr = s; inet_aton(addr, &server.sin_addr); s = getenv("IPFW_PORT"); if (s && atoi(s) > 0) port = atoi(s); server.sin_port = htons(port); /* connect to the server */ if (connect(conn_fd, (struct sockaddr*) &server, sizeof(server)) < 0) { perror("connect"); return -1; } if (1) fprintf(stderr, "connected to %s:%d\n", inet_ntoa(server.sin_addr), ntohs(server.sin_port)); #endif return conn_fd; } ipfw-user/extra/nm_util.c000644 000423 000000 00000014704 12006744005 016177 0ustar00luigiwheel000000 000000 /* * Copyright (C) 2012 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD$ * $Id$ * * utilities to use netmap devices. * This does the basic functions of opening a device and issuing * ioctls() */ #include "nm_util.h" extern int verbose; int nm_do_ioctl(struct my_ring *me, int what, int subcmd) { struct ifreq ifr; int error; #if defined( __FreeBSD__ ) || defined (__APPLE__) int fd = me->fd; #endif #ifdef linux struct ethtool_value eval; int fd; fd = socket(AF_INET, SOCK_DGRAM, 0); if (fd < 0) { printf("Error: cannot get device control socket.\n"); return -1; } #endif /* linux */ (void)subcmd; // unused bzero(&ifr, sizeof(ifr)); strncpy(ifr.ifr_name, me->ifname, sizeof(ifr.ifr_name)); switch (what) { case SIOCSIFFLAGS: #ifndef __APPLE__ ifr.ifr_flagshigh = me->if_flags >> 16; #endif ifr.ifr_flags = me->if_flags & 0xffff; break; #if defined( __FreeBSD__ ) case SIOCSIFCAP: ifr.ifr_reqcap = me->if_reqcap; ifr.ifr_curcap = me->if_curcap; break; #endif #ifdef linux case SIOCETHTOOL: eval.cmd = subcmd; eval.data = 0; ifr.ifr_data = (caddr_t)&eval; break; #endif /* linux */ } error = ioctl(fd, what, &ifr); if (error) goto done; switch (what) { case SIOCGIFFLAGS: #ifndef __APPLE__ me->if_flags = (ifr.ifr_flagshigh << 16) | (0xffff & ifr.ifr_flags); #endif if (verbose) D("flags are 0x%x", me->if_flags); break; #if defined( __FreeBSD__ ) case SIOCGIFCAP: me->if_reqcap = ifr.ifr_reqcap; me->if_curcap = ifr.ifr_curcap; if (verbose) D("curcap are 0x%x", me->if_curcap); break; #endif /* __FreeBSD__ */ } done: #ifdef linux 
close(fd); #endif if (error) { ND("ioctl error %d 0x%x", error, what); // perror(""); } return error; } /* * open a device. if me->mem is null then do an mmap. * Returns the file descriptor. * The extra flag checks configures promisc mode. */ int netmap_open(struct my_ring *me, int ringid, int promisc) { int fd, err, l; struct nmreq req; me->fd = fd = open("/dev/netmap", O_RDWR); if (fd < 0) { D("Unable to open /dev/netmap"); return (-1); } D("/dev/netmap opened ok"); bzero(&req, sizeof(req)); req.nr_version = NETMAP_API; strncpy(req.nr_name, me->ifname, sizeof(req.nr_name)); req.nr_ringid = ringid; err = ioctl(fd, NIOCGINFO, &req); if (err) { D("cannot get info on %s, errno %d ver %d", me->ifname, errno, req.nr_version); goto error; } me->memsize = l = req.nr_memsize; if (verbose) D("memsize is %d MB", l>>20); err = ioctl(fd, NIOCREGIF, &req); if (err) { D("Unable to register %s", me->ifname); goto error; } if (me->mem == NULL) { me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); if (me->mem == MAP_FAILED) { D("Unable to mmap"); me->mem = NULL; goto error; } } /* Set the operating mode. */ if (ringid != NETMAP_SW_RING) { nm_do_ioctl(me, SIOCGIFFLAGS, 0); if ((me[0].if_flags & IFF_UP) == 0) { D("%s is down, bringing up...", me[0].ifname); me[0].if_flags |= IFF_UP; } if (promisc) { me[0].if_flags |= IFF_PPROMISC; nm_do_ioctl(me, SIOCSIFFLAGS, 0); nm_do_ioctl(me+1, SIOCGIFFLAGS, 0); me[1].if_flags |= IFF_PPROMISC; nm_do_ioctl(me+1, SIOCSIFFLAGS, 0); } #ifdef __FreeBSD__ /* also disable checksums etc. */ nm_do_ioctl(me, SIOCGIFCAP, 0); me[0].if_reqcap = me[0].if_curcap; me[0].if_reqcap &= ~(IFCAP_HWCSUM | IFCAP_TSO | IFCAP_TOE); nm_do_ioctl(me+0, SIOCSIFCAP, 0); #endif #ifdef linux /* disable: * - generic-segmentation-offload * - tcp-segmentation-offload * - rx-checksumming * - tx-checksumming * XXX check how to set back the caps. 
*/ nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_SGSO); nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_STSO); nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_SRXCSUM); nm_do_ioctl(me, SIOCETHTOOL, ETHTOOL_STXCSUM); #endif /* linux */ } me->nifp = NETMAP_IF(me->mem, req.nr_offset); me->queueid = ringid; if (ringid & NETMAP_SW_RING) { me->begin = req.nr_rx_rings; me->end = me->begin + 1; me->tx = NETMAP_TXRING(me->nifp, req.nr_tx_rings); me->rx = NETMAP_RXRING(me->nifp, req.nr_rx_rings); } else if (ringid & NETMAP_HW_RING) { D("XXX check multiple threads"); me->begin = ringid & NETMAP_RING_MASK; me->end = me->begin + 1; me->tx = NETMAP_TXRING(me->nifp, me->begin); me->rx = NETMAP_RXRING(me->nifp, me->begin); } else { me->begin = 0; me->end = req.nr_rx_rings; // XXX max of the two me->tx = NETMAP_TXRING(me->nifp, 0); me->rx = NETMAP_RXRING(me->nifp, 0); } return (0); error: close(me->fd); return -1; } int netmap_close(struct my_ring *me) { D(""); if (me->mem) munmap(me->mem, me->memsize); ioctl(me->fd, NIOCUNREGIF, NULL); close(me->fd); return (0); } /* * how many packets on this set of queues ? */ int pkt_queued(struct my_ring *me, int tx) { u_int i, tot = 0; ND("me %p begin %d end %d", me, me->begin, me->end); for (i = me->begin; i < me->end; i++) { struct netmap_ring *ring = tx ? NETMAP_TXRING(me->nifp, i) : NETMAP_RXRING(me->nifp, i); tot += ring->avail; } if (0 && verbose && tot && !tx) D("ring %s %s %s has %d avail at %d", me->ifname, tx ? "tx": "rx", me->end >= me->nifp->ni_tx_rings ? // XXX who comes first ? "host":"net", tot, NETMAP_TXRING(me->nifp, me->begin)->cur); return tot; } ipfw-user/extra/nm_util.h000644 000423 000000 00000011600 12007702010 016163 0ustar00luigiwheel000000 000000 /* * Copyright (C) 2012 Luigi Rizzo. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $Id$ * * Some utilities to build netmap-based programs. */ #ifndef _NM_UTIL_H #define _NM_UTIL_H #include #include /* signal */ #include #include #include /* PRI* macros */ #include /* strcmp */ #include /* open */ #include /* close */ #include /* getifaddrs */ #include /* PROT_* */ #include /* ioctl */ #include #include /* sockaddr.. 
*/ #include /* ntohs */ #include #include /* sysctl */ #include /* timersub */ #include #include /* ifreq */ #ifdef linux #define ifr_flagshigh ifr_flags #define ifr_curcap ifr_flags #define ifr_reqcap ifr_flags #define IFF_PPROMISC IFF_PROMISC #include #include #endif /* linux */ #ifdef __APPLE__ #define ifr_flagshigh ifr_flags // XXX #define IFF_PPROMISC IFF_PROMISC #endif /* __APPLE__ */ #ifdef __FreeBSD__ #include /* le64toh */ #include #endif /* __FreeBSD__ */ #include #include #include #include #include #ifndef MY_PCAP #include // XXX do we need it ? #endif // XXX hack #include /* pthread_* */ #if defined(__FreeBSD__) #include /* pthread w/ affinity */ #include /* cpu_set */ #include /* LLADDR */ #endif #if defined(__APPLE__) #include /* LLADDR */ #define clock_gettime(a,b) \ do {struct timespec t0 = {0,0}; *(b) = t0; } while (0) #endif #ifdef linux #define CLOCK_REALTIME_PRECISE CLOCK_REALTIME #include /* ether_aton */ #include /* sockaddr_ll */ #endif /* linux */ #ifndef min // conflict with glue.h static inline int min(int a, int b) { return a < b ? a : b; } #endif /* debug support */ #ifndef ND #define ND(format, ...) do {} while(0) #define D(format, ...) 
\ fprintf(stderr, "%s [%d] " format "\n", \ __FUNCTION__, __LINE__, ##__VA_ARGS__) #endif /* ND */ #ifndef EXPERIMENTAL #define EXPERIMENTAL 0 #endif #define prefetch(x) __builtin_prefetch(x) #if EXPERIMENTAL /* Wrapper around `rdtsc' to take reliable timestamps flushing the pipeline */ #define netmap_rdtsc(t) \ do { \ u_int __regs[4]; \ do_cpuid(0, __regs); \ (t) = rdtsc(); \ } while (0) static __inline void do_cpuid(u_int ax, u_int *p) { __asm __volatile("cpuid" : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3]) : "0" (ax)); } static __inline uint64_t rdtsc(void) { uint64_t rv; __asm __volatile("rdtsc" : "=A" (rv)); return (rv); } #endif /* EXPERIMENTAL */ //struct my_ring; /* * info on a ring we handle */ struct my_ring { const char *ifname; int fd; char *mem; /* userspace mmap address */ u_int memsize; u_int queueid; u_int begin, end; /* first..last+1 rings to check */ struct netmap_if *nifp; struct netmap_ring *tx, *rx; /* shortcuts */ uint32_t if_flags; uint32_t if_reqcap; uint32_t if_curcap; void *private; /* extra arguments */ }; /* * XXX change netmap_open so that it takes the name and size of * extra space needed, and allocates everything at once. */ int netmap_open(struct my_ring *me, int ringid, int promisc); int netmap_close(struct my_ring *me); int nm_do_ioctl(struct my_ring *me, int what, int subcmd); #endif /* _NM_UTIL_H */ ipfw-user/extra/netmap_io.c000644 000423 000000 00000017000 12007737406 016503 0ustar00luigiwheel000000 000000 /* * Glue code to implement netmap I/O for the userspace version of ipfw. 
*/ #include #ifdef _KERNEL #undef _KERNEL #endif /* these headers need to be compiled without _KERNEL */ //#include //#include //#define __NetBSD__ // XXX conflict in bpf_filter() between pcap.h and bpf.h //#include #ifdef free /* we are built in a pseudo-kernel env so malloc and free are redefined */ #undef free #undef malloc #endif #include "nm_util.h" int verbose; #include // #include //#include #include /* read() */ #include /* EINVAL */ #include /* M_NOWAIT */ #include /* mbuf */ #include // PFIL_IN #define _KERNEL /* args for ipfw */ #include #include /* * A packet comes from either a netmap slot on the source, * or from an mbuf that must be freed. * slot != NULL means a netmap slot, otherwise use buf. * len == 0 means an empty slot. */ struct txq { struct netmap_slot *slot; /* can be an mbuf */ #define TXQ_IS_SLOT 0xc555 #define TXQ_IS_MBUF 0xaacd uint16_t flags; /* 0 if slot, len if mbuf */ }; /* * the state associated to a netmap port: * (goes into the private field of my_ring) * XXX have an ifp at the beginning so we can use rcvif to store it. */ #define MY_TXQ_LEN 32 struct my_netmap_port { struct ifnet ifp; /* contains if_xname */ struct my_ring me; struct my_netmap_port *peer; /* peer port */ struct sess *sess; /* my session */ u_int cur_txq; /* next txq slot to use for tx */ struct txq q[MY_TXQ_LEN]; /* followed by ifname */ }; /* * txq[] has a batch of n packets that possibly need to be forwarded. 
*/ int netmap_fwd(struct my_netmap_port *port) { u_int si, i = 0; const u_int n = port->cur_txq; struct txq *x = port->q; int retry = 5; /* max retries */ if (n == 0) { D("nothing to forward to %s", port->ifp.if_xname); return 0; } again: /* scan all rings */ for (si = port->me.begin; i < n && si < port->me.end; si++) { u_int tmp; struct netmap_ring *ring = NETMAP_TXRING(port->me.nifp, si); prefetch(ring); ND("ring has %d pkts", ring->avail); if (ring->avail == 0) continue; for (; i < n && ring->avail > 0; i++) { struct netmap_slot *dst, *src; dst = &ring->slot[ring->cur]; if (x[i].flags == TXQ_IS_SLOT) { src = x[i].slot; // XXX swap buffers ND("pkt %d len %d", i, src->len); dst->len = src->len; dst->flags = src->flags = NS_BUF_CHANGED; tmp = dst->buf_idx; dst->buf_idx = src->buf_idx; src->buf_idx = tmp; } else if (x[i].flags == TXQ_IS_MBUF) { struct mbuf *m = (void *)x[i].slot; ND("copy from mbuf"); dst->len = m->__m_extlen; pkt_copy(m->__m_extbuf, NETMAP_BUF(ring, dst->buf_idx), dst->len); FREE_PKT(m); } else { panic("bad slot"); } x[i].flags = 0; ring->cur = NETMAP_RING_NEXT(ring, ring->cur); ring->avail--; } } if (i < n) { if (retry-- > 0) { ioctl(port->me.fd, NIOCTXSYNC); goto again; } ND("%d buffers leftover", n - i); for (;i < n; i++) { if (x[i].flags == TXQ_IS_MBUF) { FREE_PKT((void *)x[i].slot); } } } port->cur_txq = 0; return 0; } void netmap_enqueue(struct mbuf *m, int proto) { struct my_netmap_port *peer = m->__m_peer; struct txq *x; if (peer == NULL) { D("error missing peer in %p", m); FREE_PKT(m); } ND("start with %d packets", peer->cur_txq); if (peer->cur_txq >= MY_TXQ_LEN) netmap_fwd(peer); x = peer->q + peer->cur_txq; x->slot = (void *)m; x->flags = TXQ_IS_MBUF; peer->cur_txq++; peer->sess->flags |= WANT_RUN; ND("end, queued %d on %s", peer->cur_txq, peer->ifname); } /* * Read packets from a port, invoke the firewall and possibly * pass them to the peer. * The firewall receives a fake mbuf on the stack that refers * to the netmap slot. 
In this case the mbuf has two extra fields, * indicating the original buffer and length (buf = NULL if no need * to copy). * We also need to pass the pointer to a peer, though we can use ifp for that. * If the result is accept, no need to copy * and we can just pass the slot to the destination interface. * Otherwise, we need to do an explicit copy. */ int netmap_read(struct sess *sess, void *arg) { struct my_netmap_port *port = arg; u_int si; struct mbuf dm, dm0; struct ip_fw_args args; struct my_netmap_port *peer = port->peer; struct txq *x = peer->q; bzero(&dm0, sizeof(dm0)); bzero(&args, sizeof(args)); /* scan all rings */ for (si = port->me.begin; si < port->me.end; si++) { struct netmap_ring *ring = NETMAP_RXRING(port->me.nifp, si); prefetch(ring); ND("ring has %d pkts", ring->avail); if (ring->avail == 0) continue; prefetch(&ring->slot[ring->cur]); while (ring->avail > 0) { u_int dst, src, idx, len; struct netmap_slot *slot; void *buf; dst = peer->cur_txq; if (dst >= MY_TXQ_LEN) { netmap_fwd(peer); continue; } src = ring->cur; slot = &ring->slot[src]; prefetch (slot+1); idx = slot->buf_idx; buf = (u_char *)NETMAP_BUF(ring, idx); if (idx < 2) { D("%s bogus RX index at offset %d", port->me.nifp->ni_name, src); sleep(2); } prefetch(buf); ring->cur = NETMAP_RING_NEXT(ring, src); ring->avail--; /* prepare to invoke the firewall */ dm = dm0; // XXX clear all including tags args.m = &dm; len = slot->len; dm.m_flags = M_STACK; // remember original buf and peer dm.__m_extbuf = buf; dm.__m_extlen = len; dm.__m_peer = peer; dm.__m_callback = netmap_enqueue; dm.m_pkthdr.rcvif = &port->ifp; dm.m_data = buf + 14; // skip mac dm.m_len = dm.m_pkthdr.len = len - 14; ND("slot %d len %d", i, dm.m_len); // XXX ipfw_chk is slightly faster //ret = ipfw_chk(&args); ipfw_check_hook(NULL, &args.m, NULL, PFIL_IN, NULL); if (args.m != NULL) { // ok. 
forward /* * XXX TODO remember to clean up any tags that * ipfw may have allocated */ x[dst].slot = slot; x[dst].flags = TXQ_IS_SLOT; peer->cur_txq++; } ND("exit at slot %d", next_i); } } if (peer->cur_txq > 0) netmap_fwd(peer); if (port->cur_txq > 0) // WANT_RUN netmap_fwd(port); ND("done"); return 0; } /* * add a netmap port. We add them in pairs, so forwarding occurs * between two of them. */ void netmap_add_port(const char *dev) { static struct sess *s1 = NULL; // XXX stateful struct my_netmap_port *port; int l; struct sess *s2; D("opening netmap device %s", dev); l = strlen(dev) + 1; if (l >= IFNAMSIZ) { D("name %s too long, max %d", dev, IFNAMSIZ - 1); sleep(2); return; } port = calloc(1, sizeof(*port)); port->me.ifname = port->ifp.if_xname; strcpy(port->ifp.if_xname, dev); if (netmap_open(&port->me, 0, 0 /* promisc */)) { D("error opening %s", dev); kern_free(port); // XXX compat return; } s2 = new_session(port->me.fd, netmap_read, port, WANT_READ); port->sess = s2; D("create sess %p my_netmap_port %p", s2, port); if (s1 == NULL) { /* first of a pair */ s1 = s2; } else { /* second of a pair, cross link */ struct my_netmap_port *peer = s1->arg; port->peer = peer; peer->peer = port; D("%p %s <-> %p %s", port, port->ifp.if_xname, peer, peer->ifp.if_xname); s1 = NULL; } } ipfw-user/extra/sys/contrib/000755 000423 000000 00000000000 12006744005 016634 5ustar00luigiwheel000000 000000 ipfw-user/extra/sys/sys/000755 000423 000000 00000000000 12007737722 016023 5ustar00luigiwheel000000 000000 ipfw-user/extra/sys/sys/mbuf.h000644 000423 000000 00000021370 12007737461 017130 0ustar00luigiwheel000000 000000 /* * Copyright (C) 2012 Luigi Rizzo, Universita` di Pisa * * BSD copyright. 
* * A simple compatibility interface to map mbufs onto userspace structs */ #ifndef _SYS_MBUF_H_ #define _SYS_MBUF_H_ #define VM_UMA_H // kill this one // #include /* we use free() */ /* hopefully queue.h is already included by someone else */ #include #ifdef _KERNEL /* bzero not present on linux, but this should go in glue.h */ // #define bzero(s, n) memset(s, 0, n) /* * We implement a very simplified UMA allocator where the backend * is simply malloc, and uma_zone only stores the length of the components. */ typedef int uma_zone_t; /* the zone size */ #define uma_zcreate(name, len, _3, _4, _5, _6, _7, _8) (len) typedef int (*uma_init)(void *mem, int size, int flags); typedef void (*uma_fini)(void *mem, int size); #define uma_zfree(zone, item) free(item, M_IPFW) #define uma_zalloc(zone, flags) malloc(zone, M_IPFW, flags) #define uma_zdestroy(zone) do {} while (0) /*- * Macros for type conversion: * mtod(m, t) -- Convert mbuf pointer to data pointer of correct type. */ #define mtod(m, t) ((t)((m)->m_data)) #endif /* _KERNEL */ /* * Packet tag structure (see below for details). */ struct m_tag { SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */ u_int16_t m_tag_id; /* Tag ID */ u_int16_t m_tag_len; /* Length of data */ u_int32_t m_tag_cookie; /* ABI/Module ID */ // void (*m_tag_free)(struct m_tag *); }; /* * Auxiliary structure to store values from the sk_buf. * Note that we should not alter the sk_buff, and if we do * so make sure to keep the values in sync between the mbuf * and the sk_buff (especially m_len and m_pkthdr.len). */ struct skbuf; struct mbuf { struct mbuf *m_next; struct mbuf *m_nextpkt; void *m_data; int m_len; /* length in this mbuf */ int m_flags; struct { struct ifnet *rcvif; int len; /* total packet len */ SLIST_HEAD (packet_tags, m_tag) tags; } m_pkthdr; struct skbuf *m_skb; /* * in-stack mbuffers point to an external buffer, * the two variables below contain base and size, * and have M_STACK set in m_flags. 
* Buffers from the heap have __m_extbuf = (char *)m + MSIZE */ void *__m_extbuf; /* external buffer base */ int __m_extlen; /* data in ext buffer */ void (*__m_callback)(struct mbuf *, int); void *__m_peer; /* argument attached to the mbuf */ }; /* * note we also have M_FASTFWD_OURS mapped to M_PROTO1 0x10 */ #define M_SKIP_FIREWALL 0x01 /* skip firewall processing */ #define M_BCAST 0x02 /* send/received as link-level broadcast */ #define M_MCAST 0x04 /* send/received as link-level multicast */ #define M_STACK 0x1000 /* allocated on the stack */ void m_freem(struct mbuf *m); #ifdef _KERNEL #define M_DONTWAIT M_NOWAIT /* should not be here... */ /* * m_dup() is used in the TEE case, currently unsupported so we * just return. */ static __inline struct mbuf *m_dup(struct mbuf *m, int n) { (void)m; /* UNUSED */ (void)n; /* UNUSED */ D("unimplemented, expect panic"); return NULL; }; static __inline void m_tag_prepend(struct mbuf *m, struct m_tag *t) { ND("m %p tag %p", m, t); SLIST_INSERT_HEAD(&m->m_pkthdr.tags, t, m_tag_link); } /* * Return the next tag in the list of tags associated with an mbuf. 
*/ static __inline struct m_tag * m_tag_next(struct mbuf *m, struct m_tag *t) { D("mbuf %p tag %p", m, t); return (SLIST_NEXT(t, m_tag_link)); } extern SLIST_HEAD (tags_freelist, m_tag) tags_freelist; extern int tags_minlen; extern int tags_freelist_count; /* * Create an mtag of the given type */ static __inline struct m_tag * m_tag_alloc(uint32_t cookie, int type, int length, int wait) { static int maxlen = 0; int l = length + sizeof(struct m_tag); struct m_tag *m = NULL; if (l > maxlen) { D("new maxlen %d (%d)", l, length ); maxlen = l; } if (l <= tags_minlen) { l = tags_minlen; m = SLIST_FIRST(&tags_freelist); } if (m) { SLIST_REMOVE_HEAD(&tags_freelist, m_tag_link); ND("allocate from freelist"); tags_freelist_count--; } else { ND("size %d allocate from malloc", l); m = malloc(l, 0, M_NOWAIT); } if (m) { bzero(m, l); m->m_tag_id = type; m->m_tag_len = length; m->m_tag_cookie = cookie; ND("tag %p cookie %d type %d", m, cookie, type); } return m; }; #define MTAG_ABI_COMPAT 0 /* compatibility ABI */ static __inline struct m_tag * m_tag_get(int type, int length, int wait) { return m_tag_alloc(MTAG_ABI_COMPAT, type, length, wait); } static __inline struct m_tag * m_tag_first(struct mbuf *m) { struct m_tag *t; t = SLIST_FIRST(&m->m_pkthdr.tags); ND("mbuf %p has %p", m, t); return t; }; static __inline void m_tag_delete(struct mbuf *m, struct m_tag *t) { D("mbuf %p tag %p, ******* unimplemented", m, t); }; static __inline struct m_tag * m_tag_locate(struct mbuf *m, u_int32_t cookie, int x, struct m_tag *t) { struct m_tag *tag; ND("search %d %d in mbuf %p at %p", cookie, x, m, t); if (t) D("--- XXX ignore non-null t %p", t); tag = SLIST_FIRST(&m->m_pkthdr.tags); if (tag == NULL) return NULL; ND("found tag %p cookie %d type %d (want %d %d)", tag, tag->m_tag_cookie, tag->m_tag_id, cookie, x); if (tag->m_tag_cookie != cookie || tag->m_tag_id != x) { ND("want %d %d have %d %d, expect panic", cookie, x, tag->m_tag_cookie, tag->m_tag_id); return NULL; } else return tag; }; 
static __inline struct m_tag * m_tag_find(struct mbuf *m, int type, struct m_tag *start) { D("m %p", m); return (SLIST_EMPTY(&m->m_pkthdr.tags) ? (struct m_tag *)NULL : m_tag_locate(m, MTAG_ABI_COMPAT, type, start)); }; #define M_SETFIB(_m, _fib) /* nothing on linux */ /* m_pullup is not supported, there is a macro in missing.h */ #define M_GETFIB(_m) 0 /* macro used to create a new mbuf */ #define MT_DATA 1 /* dynamic (data) allocation */ #define MSIZE 256 /* size of an mbuf */ #define MGETHDR(_m, _how, _type) ((_m) = m_gethdr((_how), (_type))) #define MY_MCLBYTES 2048 /* XXX make slightly less */ extern struct mbuf *mbuf_freelist; /* allocate and init a new mbuf using the same structure of FreeBSD */ /* * XXX for the userspace version, we actually allocate * MCLBYTES right after the buffer to store a copy of the packet. */ static __inline struct mbuf * m_gethdr(int how, short type) { struct mbuf *m; static const struct mbuf m0; /* zero-initialized */ if (mbuf_freelist) { m = mbuf_freelist; mbuf_freelist = m->m_next; *m = m0; } else { m = malloc(MY_MCLBYTES, M_IPFW, M_NOWAIT); } ND("new mbuf %p", m); if (m == NULL) { panic("mgethdr failed"); return m; } /* here we have MSIZE - sizeof(struct mbuf) available */ m->m_data = m + 1; m->__m_extbuf = (char *)m + MSIZE; m->__m_extlen = MY_MCLBYTES - MSIZE; return m; } /* * Persistent tags stay with an mbuf until the mbuf is reclaimed. Otherwise * tags are expected to ``vanish'' when they pass through a network * interface. For most interfaces this happens normally as the tags are * reclaimed when the mbuf is free'd. However in some special cases * reclaiming must be done manually. An example is packets that pass through * the loopback interface. Also, one must be careful to do this when * ``turning around'' packets (e.g., icmp_reflect). * * To mark a tag persistent bit-or this flag in when defining the tag id. * The tag will then be treated as described above. 
*/ #define MTAG_PERSISTENT 0x800 #define PACKET_TAG_NONE 0 /* Nadda */ /* Packet tags for use with PACKET_ABI_COMPAT. */ #define PACKET_TAG_IPSEC_IN_DONE 1 /* IPsec applied, in */ #define PACKET_TAG_IPSEC_OUT_DONE 2 /* IPsec applied, out */ #define PACKET_TAG_IPSEC_IN_CRYPTO_DONE 3 /* NIC IPsec crypto done */ #define PACKET_TAG_IPSEC_OUT_CRYPTO_NEEDED 4 /* NIC IPsec crypto req'ed */ #define PACKET_TAG_IPSEC_IN_COULD_DO_CRYPTO 5 /* NIC notifies IPsec */ #define PACKET_TAG_IPSEC_PENDING_TDB 6 /* Reminder to do IPsec */ #define PACKET_TAG_BRIDGE 7 /* Bridge processing done */ #define PACKET_TAG_GIF 8 /* GIF processing done */ #define PACKET_TAG_GRE 9 /* GRE processing done */ #define PACKET_TAG_IN_PACKET_CHECKSUM 10 /* NIC checksumming done */ #define PACKET_TAG_ENCAP 11 /* Encap. processing */ #define PACKET_TAG_IPSEC_SOCKET 12 /* IPSEC socket ref */ #define PACKET_TAG_IPSEC_HISTORY 13 /* IPSEC history */ #define PACKET_TAG_IPV6_INPUT 14 /* IPV6 input processing */ #define PACKET_TAG_DUMMYNET 15 /* dummynet info */ #define PACKET_TAG_DIVERT 17 /* divert info */ #define PACKET_TAG_IPFORWARD 18 /* ipforward info */ #define PACKET_TAG_MACLABEL (19 | MTAG_PERSISTENT) /* MAC label */ #define PACKET_TAG_PF 21 /* PF + ALTQ information */ #define PACKET_TAG_RTSOCKFAM 25 /* rtsock sa family */ #define PACKET_TAG_IPOPTIONS 27 /* Saved IP options */ #define PACKET_TAG_CARP 28 /* CARP info */ #endif /* _KERNEL */ #endif /* !_SYS_MBUF_H_ */ ipfw-user/extra/sys/sys/systm.h000644 000423 000000 00000011135 12006744005 017343 0ustar00luigiwheel000000 000000 #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #define CALLOUT_ACTIVE 0x0002 /* callout is currently active */ #define CALLOUT_MPSAFE 0x0008 /* callout handler is mp safe */ #if defined(USERSPACE) // freebsd userspace #include #ifdef __FreeBSD__ #include #endif /// SLIST_HEAD(callout_list, callout); struct callout; TAILQ_HEAD(callout_tailq, callout); struct callout { union { //SLIST_ENTRY(callout) sle; TAILQ_ENTRY(callout) tqe; } 
c_links; int c_time; /* ticks to the event */ void *c_arg; /* function argument */ void (*c_func)(void *); /* function to call */ struct lock_object *c_lock; /* lock to handle */ int c_flags; /* state of this entry */ volatile int c_cpu; /* CPU we're scheduled on */ }; int callout_drain(struct callout *c); void callout_init(struct callout *c, int safe); int callout_reset(struct callout *c, int ticks, void (*fn)(void *), void *arg); int callout_reset_on(struct callout *c, int ticks, void (*fn)(void *), void *arg, int cpu); #else /* linux or windows */ #ifndef _WIN32 /* this is the linux version */ /* callout support, in on FreeBSD */ /* * callout support on linux module is done using timers */ #include #ifdef LINUX_24 #include /* jiffies definition is here in 2.4 */ #endif #define callout timer_list static __inline int callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) { co->expires = jiffies + ticks; co->function = (void (*)(unsigned long))fn; co->data = (unsigned long)arg; /* * Linux 2.6.31 and above has add_timer_on(co, cpu), * otherwise add_timer() always schedules a callout on the same * CPU used the first time, so we don't need more. 
*/ add_timer(co); return 0; } #define callout_init(co, safe) init_timer(co) #define callout_drain(co) del_timer(co) #define callout_stop(co) del_timer(co) #else /* _WIN32 */ #include /* This is the windows part for callout support */ struct callout { KTIMER thetimer; KDPC timerdpc; int dpcinitialized; LARGE_INTEGER duetime; }; void dummynet (void*); VOID dummynet_dpc( __in struct _KDPC *Dpc, __in_opt PVOID DeferredContext, __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2 ); VOID ipfw_dpc( __in struct _KDPC *Dpc, __in_opt PVOID DeferredContext, __in_opt PVOID SystemArgument1, __in_opt PVOID SystemArgument2 ); /* callout_reset must handle two problems: * - dummynet() scheduler must be run always on the same processor * because do_gettimeofday() is based on cpu performance counter, and * _occasionally_ can leap backward in time if we query another cpu. * typically this won't happen that much, and the cpu will almost always * be the same even without the affinity restriction, but better to be sure. * - ipfw_tick() does not have the granularity requirements of dummynet() * but we need to pass a pointer as argument. * * for these reasons, if we are called for dummynet() timer, * KeInitializeDpc is called only once as it should be, and the thread * is forced on cpu0 (which is always present), while if we're called * for ipfw_tick(), we re-initialize the DPC each time, using * parameter DeferredContext to pass the needed pointer. since this * timer is called only once a sec, this won't hurt that much. 
*/ static __inline int callout_reset_on(struct callout *co, int ticks, void (*fn)(void *), void *arg, int cpu) { if(fn == &dummynet) { if(co->dpcinitialized == 0) { KeInitializeDpc(&co->timerdpc, dummynet_dpc, NULL); KeSetTargetProcessorDpc(&co->timerdpc, cpu); co->dpcinitialized = 1; } } else { KeInitializeDpc(&co->timerdpc, ipfw_dpc, arg); } co->duetime.QuadPart = (-ticks)*10000; KeSetTimer(&co->thetimer, co->duetime, &co->timerdpc); return 0; } static __inline void callout_init(struct callout* co, int safe) { printf("%s: initializing timer at %p\n",__FUNCTION__,co); KeInitializeTimer(&co->thetimer); } static __inline int callout_drain(struct callout* co) { BOOLEAN canceled = KeCancelTimer(&co->thetimer); while (canceled != TRUE) { canceled = KeCancelTimer(&co->thetimer); } printf("%s: stopping timer at %p\n",__FUNCTION__,co); return 0; } static __inline int callout_stop(struct callout* co) { return callout_drain(co); } #endif /* _WIN32 */ #endif /* linux or windows */ #endif /* _SYS_SYSTM_H_ */ ipfw-user/extra/sys/sys/kernel.h000644 000423 000000 00000001376 12006744005 017452 0ustar00luigiwheel000000 000000 /* * from freebsd's kernel.h */ #ifndef _SYS_KERNEL_H_ #define _SYS_KERNEL_H_ #define SYSINIT(a, b, c, d, e) \ void *sysinit_ ## d = d #define VNET_SYSINIT(a, b, c, d, e) \ void *sysinit_ ## d = d #define SYSUNINIT(a, b, c, d, e) \ void *sysuninit_ ## d = d #define VNET_SYSUNINIT(a, b, c, d, e) \ void *sysuninit_ ## d = d /* * Some enumerated orders; "ANY" sorts last. 
*/ enum sysinit_elem_order { SI_ORDER_FIRST = 0x0000000, /* first*/ SI_ORDER_SECOND = 0x0000001, /* second*/ SI_ORDER_THIRD = 0x0000002, /* third*/ SI_ORDER_MIDDLE = 0x1000000, /* somewhere in the middle */ SI_ORDER_ANY = 0xfffffff /* last*/ }; #endif ipfw-user/extra/sys/sys/module.h000644 000423 000000 00000002037 12006744005 017452 0ustar00luigiwheel000000 000000 /* * trivial module support */ #ifndef _SYS_MODULE_H_ #define _SYS_MODULE_H_ typedef struct module *module_t; typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); typedef enum modeventtype { MOD_LOAD, MOD_UNLOAD, MOD_SHUTDOWN, MOD_QUIESCE } modeventtype_t; typedef struct moduledata { const char *name; /* module name */ modeventhand_t evhand; /* event handler */ void *priv; /* extra data */ } moduledata_t; /* * Hook the module descriptor, md, into our list of things to do. * We should in principle respect the order of loading. * * XXX use the gcc .init functions */ #define DECLARE_MODULE(a, md, c,d) \ moduledata_t *moddesc_##a = &md; /* * XXX MODULE_VERSION is define in linux too */ #define MODULE_DEPEND(a,b,c,d,e) #if 1 // !defined(__FreeBSD__) // defined( __linux__ ) || defined( _WIN32 ) #undef MODULE_VERSION #define MODULE_VERSION(a,b) #endif #endif /* _SYS_MODULE_H_ */ ipfw-user/extra/sys/sys/malloc.h000644 000423 000000 00000000521 12006744005 017430 0ustar00luigiwheel000000 000000 /* * $Id$ * replacement for sys/malloc.h to compile kernel in userspace */ #ifndef _SYS_MALLOC_H_ #define _SYS_MALLOC_H_ #define M_WAITOK 0x0000 /* can block */ #define M_NOWAIT 0x0001 /* do not block */ #define M_ZERO 0x0100 /* bzero the allocation */ #endif /* _SYS_MALLOC_H_ */ ipfw-user/extra/sys/sys/taskqueue.h000644 000423 000000 00000002670 12006744005 020177 0ustar00luigiwheel000000 000000 #ifndef _SYS_TASKQUEUE_H_ #define _SYS_TASKQUEUE_H_ /* * Remap taskqueue to direct calls */ #ifdef _WIN32 struct task { void (*func)(void*, int); }; #define taskqueue_enqueue(tq, ta) (ta)->func(NULL,1) 
#define TASK_INIT(a,b,c,d) do { \ (a)->func = (c); } while (0) #else struct task { void (*func)(void); }; #define taskqueue_enqueue(tq, ta) (ta)->func() #define TASK_INIT(a,b,c,d) do { \ (a)->func = (void (*)(void))c; } while (0) #endif typedef void (*taskqueue_enqueue_fn)(void *context); // #define taskqueue_create(_a, _b, _c, _d) NULL struct taskqueue *taskqueue_create(const char *name, int mflags, taskqueue_enqueue_fn enqueue, void *context); void taskqueue_thread_enqueue(void *context); // #define taskqueue_create_fast(_a, _b, _c, _d) NULL int taskqueue_start_threads(struct taskqueue **tqp, int count, int pri, const char *name, ...) __printflike(4, 5); // #define taskqueue_drain(_a, _b) /* XXX to be completed */ // #define taskqueue_free(_a) /* XXX to be completed */ void taskqueue_drain(struct taskqueue *queue, struct task *task); void taskqueue_free(struct taskqueue *queue); #define PRI_MIN (0) /* Highest priority. */ #define PRI_MIN_ITHD (PRI_MIN) #ifndef __FreeBSD__ #define PI_NET (PRI_MIN_ITHD + 16) #endif #endif /* !_SYS_TASKQUEUE_H_ */ ipfw-user/extra/sys/contrib/pf/000755 000423 000000 00000000000 12006744005 017241 5ustar00luigiwheel000000 000000 ipfw-user/extra/sys/contrib/pf/net/000755 000423 000000 00000000000 12006744005 020027 5ustar00luigiwheel000000 000000 ipfw-user/extra/sys/contrib/pf/net/pfvar.h000644 000423 000000 00000001223 12006744005 021314 0ustar00luigiwheel000000 000000 /* * replacement for FreeBSD's pfqueue.h */ #include #define DIOCSTARTALTQ _IO ('D', 42) #define DIOCSTOPALTQ _IO ('D', 43) struct pf_altq { TAILQ_ENTRY(pf_altq) entries; /* ... 
*/ u_int32_t qid; /* return value */ #define PF_QNAME_SIZE 64 char qname[PF_QNAME_SIZE]; /* queue name */ }; struct pfioc_altq { u_int32_t action; u_int32_t ticket; u_int32_t nr; struct pf_altq altq; }; #define DIOCGETALTQS _IOWR('D', 47, struct pfioc_altq) #define DIOCGETALTQ _IOWR('D', 48, struct pfioc_altq) ipfw-user/sys/net/000755 000423 000000 00000000000 12006744005 014637 5ustar00luigiwheel000000 000000 ipfw-user/sys/netinet/000755 000423 000000 00000000000 12007452707 015525 5ustar00luigiwheel000000 000000 ipfw-user/sys/netgraph/000755 000423 000000 00000000000 12006744005 015661 5ustar00luigiwheel000000 000000 ipfw-user/sys/sys/000755 000423 000000 00000000000 12006744005 014667 5ustar00luigiwheel000000 000000 ipfw-user/sys/netgraph/ng_ipfw.h000644 000423 000000 00000003071 12006744005 017464 0ustar00luigiwheel000000 000000 /*- * Copyright 2005, Gleb Smirnoff * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: head/sys/netgraph/ng_ipfw.h 201124 2009-12-28 12:29:13Z luigi $ */ #ifndef _NG_IPFW_H #define _NG_IPFW_H #define NG_IPFW_NODE_TYPE "ipfw" #define NGM_IPFW_COOKIE 1105988990 #endif /* _NG_IPFW_H */ ipfw-user/sys/netinet/ipfw/000755 000423 000000 00000000000 12007737707 016500 5ustar00luigiwheel000000 000000 ipfw-user/sys/netinet/ip_dummynet.h000644 000423 000000 00000020500 12012141274 020213 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: head/sys/netinet/ip_dummynet.h 206845 2010-04-19 16:17:30Z luigi $ */ #ifndef _IP_DUMMYNET_H #define _IP_DUMMYNET_H /* * Definition of the kernel-userland API for dummynet. * * Setsockopt() and getsockopt() pass a batch of objects, each * of them starting with a "struct dn_id" which should fully identify * the object and its relation with others in the sequence. * The first object in each request should have * type= DN_CMD_*, id = DN_API_VERSION. * For other objects, type and subtype specify the object, len indicates * the total length including the header, and 'id' identifies the specific * object. * * Most objects are numbered with an identifier in the range 1..65535. * DN_MAX_ID indicates the first value outside the range. */ #define DN_API_VERSION 12500000 #define DN_MAX_ID 0x10000 struct dn_id { uint16_t len; /* total obj len including this header */ uint8_t type; uint8_t subtype; uint32_t id; /* generic id */ }; /* * These values are in the type field of struct dn_id. 
* To preserve the ABI, never rearrange the list or delete
 * entries with the exception of DN_LAST
 */
enum {
	/* object types, numbered sequentially from 0 */
	DN_NONE = 0,
	DN_LINK = 1,
	DN_FS,			/* 2 */
	DN_SCH,			/* 3 */
	DN_SCH_I,		/* 4 */
	DN_QUEUE,		/* 5 */
	DN_DELAY_LINE,		/* 6 */
	DN_PROFILE,		/* 7 */
	DN_FLOW,		/* 8, struct dn_flow */
	DN_TEXT,		/* 9, opaque text is the object */

	/* commands, numbered sequentially from 0x80 */
	DN_CMD_CONFIG = 0x80,	/* objects follow */
	DN_CMD_DELETE,		/* 0x81, subtype + list of entries */
	DN_CMD_GET,		/* 0x82, subtype + list of entries */
	DN_CMD_FLUSH,		/* 0x83 */
	/* for compatibility with FreeBSD 7.2/8 */
	DN_COMPAT_PIPE,		/* 0x84 */
	DN_COMPAT_QUEUE,	/* 0x85 */
	DN_GET_COMPAT,		/* 0x86 */

	/* special commands for emulation of sysctl variables */
	DN_SYSCTL_GET,		/* 0x87 */
	DN_SYSCTL_SET,		/* 0x88 */

	DN_LAST,		/* 0x89, end marker; moves when entries are added */
};

enum { /* subtype for schedulers, flowset and the like */
	DN_SCHED_UNKNOWN = 0,
	DN_SCHED_FIFO = 1,
	DN_SCHED_WF2QP = 2,
	/* others are in individual modules */
};

enum {	/* user flags */
	DN_HAVE_MASK	= 0x0001,	/* fs or sched has a mask */
	DN_NOERROR	= 0x0002,	/* do not report errors */
	DN_QHT_HASH	= 0x0004,	/* qht is a hash table */
	DN_QSIZE_BYTES	= 0x0008,	/* queue size is in bytes */
	DN_HAS_PROFILE	= 0x0010,	/* a link has a profile */
	DN_IS_RED	= 0x0020,
	DN_IS_GENTLE_RED= 0x0040,
	DN_PIPE_CMD	= 0x1000,	/* pipe config... */
};

/*
 * link template.
 */
struct dn_link {
	struct dn_id oid;	/* common object header */

	/*
	 * Userland sets bw and delay in bits/s and milliseconds.
	 * The kernel converts this back and forth to bits/tick and ticks.
	 * XXX what about burst ?
	 */
	int32_t		link_nr;
	int		bandwidth;	/* bit/s or bits/tick.   */
	int		delay;		/* ms and ticks */
	uint64_t	burst;		/* scaled. bits*Hz  XXX */
};

/*
 * A flowset, which is a template for flows. Contains parameters
 * from the command line: id, target scheduler, queue sizes, plr,
 * flow masks, buckets for the flow hash, and possibly scheduler-
 * specific parameters (weight, quantum and so on).
*/
struct dn_fs {
	struct dn_id oid;	/* common object header */
	uint32_t fs_nr;		/* the flowset number */
	uint32_t flags;		/* userland flags */
	int qsize;		/* queue size in slots or bytes */
	int32_t plr;		/* PLR, pkt loss rate (2^31-1 means 100%) */
	uint32_t buckets;	/* buckets used for the queue hash table */

	struct ipfw_flow_id flow_mask;
	uint32_t sched_nr;	/* the scheduler we attach to */
	/* generic scheduler parameters. Leave them at -1 if unset.
	 * Now we use 0: weight, 1: lmax, 2: priority
	 */
	int par[4];

	/* RED/GRED parameters.
	 * weight and probabilities are in the range 0..1 represented
	 * in fixed point arithmetic with SCALE_RED decimal bits.
	 *
	 * The macros below are ordinary file-scope macros, although
	 * they are defined in the middle of the structure.
	 */
#define SCALE_RED	16
#define SCALE(x)	( (x) << SCALE_RED )		/* to fixed point */
#define SCALE_VAL(x)	( (x) >> SCALE_RED )		/* integer part */
#define SCALE_MUL(x,y)	( ( (x) * (y) ) >> SCALE_RED )	/* fixed point product */
	int w_q ;		/* queue weight (scaled) */
	int max_th ;		/* maximum threshold for queue (scaled) */
	int min_th ;		/* minimum threshold for queue (scaled) */
	int max_p ;		/* maximum value for p_b (scaled) */

};

/*
 * dn_flow collects flow_id and stats for queues and scheduler
 * instances, and is used to pass these info to userland.
 * oid.type/oid.subtype describe the object, oid.id is number
 * of the parent object.
 */
struct dn_flow {
	struct dn_id	oid;	/* common object header */
	struct ipfw_flow_id fid;
	uint64_t	tot_pkts; /* statistics counters  */
	uint64_t	tot_bytes;
	uint32_t	length; /* Queue length, in packets */
	uint32_t	len_bytes; /* Queue length, in bytes */
	uint32_t	drops;
};


/*
 * Scheduler template, mostly indicating the name, number,
 * sched_mask and buckets.
 */
struct dn_sch {
	struct dn_id	oid;	/* common object header */
	uint32_t	sched_nr; /* N, scheduler number */
	uint32_t	buckets; /* number of buckets for the instances */
	uint32_t	flags;	/* have_mask, ... */

	char name[16];	/* null terminated */
	/* mask to select the appropriate scheduler instance */
	struct ipfw_flow_id sched_mask; /* M */
};


/* A delay profile is attached to a link.
* Note that a profile, as any other object, cannot be longer than 2^16 */ #define ED_MAX_SAMPLES_NO 1024 struct dn_profile { struct dn_id oid; /* fields to simulate a delay profile */ #define ED_MAX_NAME_LEN 32 char name[ED_MAX_NAME_LEN]; int link_nr; int loss_level; int bandwidth; // XXX use link bandwidth? int samples_no; /* actual len of samples[] */ int samples[ED_MAX_SAMPLES_NO]; /* may be shorter */ }; /* * Overall structure of dummynet In dummynet, packets are selected with the firewall rules, and passed to two different objects: PIPE or QUEUE (bad name). A QUEUE defines a classifier, which groups packets into flows according to a 'mask', puts them into independent queues (one per flow) with configurable size and queue management policy, and passes flows to a scheduler: (flow_mask|sched_mask) sched_mask +---------+ weight Wx +-------------+ | |->-[flow]-->--| |-+ -->--| QUEUE x | ... | | | | |->-[flow]-->--| SCHEDuler N | | +---------+ | | | ... | +--[LINK N]-->-- +---------+ weight Wy | | +--[LINK N]-->-- | |->-[flow]-->--| | | -->--| QUEUE y | ... | | | | |->-[flow]-->--| | | +---------+ +-------------+ | +-------------+ Many QUEUE objects can connect to the same scheduler, each QUEUE object can have its own set of parameters. In turn, the SCHEDuler 'forks' multiple instances according to a 'sched_mask', each instance manages its own set of queues and transmits on a private instance of a configurable LINK. A PIPE is a simplified version of the above, where there is no flow_mask, and each scheduler instance handles a single queue. 
The following data structures (visible from userland) describe the objects used by dummynet: + dn_link, contains the main configuration parameters related to delay and bandwidth; + dn_profile describes a delay profile; + dn_flow describes the flow status (flow id, statistics) + dn_sch describes a scheduler + dn_fs describes a flowset (msk, weight, queue parameters) * */ #endif /* _IP_DUMMYNET_H */ ipfw-user/sys/netinet/in_cksum.c000644 000423 000000 00000010210 12007435564 017475 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1988, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD: head/sys/netinet/in_cksum.c 238941 2012-07-31 08:04:49Z luigi $"); #include #include /* * Checksum routine for Internet Protocol family headers (Portable Version). * * This routine is very heavily used in the network * code and should be modified for each CPU to be as fast as possible. */ #define ADDCARRY(x) (x > 65535 ? x -= 65535 : x) #define REDUCE {l_util.l = sum; sum = l_util.s[0] + l_util.s[1]; ADDCARRY(sum);} int in_cksum(struct mbuf *m, int len) { register u_short *w; register int sum = 0; register int mlen = 0; int byte_swapped = 0; union { char c[2]; u_short s; } s_util; union { u_short s[2]; long l; } l_util; for (;m && len; m = m->m_next) { if (m->m_len == 0) continue; w = mtod(m, u_short *); if (mlen == -1) { /* * The first byte of this mbuf is the continuation * of a word spanning between this mbuf and the * last mbuf. * * s_util.c[0] is already saved when scanning previous * mbuf. */ s_util.c[1] = *(char *)w; sum += s_util.s; w = (u_short *)((char *)w + 1); mlen = m->m_len - 1; len--; } else mlen = m->m_len; if (len < mlen) mlen = len; len -= mlen; /* * Force to even boundary. */ if ((1 & (uintptr_t) w) && (mlen > 0)) { REDUCE; sum <<= 8; s_util.c[0] = *(u_char *)w; w = (u_short *)((char *)w + 1); mlen--; byte_swapped = 1; } /* * Unroll the loop to make overhead from * branches &c small. 
*/ while ((mlen -= 32) >= 0) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; sum += w[4]; sum += w[5]; sum += w[6]; sum += w[7]; sum += w[8]; sum += w[9]; sum += w[10]; sum += w[11]; sum += w[12]; sum += w[13]; sum += w[14]; sum += w[15]; w += 16; } mlen += 32; while ((mlen -= 8) >= 0) { sum += w[0]; sum += w[1]; sum += w[2]; sum += w[3]; w += 4; } mlen += 8; if (mlen == 0 && byte_swapped == 0) continue; REDUCE; while ((mlen -= 2) >= 0) { sum += *w++; } if (byte_swapped) { REDUCE; sum <<= 8; byte_swapped = 0; if (mlen == -1) { s_util.c[1] = *(char *)w; sum += s_util.s; mlen = 0; } else mlen = -1; } else if (mlen == -1) s_util.c[0] = *(char *)w; } if (len) printf("cksum: out of data\n"); if (mlen == -1) { /* The last mbuf has odd # of bytes. Follow the standard (the odd byte may be shifted left by 8 bits or not as determined by endian-ness of the machine) */ s_util.c[1] = 0; sum += s_util.s; } REDUCE; return (~sum & 0xffff); } ipfw-user/sys/netinet/ip_fw.h000644 000423 000000 00000046474 12007435564 017023 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/netinet/ip_fw.h 234946 2012-05-03 08:56:43Z melifaro $
 */

#ifndef _IPFW2_H
#define _IPFW2_H

/*
 * The default rule number.  By the design of ip_fw, the default rule
 * is the last one, so its number can also serve as the highest number
 * allowed for a rule.  The ip_fw code relies on both meanings of this
 * constant.
 */
#define	IPFW_DEFAULT_RULE	65535

/*
 * Maximum and default number of ipfw tables.
 */
#define	IPFW_TABLES_MAX		65535
#define	IPFW_TABLES_DEFAULT	128

/*
 * Most commands (queue, pipe, tag, untag, limit...) can have a 16-bit
 * argument between 1 and 65534. The value 0 is unused, the value
 * 65535 (IP_FW_TABLEARG) is used to represent 'tablearg', i.e. the
 * result of the most recent table() lookup.
 * Note that 16bit is only a historical limit, resulting from
 * the use of a 16-bit fields for that value. In reality, we can have
 * 2^32 pipes, queues, tag values and so on, and use 0 as a tablearg.
 */
#define	IPFW_ARG_MIN		1
#define	IPFW_ARG_MAX		65534
#define IP_FW_TABLEARG		65535	/* XXX should use 0 */

/*
 * Number of entries in the call stack of the call/return commands.
 * Call stack currently is an uint16_t array with rule numbers.
*/
#define	IPFW_CALLSTACK_SIZE	16

/* IP_FW3 header/opcodes */
typedef struct _ip_fw3_opheader {
	uint16_t opcode;	/* Operation opcode */
	uint16_t reserved[3];	/* Align to 64-bit boundary */
} ip_fw3_opheader;


/* IPFW extended tables support */
#define	IP_FW_TABLE_XADD	86	/* add entry */
#define	IP_FW_TABLE_XDEL	87	/* delete entry */
#define	IP_FW_TABLE_XGETSIZE	88	/* get table size */
#define	IP_FW_TABLE_XLIST	89	/* list table contents */

/*
 * The kernel representation of ipfw rules is made of a list of
 * 'instructions' (for all practical purposes equivalent to BPF
 * instructions), which specify which fields of the packet
 * (or its metadata) should be analysed.
 *
 * Each instruction is stored in a structure which begins with
 * "ipfw_insn", and can contain extra fields depending on the
 * instruction type (listed below).
 * Note that the code is written so that individual instructions
 * have a size which is a multiple of 32 bits. This means that, if
 * such structures contain pointers or other 64-bit entities,
 * (there is just one instance now) they may end up unaligned on
 * 64-bit architectures, so they must be handled with care.
 *
 * "enum ipfw_opcodes" are the opcodes supported. We can have up
 * to 256 different opcodes. When adding new opcodes, they should
 * be appended to the end of the opcode list before O_LAST_OPCODE,
 * this will prevent the ABI from being broken, otherwise users
 * will have to recompile ipfw(8) when they update the kernel.
*/

/*
 * The enumerators carry no explicit values, so each opcode number is
 * given by its position in the list (O_NOP is 0).
 */
enum ipfw_opcodes {		/* arguments (4 byte each)	*/
	O_NOP,

	O_IP_SRC,		/* u32 = IP			*/
	O_IP_SRC_MASK,		/* ip = IP/mask			*/
	O_IP_SRC_ME,		/* none				*/
	O_IP_SRC_SET,		/* u32=base, arg1=len, bitmap	*/

	O_IP_DST,		/* u32 = IP			*/
	O_IP_DST_MASK,		/* ip = IP/mask			*/
	O_IP_DST_ME,		/* none				*/
	O_IP_DST_SET,		/* u32=base, arg1=len, bitmap	*/

	O_IP_SRCPORT,		/* (n)port list:mask 4 byte ea	*/
	O_IP_DSTPORT,		/* (n)port list:mask 4 byte ea	*/
	O_PROTO,		/* arg1=protocol		*/

	O_MACADDR2,		/* 2 mac addr:mask		*/
	O_MAC_TYPE,		/* same as srcport		*/

	O_LAYER2,		/* none				*/
	O_IN,			/* none				*/
	O_FRAG,			/* none				*/

	O_RECV,			/* none				*/
	O_XMIT,			/* none				*/
	O_VIA,			/* none				*/

	O_IPOPT,		/* arg1 = 2*u8 bitmap		*/
	O_IPLEN,		/* arg1 = len			*/
	O_IPID,			/* arg1 = id			*/

	O_IPTOS,		/* arg1 = id			*/
	O_IPPRECEDENCE,		/* arg1 = precedence << 5	*/
	O_IPTTL,		/* arg1 = TTL			*/

	O_IPVER,		/* arg1 = version		*/
	O_UID,			/* u32 = id			*/
	O_GID,			/* u32 = id			*/
	O_ESTAB,		/* none (tcp established)	*/
	O_TCPFLAGS,		/* arg1 = 2*u8 bitmap		*/
	O_TCPWIN,		/* arg1 = desired win		*/
	O_TCPSEQ,		/* u32 = desired seq.		*/
	O_TCPACK,		/* u32 = desired seq.		*/
	O_ICMPTYPE,		/* u32 = icmp bitmap		*/
	O_TCPOPTS,		/* arg1 = 2*u8 bitmap		*/

	O_VERREVPATH,		/* none				*/
	O_VERSRCREACH,		/* none				*/

	O_PROBE_STATE,		/* none				*/
	O_KEEP_STATE,		/* none				*/
	O_LIMIT,		/* ipfw_insn_limit		*/
	O_LIMIT_PARENT,		/* dyn_type, not an opcode.	*/

	/*
	 * These are really 'actions'.
	 */

	O_LOG,			/* ipfw_insn_log		*/
	O_PROB,			/* u32 = match probability	*/

	O_CHECK_STATE,		/* none				*/
	O_ACCEPT,		/* none				*/
	O_DENY,			/* none 			*/
	O_REJECT,		/* arg1=icmp arg (same as deny)	*/
	O_COUNT,		/* none				*/
	O_SKIPTO,		/* arg1=next rule number	*/
	O_PIPE,			/* arg1=pipe number		*/
	O_QUEUE,		/* arg1=queue number		*/
	O_DIVERT,		/* arg1=port number		*/
	O_TEE,			/* arg1=port number		*/
	O_FORWARD_IP,		/* fwd sockaddr			*/
	O_FORWARD_MAC,		/* fwd mac			*/
	O_NAT,                  /* nope                         */
	O_REASS,                /* none                         */

	/*
	 * More opcodes.
	 */
	O_IPSEC,		/* has ipsec history 		*/
	O_IP_SRC_LOOKUP,	/* arg1=table number, u32=value	*/
	O_IP_DST_LOOKUP,	/* arg1=table number, u32=value	*/
	O_ANTISPOOF,		/* none				*/
	O_JAIL,			/* u32 = id			*/
	O_ALTQ,			/* u32 = altq classif. qid	*/
	O_DIVERTED,		/* arg1=bitmap (1:loop, 2:out)	*/
	O_TCPDATALEN,		/* arg1 = tcp data len		*/
	O_IP6_SRC,		/* address without mask		*/
	O_IP6_SRC_ME,		/* my addresses			*/
	O_IP6_SRC_MASK,		/* address with the mask	*/
	O_IP6_DST,
	O_IP6_DST_ME,
	O_IP6_DST_MASK,
	O_FLOW6ID,		/* for flow id tag in the ipv6 pkt */
	O_ICMP6TYPE,		/* icmp6 packet type filtering	*/
	O_EXT_HDR,		/* filtering for ipv6 extension header */
	O_IP6,

	/*
	 * actions for ng_ipfw
	 */
	O_NETGRAPH,		/* send to ng_ipfw		*/
	O_NGTEE,		/* copy to ng_ipfw		*/

	O_IP4,

	O_UNREACH6,		/* arg1=icmpv6 code arg (deny)  */

	O_TAG,   		/* arg1=tag number */
	O_TAGGED,		/* arg1=tag number */

	O_SETFIB,		/* arg1=FIB number */
	O_FIB,			/* arg1=FIB desired fib number */

	O_SOCKARG,		/* socket argument */

	O_CALLRETURN,		/* arg1=called rule number */

	O_FORWARD_IP6,		/* fwd sockaddr_in6             */

	O_LAST_OPCODE		/* not an opcode!		*/
};


/*
 * Extension headers are filtered only for presence, using a bit
 * vector with a flag for each header.
 */
#define EXT_FRAGMENT	0x1
#define EXT_HOPOPTS	0x2
#define EXT_ROUTING	0x4
#define EXT_AH		0x8
#define EXT_ESP		0x10
#define EXT_DSTOPTS	0x20
#define EXT_RTHDR0		0x40
#define EXT_RTHDR2		0x80

/*
 * Template for instructions.
 *
 * ipfw_insn is used for all instructions which require no operands,
 * a single 16-bit value (arg1), or a couple of 8-bit values.
 *
 * For other instructions which require different/larger arguments
 * we have derived structures, ipfw_insn_*.
 *
 * The size of the instruction (in 32-bit words) is in the low
 * 6 bits of "len". The 2 remaining bits are used to implement
 * NOT and OR on individual instructions. Given a type, you can
 * compute the length to be put in "len" using F_INSN_SIZE(t)
 *
 * F_NOT	negates the match result of the instruction.
 *
 * F_OR		is used to build or blocks.
By default, instructions * are evaluated as part of a logical AND. An "or" block * { X or Y or Z } contains F_OR set in all but the last * instruction of the block. A match will cause the code * to skip past the last instruction of the block. * * NOTA BENE: in a couple of places we assume that * sizeof(ipfw_insn) == sizeof(u_int32_t) * this needs to be fixed. * */ typedef struct _ipfw_insn { /* template for instructions */ u_int8_t opcode; u_int8_t len; /* number of 32-bit words */ #define F_NOT 0x80 #define F_OR 0x40 #define F_LEN_MASK 0x3f #define F_LEN(cmd) ((cmd)->len & F_LEN_MASK) u_int16_t arg1; } ipfw_insn; /* * The F_INSN_SIZE(type) computes the size, in 4-byte words, of * a given type. */ #define F_INSN_SIZE(t) ((sizeof (t))/sizeof(u_int32_t)) /* * This is used to store an array of 16-bit entries (ports etc.) */ typedef struct _ipfw_insn_u16 { ipfw_insn o; u_int16_t ports[2]; /* there may be more */ } ipfw_insn_u16; /* * This is used to store an array of 32-bit entries * (uid, single IPv4 addresses etc.) */ typedef struct _ipfw_insn_u32 { ipfw_insn o; u_int32_t d[1]; /* one or more */ } ipfw_insn_u32; /* * This is used to store IP addr-mask pairs. */ typedef struct _ipfw_insn_ip { ipfw_insn o; struct in_addr addr; struct in_addr mask; } ipfw_insn_ip; /* * This is used to forward to a given address (ip). */ typedef struct _ipfw_insn_sa { ipfw_insn o; struct sockaddr_in sa; } ipfw_insn_sa; /* * This is used to forward to a given address (ipv6). */ typedef struct _ipfw_insn_sa6 { ipfw_insn o; struct sockaddr_in6 sa; } ipfw_insn_sa6; /* * This is used for MAC addr-mask pairs. */ typedef struct _ipfw_insn_mac { ipfw_insn o; u_char addr[12]; /* dst[6] + src[6] */ u_char mask[12]; /* dst[6] + src[6] */ } ipfw_insn_mac; /* * This is used for interface match rules (recv xx, xmit xx). */ typedef struct _ipfw_insn_if { ipfw_insn o; union { struct in_addr ip; int glob; } p; char name[IFNAMSIZ]; } ipfw_insn_if; /* * This is used for storing an altq queue id number. 
*/ typedef struct _ipfw_insn_altq { ipfw_insn o; u_int32_t qid; } ipfw_insn_altq; /* * This is used for limit rules. */ typedef struct _ipfw_insn_limit { ipfw_insn o; u_int8_t _pad; u_int8_t limit_mask; /* combination of DYN_* below */ #define DYN_SRC_ADDR 0x1 #define DYN_SRC_PORT 0x2 #define DYN_DST_ADDR 0x4 #define DYN_DST_PORT 0x8 u_int16_t conn_limit; } ipfw_insn_limit; /* * This is used for log instructions. */ typedef struct _ipfw_insn_log { ipfw_insn o; u_int32_t max_log; /* how many do we log -- 0 = all */ u_int32_t log_left; /* how many left to log */ } ipfw_insn_log; /* * Data structures required by both ipfw(8) and ipfw(4) but not part of the * management API are protected by IPFW_INTERNAL. */ #ifdef IPFW_INTERNAL /* Server pool support (LSNAT). */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; u_short port; }; #endif /* Redirect modes id. */ #define REDIR_ADDR 0x01 #define REDIR_PORT 0x02 #define REDIR_PROTO 0x04 #ifdef IPFW_INTERNAL /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ u_int16_t mode; /* type of redirect mode */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ u_short lport; /* local port */ u_short pport; /* public port */ u_short rport; /* remote port */ u_short pport_cnt; /* number of public ports */ u_short rport_cnt; /* number of remote ports */ int proto; /* protocol: tcp/udp */ struct alias_link **alink; /* num of entry in spool chain */ u_int16_t spool_cnt; /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; #endif #ifdef IPFW_INTERNAL /* Nat configuration data struct. 
*/ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ char if_name[IF_NAMESIZE]; /* interface name */ int mode; /* aliasing mode */ struct libalias *lib; /* libalias instance */ /* number of entries in redir chain */ int redir_cnt; /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; }; #endif #define SOF_NAT sizeof(struct cfg_nat) #define SOF_REDIR sizeof(struct cfg_redir) #define SOF_SPOOL sizeof(struct cfg_spool) /* Nat command. */ typedef struct _ipfw_insn_nat { ipfw_insn o; struct cfg_nat *nat; } ipfw_insn_nat; /* Apply ipv6 mask on ipv6 addr. NOTE(review): this expands to four separate statements with no do { } while (0) wrapper, so it is unsafe as the body of an unbraced if/else -- confirm call sites. */ #define APPLY_MASK(addr,mask) \ (addr)->__u6_addr.__u6_addr32[0] &= (mask)->__u6_addr.__u6_addr32[0]; \ (addr)->__u6_addr.__u6_addr32[1] &= (mask)->__u6_addr.__u6_addr32[1]; \ (addr)->__u6_addr.__u6_addr32[2] &= (mask)->__u6_addr.__u6_addr32[2]; \ (addr)->__u6_addr.__u6_addr32[3] &= (mask)->__u6_addr.__u6_addr32[3]; /* Structure for ipv6 */ typedef struct _ipfw_insn_ip6 { ipfw_insn o; struct in6_addr addr6; struct in6_addr mask6; } ipfw_insn_ip6; /* Used to support icmp6 types */ typedef struct _ipfw_insn_icmp6 { ipfw_insn o; uint32_t d[7]; /* XXX This number is related to the netinet/icmp6.h * define ICMP6_MAXTYPE * as follows: n = ICMP6_MAXTYPE/32 + 1 * Actually is 203 */ } ipfw_insn_icmp6; /* * Here we have the structure representing an ipfw rule. * * It starts with a general area (with link fields and counters) * followed by an array of one or more instructions, which the code * accesses as an array of 32-bit values. * * Given a rule pointer r: * * r->cmd is the start of the first instruction. * ACTION_PTR(r) is the start of the first action (things to do * once a rule matched). 
* * When assembling instruction, remember the following: * * + if a rule has a "keep-state" (or "limit") option, then the * first instruction (at r->cmd) MUST BE an O_PROBE_STATE * + if a rule has a "log" option, then the first action * (at ACTION_PTR(r)) MUST be O_LOG * + if a rule has an "altq" option, it comes after "log" * + if a rule has an O_TAG option, it comes after "log" and "altq" * * NOTE: we use a simple linked list of rules because we never need * to delete a rule without scanning the list. We do not use * queue(3) macros for portability and readability. */ struct ip_fw { struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ #define RESVD_SET 31 /* set for default and persistent rules */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; #define ACTION_PTR(rule) \ (ipfw_insn *)( (u_int32_t *)((rule)->cmd) + ((rule)->act_ofs) ) #define RULESIZE(rule) (sizeof(struct ip_fw) + \ ((struct ip_fw *)(rule))->cmd_len * 4 - 4) #if 1 // should be moved to in.h /* * This structure is used as a flow mask and a flow id for various * parts of the code. * addr_type is used in userland and kernel to mark the address type. * fib is used in the kernel to record the fib in use. * _flags is used in the kernel to store tcp flags for dynamic rules. */ struct ipfw_flow_id { uint32_t dst_ip; uint32_t src_ip; uint16_t dst_port; uint16_t src_port; uint8_t fib; uint8_t proto; uint8_t _flags; /* protocol-specific flags */ uint8_t addr_type; /* 4=ip4, 6=ip6, 1=ether ? 
*/ struct in6_addr dst_ip6; struct in6_addr src_ip6; uint32_t flow_id6; uint32_t extra; /* queue/pipe or frag_id */ }; #endif #define IS_IP6_FLOW_ID(id) ((id)->addr_type == 6) /* * Dynamic ipfw rule. */ typedef struct _ipfw_dyn_rule ipfw_dyn_rule; struct _ipfw_dyn_rule { ipfw_dyn_rule *next; /* linked list of rules. */ struct ip_fw *rule; /* pointer to rule */ /* 'rule' is used to pass up the rule number (from the parent) */ ipfw_dyn_rule *parent; /* pointer to parent rule */ u_int64_t pcnt; /* packet match counter */ u_int64_t bcnt; /* byte match counter */ struct ipfw_flow_id id; /* (masked) flow id */ u_int32_t expire; /* expire time */ u_int32_t bucket; /* which bucket in hash table */ u_int32_t state; /* state of this rule (typically a * combination of TCP flags) */ u_int32_t ack_fwd; /* most recent ACKs in forward */ u_int32_t ack_rev; /* and reverse directions (used */ /* to generate keepalives) */ u_int16_t dyn_type; /* rule type */ u_int16_t count; /* refcount */ }; /* * Definitions for IP option names. */ #define IP_FW_IPOPT_LSRR 0x01 #define IP_FW_IPOPT_SSRR 0x02 #define IP_FW_IPOPT_RR 0x04 #define IP_FW_IPOPT_TS 0x08 /* * Definitions for TCP option names. */ #define IP_FW_TCPOPT_MSS 0x01 #define IP_FW_TCPOPT_WINDOW 0x02 #define IP_FW_TCPOPT_SACK 0x04 #define IP_FW_TCPOPT_TS 0x08 #define IP_FW_TCPOPT_CC 0x10 #define ICMP_REJECT_RST 0x100 /* fake ICMP code (send a TCP RST) */ #define ICMP6_UNREACH_RST 0x100 /* fake ICMPv6 code (send a TCP RST) */ /* * These are used for lookup tables. 
*/ #define IPFW_TABLE_CIDR 1 /* Table for holding IPv4/IPv6 prefixes */ #define IPFW_TABLE_INTERFACE 2 /* Table for holding interface names */ #define IPFW_TABLE_MAXTYPE 2 /* Maximum valid number */ typedef struct _ipfw_table_entry { in_addr_t addr; /* network address */ u_int32_t value; /* value */ u_int16_t tbl; /* table number */ u_int8_t masklen; /* mask length */ } ipfw_table_entry; typedef struct _ipfw_table_xentry { uint16_t len; /* Total entry length */ uint8_t type; /* entry type */ uint8_t masklen; /* mask length */ uint16_t tbl; /* table number */ uint32_t value; /* value */ union { /* Longest field needs to be aligned by 4-byte boundary */ struct in6_addr addr6; /* IPv6 address */ char iface[IF_NAMESIZE]; /* interface name */ } k; } ipfw_table_xentry; typedef struct _ipfw_table { u_int32_t size; /* size of entries in bytes */ u_int32_t cnt; /* # of entries */ u_int16_t tbl; /* table number */ ipfw_table_entry ent[0]; /* entries */ } ipfw_table; typedef struct _ipfw_xtable { ip_fw3_opheader opheader; /* eXtended tables are controlled via IP_FW3 */ uint32_t size; /* size of entries in bytes */ uint32_t cnt; /* # of entries */ uint16_t tbl; /* table number */ uint8_t type; /* table type */ ipfw_table_xentry xent[0]; /* entries */ } ipfw_xtable; #endif /* _IPFW2_H */ ipfw-user/sys/netinet/tcp.h000644 000423 000000 00000021573 12006744005 016466 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD: head/sys/netinet/tcp.h 231025 2012-02-05 16:53:02Z glebius $ */ #ifndef _NETINET_TCP_H_ #define _NETINET_TCP_H_ #include #include #if __BSD_VISIBLE typedef u_int32_t tcp_seq; #define tcp6_seq tcp_seq /* for KAME src sync over BSD*'s */ #define tcp6hdr tcphdr /* for KAME src sync over BSD*'s */ /* * TCP header. * Per RFC 793, September, 1981. 
*/ struct tcphdr { u_short th_sport; /* source port */ u_short th_dport; /* destination port */ tcp_seq th_seq; /* sequence number */ tcp_seq th_ack; /* acknowledgement number */ #if BYTE_ORDER == LITTLE_ENDIAN u_char th_x2:4, /* (unused) */ th_off:4; /* data offset */ #endif #if BYTE_ORDER == BIG_ENDIAN u_char th_off:4, /* data offset */ th_x2:4; /* (unused) */ #endif u_char th_flags; #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" u_short th_win; /* window */ u_short th_sum; /* checksum */ u_short th_urp; /* urgent pointer */ }; #define TCPOPT_EOL 0 #define TCPOLEN_EOL 1 #define TCPOPT_PAD 0 /* padding after EOL */ #define TCPOLEN_PAD 1 #define TCPOPT_NOP 1 #define TCPOLEN_NOP 1 #define TCPOPT_MAXSEG 2 #define TCPOLEN_MAXSEG 4 #define TCPOPT_WINDOW 3 #define TCPOLEN_WINDOW 3 #define TCPOPT_SACK_PERMITTED 4 #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 #define TCPOLEN_SACKHDR 2 #define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ #define TCP_MAX_SACK 4 /* MAX # SACKs sent in any segment */ /* * The default maximum segment size (MSS) to be used for new TCP connections * when path MTU discovery is not enabled. * * RFC879 derives the default MSS from the largest datagram size hosts are * minimally required to handle directly or through IP reassembly minus the * size of the IP and TCP header. With IPv6 the minimum MTU is specified * in RFC2460. 
* * For IPv4 the MSS is 576 - sizeof(struct tcpiphdr) * For IPv6 the MSS is IPV6_MMTU - sizeof(struct ip6_hdr) - sizeof(struct tcphdr) * * We use explicit numerical definition here to avoid header pollution. */ #define TCP_MSS 536 #define TCP6_MSS 1220 /* * Limit the lowest MSS we accept for path MTU discovery and the TCP SYN MSS * option. Allowing low values of MSS can consume significant resources and * be used to mount a resource exhaustion attack. * Connections requesting lower MSS values will be rounded up to this value * and the IP_DF flag will be cleared to allow fragmentation along the path. * * See tcp_subr.c tcp_minmss SYSCTL declaration for more comments. Setting * it to "0" disables the minmss check. * * The default value is fine for TCP across the Internet's smallest official * link MTU (256 bytes for AX.25 packet radio). However, a connection is very * unlikely to come across such low MTU interfaces these days (anno domini 2003). */ #define TCP_MINMSS 216 #define TCP_MAXWIN 65535 /* largest value for (unscaled) window */ #define TTCP_CLIENT_SND_WND 4096 /* dflt send window for T/TCP client */ #define TCP_MAX_WINSHIFT 14 /* maximum window shift */ #define TCP_MAXBURST 4 /* maximum segments in a burst */ #define TCP_MAXHLEN (0xf<<2) /* max length of header in bytes */ #define TCP_MAXOLEN (TCP_MAXHLEN - sizeof(struct tcphdr)) /* max space left for options */ #endif /* __BSD_VISIBLE */ /* * User-settable options (used with setsockopt). 
*/ #define TCP_NODELAY 0x01 /* don't delay send to coalesce packets */ #if __BSD_VISIBLE #define TCP_MAXSEG 0x02 /* set maximum segment size */ #define TCP_NOPUSH 0x04 /* don't push last block of write */ #define TCP_NOOPT 0x08 /* don't use TCP options */ #define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ #define TCP_INFO 0x20 /* retrieve tcp_info structure */ #define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */ #define TCP_KEEPINIT 0x80 /* N, time to establish connection */ #define TCP_KEEPIDLE 0x100 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 0x200 /* L,N interval between keepalives */ #define TCP_KEEPCNT 0x400 /* L,N number of keepalives before close */ #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ #define TCPI_OPT_TIMESTAMPS 0x01 #define TCPI_OPT_SACK 0x02 #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits * the caller to query certain information about the state of a TCP * connection. We provide an overlapping set of fields with the Linux * implementation, but since this is a fixed size structure, room has been * left for growth. In order to maximize potential future compatibility with * the Linux API, the same variable names and order have been adopted, and * padding left to make room for omitted fields in case they are added later. * * XXX: This is currently an unstable ABI/API, in that it is expected to * change. */ struct tcp_info { u_int8_t tcpi_state; /* TCP FSM state. */ u_int8_t __tcpi_ca_state; u_int8_t __tcpi_retransmits; u_int8_t __tcpi_probes; u_int8_t __tcpi_backoff; u_int8_t tcpi_options; /* Options enabled on conn. */ u_int8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ u_int32_t tcpi_rto; /* Retransmission timeout (usec). */ u_int32_t __tcpi_ato; u_int32_t tcpi_snd_mss; /* Max segment size for send. 
*/ u_int32_t tcpi_rcv_mss; /* Max segment size for receive. */ u_int32_t __tcpi_unacked; u_int32_t __tcpi_sacked; u_int32_t __tcpi_lost; u_int32_t __tcpi_retrans; u_int32_t __tcpi_fackets; /* Times; measurements in usecs. */ u_int32_t __tcpi_last_data_sent; u_int32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ u_int32_t tcpi_last_data_recv; /* Time since last recv data. */ u_int32_t __tcpi_last_ack_recv; /* Metrics; variable units. */ u_int32_t __tcpi_pmtu; u_int32_t __tcpi_rcv_ssthresh; u_int32_t tcpi_rtt; /* Smoothed RTT in usecs. */ u_int32_t tcpi_rttvar; /* RTT variance in usecs. */ u_int32_t tcpi_snd_ssthresh; /* Slow start threshold. */ u_int32_t tcpi_snd_cwnd; /* Send congestion window. */ u_int32_t __tcpi_advmss; u_int32_t __tcpi_reordering; u_int32_t __tcpi_rcv_rtt; u_int32_t tcpi_rcv_space; /* Advertised recv window. */ /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ u_int32_t tcpi_snd_bwnd; /* No longer used. */ u_int32_t tcpi_snd_nxt; /* Next egress seqno */ u_int32_t tcpi_rcv_nxt; /* Next ingress seqno */ u_int32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ /* Padding to grow without breaking ABI. */ u_int32_t __tcpi_pad[26]; /* Padding. */ }; #endif #endif /* !_NETINET_TCP_H_ */ ipfw-user/sys/netinet/udp.h000644 000423 000000 00000004676 12006744005 016475 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)udp.h 8.1 (Berkeley) 6/10/93 * $FreeBSD: head/sys/netinet/udp.h 217126 2011-01-07 21:40:34Z jhb $ */ #ifndef _NETINET_UDP_H_ #define _NETINET_UDP_H_ /* * UDP protocol header. * Per RFC 768, September, 1981. */ struct udphdr { u_short uh_sport; /* source port */ u_short uh_dport; /* destination port */ u_short uh_ulen; /* udp length */ u_short uh_sum; /* udp checksum */ }; /* * User-settable options (used with setsockopt). */ #define UDP_ENCAP 0x01 /* * UDP Encapsulation of IPsec Packets options. */ /* Encapsulation types. */ #define UDP_ENCAP_ESPINUDP_NON_IKE 1 /* draft-ietf-ipsec-nat-t-ike-00/01 */ #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-02+ */ /* Default ESP in UDP encapsulation port. */ #define UDP_ENCAP_ESPINUDP_PORT 500 /* Maximum UDP fragment size for ESP over UDP. 
*/ #define UDP_ENCAP_ESPINUDP_MAXFRAGLEN 552 #endif ipfw-user/sys/netinet/ipfw/dn_heap.c000644 000423 000000 00000034636 12006744005 020242 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Binary heap and hash tables, used in dummynet * * $FreeBSD: head/sys/netinet/ipfw/dn_heap.c 227293 2011-11-07 06:44:47Z ed $ */ #include #include #ifdef _KERNEL __FBSDID("$FreeBSD: head/sys/netinet/ipfw/dn_heap.c 227293 2011-11-07 06:44:47Z ed $"); #include #include #include #include #ifndef log #define log(x, arg...) #endif #else /* !_KERNEL */ #include #include #include #include #include "dn_heap.h" #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x...) 
fprintf(stderr, ## x), exit(1) #define MALLOC_DEFINE(a, b, c) static void *my_malloc(int s) { return malloc(s); } static void my_free(void *p) { free(p); } #define malloc(s, t, w) my_malloc(s) #define free(p, t) my_free(p) #endif /* !_KERNEL */ static MALLOC_DEFINE(M_DN_HEAP, "dummynet", "dummynet heap"); /* * Heap management functions. * * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. * Some macros help finding parent/children so we can optimize them. * * heap_init() is called to expand the heap when needed. * Increment size in blocks of 16 entries. * Returns 1 on error, 0 on success */ #define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) #define HEAP_LEFT(x) ( (x)+(x) + 1 ) #define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } #define HEAP_INCREMENT 15 static int heap_resize(struct dn_heap *h, unsigned int new_size) { struct dn_heap_entry *p; if (h->size >= new_size ) /* have enough room */ return 0; #if 1 /* round to the next power of 2 */ new_size |= new_size >> 1; new_size |= new_size >> 2; new_size |= new_size >> 4; new_size |= new_size >> 8; new_size |= new_size >> 16; #else new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT; #endif p = malloc(new_size * sizeof(*p), M_DN_HEAP, M_NOWAIT); if (p == NULL) { printf("--- %s, resize %d failed\n", __func__, new_size ); return 1; /* error */ } if (h->size > 0) { bcopy(h->p, p, h->size * sizeof(*p) ); free(h->p, M_DN_HEAP); } h->p = p; h->size = new_size; return 0; } int heap_init(struct dn_heap *h, int size, int ofs) { if (heap_resize(h, size)) return 1; h->elements = 0; h->ofs = ofs; return 0; } /* * Insert element in heap. Normally, p != NULL, we insert p in * a new position and bubble up. If p == NULL, then the element is * already in place, and key is the position where to start the * bubble-up. 
* Returns 1 on failure (cannot allocate new heap entry) * * If ofs > 0 the position (index, int) of the element in the heap is * also stored in the element itself at the given offset in bytes. */ #define SET_OFFSET(h, i) do { \ if (h->ofs > 0) \ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = i; \ } while (0) /* * RESET_OFFSET is used for sanity checks. It sets ofs * to an invalid value. */ #define RESET_OFFSET(h, i) do { \ if (h->ofs > 0) \ *((int32_t *)((char *)(h->p[i].object) + h->ofs)) = -16; \ } while (0) int heap_insert(struct dn_heap *h, uint64_t key1, void *p) { int son = h->elements; //log("%s key %llu p %p\n", __FUNCTION__, key1, p); if (p == NULL) { /* data already there, set starting point */ son = key1; } else { /* insert new element at the end, possibly resize */ son = h->elements; if (son == h->size) /* need resize... */ // XXX expand by 16 or so if (heap_resize(h, h->elements+16) ) return 1; /* failure... */ h->p[son].object = p; h->p[son].key = key1; h->elements++; } /* make sure that son >= father along the path */ while (son > 0) { int father = HEAP_FATHER(son); struct dn_heap_entry tmp; if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) break; /* found right position */ /* son smaller than father, swap and repeat */ HEAP_SWAP(h->p[son], h->p[father], tmp); SET_OFFSET(h, son); son = father; } SET_OFFSET(h, son); return 0; } /* * remove top element from heap, or obj if obj != NULL */ void heap_extract(struct dn_heap *h, void *obj) { int child, father, max = h->elements - 1; if (max < 0) { printf("--- %s: empty heap 0x%p\n", __FUNCTION__, h); return; } if (obj == NULL) father = 0; /* default: move up smallest child */ else { /* extract specific element, index is at offset */ if (h->ofs <= 0) panic("%s: extract from middle not set on %p\n", __FUNCTION__, h); father = *((int *)((char *)obj + h->ofs)); if (father < 0 || father >= h->elements) { panic("%s: father %d out of bound 0..%d\n", __FUNCTION__, father, h->elements); } } /* * below, father 
is the index of the empty element, which * we replace at each step with the smallest child until we * reach the bottom level. */ // XXX why removing RESET_OFFSET increases runtime by 10% ? RESET_OFFSET(h, father); while ( (child = HEAP_LEFT(father)) <= max ) { if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) child++; /* take right child, otherwise left */ h->p[father] = h->p[child]; SET_OFFSET(h, father); father = child; } h->elements--; if (father != max) { /* * Fill hole with last entry and bubble up, * reusing the insert code */ h->p[father] = h->p[max]; heap_insert(h, father, NULL); } } #if 0 /* * change object position and update references * XXX this one is never used! */ static void heap_move(struct dn_heap *h, uint64_t new_key, void *object) { int temp, i, max = h->elements-1; struct dn_heap_entry *p, buf; if (h->ofs <= 0) panic("cannot move items on this heap"); p = h->p; /* shortcut */ i = *((int *)((char *)object + h->ofs)); if (DN_KEY_LT(new_key, p[i].key) ) { /* must move up */ p[i].key = new_key; for (; i>0 && DN_KEY_LT(new_key, p[(temp = HEAP_FATHER(i))].key); i = temp ) { /* bubble up */ HEAP_SWAP(p[i], p[temp], buf); SET_OFFSET(h, i); } } else { /* must move down */ p[i].key = new_key; while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */ if (temp != max && DN_KEY_LT(p[temp+1].key, p[temp].key)) temp++; /* select child with min key */ if (DN_KEY_LT(>p[temp].key, new_key)) { /* go down */ HEAP_SWAP(p[i], p[temp], buf); SET_OFFSET(h, i); } else break; i = temp; } } SET_OFFSET(h, i); } #endif /* heap_move, unused */ /* * heapify() will reorganize data inside an array to maintain the * heap property. It is needed when we delete a bunch of entries. 
*/ static void heapify(struct dn_heap *h) { int i; for (i = 0; i < h->elements; i++ ) heap_insert(h, i , NULL); } int heap_scan(struct dn_heap *h, int (*fn)(void *, uintptr_t), uintptr_t arg) { int i, ret, found; for (i = found = 0 ; i < h->elements ;) { ret = fn(h->p[i].object, arg); if (ret & HEAP_SCAN_DEL) { h->elements-- ; h->p[i] = h->p[h->elements] ; found++ ; } else i++ ; if (ret & HEAP_SCAN_END) break; } if (found) heapify(h); return found; } /* * cleanup the heap and free data structure */ void heap_free(struct dn_heap *h) { if (h->size >0 ) free(h->p, M_DN_HEAP); bzero(h, sizeof(*h) ); } /* * hash table support. */ struct dn_ht { int buckets; /* how many buckets, really buckets - 1*/ int entries; /* how many entries */ int ofs; /* offset of link field */ uint32_t (*hash)(uintptr_t, int, void *arg); int (*match)(void *_el, uintptr_t key, int, void *); void *(*newh)(uintptr_t, int, void *); void **ht; /* bucket heads */ }; /* * Initialize, allocating bucket pointers inline. * Recycle previous record if possible. * If the 'newh' function is not supplied, we assume that the * key passed to ht_find is the same object to be stored in. */ struct dn_ht * dn_ht_init(struct dn_ht *ht, int buckets, int ofs, uint32_t (*h)(uintptr_t, int, void *), int (*match)(void *, uintptr_t, int, void *), void *(*newh)(uintptr_t, int, void *)) { int l; /* * Notes about rounding bucket size to a power of two. * Given the original bucket size, we compute the nearest lower and * higher power of two, minus 1 (respectively b_min and b_max) because * this value will be used to do an AND with the index returned * by hash function. * To choose between these two values, the original bucket size is * compared with b_min. If the original size is greater than 4/3 b_min, * we round the bucket size to b_max, else to b_min. * This ratio tries to round to the nearest power of two, favoring * the greater size if the difference between the two powers is relatively * big. 
* Rounding the bucket size to a power of two avoids the use of * modulo when calculating the correct bucket. * The ht->buckets variable stores the bucket size - 1 to simply * do an AND between the index returned by hash function and ht->buckets * instead of a modulo. */ int b_min; /* min buckets */ int b_max; /* max buckets */ int b_ori; /* original buckets */ if (h == NULL || match == NULL) { printf("--- missing hash or match function"); return NULL; } if (buckets < 1 || buckets > 65536) return NULL; b_ori = buckets; /* calculate next power of 2, - 1*/ buckets |= buckets >> 1; buckets |= buckets >> 2; buckets |= buckets >> 4; buckets |= buckets >> 8; buckets |= buckets >> 16; b_max = buckets; /* Next power */ b_min = buckets >> 1; /* Previous power */ /* Calculate the 'nearest' bucket size */ if (b_min * 4000 / 3000 < b_ori) buckets = b_max; else buckets = b_min; if (ht) { /* see if we can reuse */ if (buckets <= ht->buckets) { ht->buckets = buckets; } else { /* free pointers if not allocated inline */ if (ht->ht != (void *)(ht + 1)) free(ht->ht, M_DN_HEAP); free(ht, M_DN_HEAP); ht = NULL; } } if (ht == NULL) { /* Allocate buckets + 1 entries because buckets is used to * do the AND with the index returned by hash function */ l = sizeof(*ht) + (buckets + 1) * sizeof(void **); ht = malloc(l, M_DN_HEAP, M_NOWAIT | M_ZERO); } if (ht) { ht->ht = (void **)(ht + 1); ht->buckets = buckets; ht->ofs = ofs; ht->hash = h; ht->match = match; ht->newh = newh; } return ht; } /* dummy callback for dn_ht_free to unlink all */ static int do_del(void *obj, void *arg) { return DNHT_SCAN_DEL; } void dn_ht_free(struct dn_ht *ht, int flags) { if (ht == NULL) return; if (flags & DNHT_REMOVE) { (void)dn_ht_scan(ht, do_del, NULL); } else { if (ht->ht && ht->ht != (void *)(ht + 1)) free(ht->ht, M_DN_HEAP); free(ht, M_DN_HEAP); } } int dn_ht_entries(struct dn_ht *ht) { return ht ? 
ht->entries : 0; } /* lookup and optionally create (DNHT_INSERT) or delete (DNHT_REMOVE) element */ void * dn_ht_find(struct dn_ht *ht, uintptr_t key, int flags, void *arg) { int i; void **pp, *p; if (ht == NULL) /* easy on an empty hash */ return NULL; i = (ht->buckets == 1) ? 0 : (ht->hash(key, flags, arg) & ht->buckets); for (pp = &ht->ht[i]; (p = *pp); pp = (void **)((char *)p + ht->ofs)) { if (flags & DNHT_MATCH_PTR) { if (key == (uintptr_t)p) break; } else if (ht->match(p, key, flags, arg)) /* found match */ break; } if (p) { if (flags & DNHT_REMOVE) { /* link in the next element */ *pp = *(void **)((char *)p + ht->ofs); *(void **)((char *)p + ht->ofs) = NULL; ht->entries--; } } else if (flags & DNHT_INSERT) { // printf("%s before calling new, bucket %d ofs %d\n", // __FUNCTION__, i, ht->ofs); p = ht->newh ? ht->newh(key, flags, arg) : (void *)key; // printf("%s newh returns %p\n", __FUNCTION__, p); if (p) { ht->entries++; *(void **)((char *)p + ht->ofs) = ht->ht[i]; ht->ht[i] = p; } } return p; } /* * do a scan with the option to delete the object. Extract next before * running the callback because the element may be destroyed there. */ int dn_ht_scan(struct dn_ht *ht, int (*fn)(void *, void *), void *arg) { int i, ret, found = 0; void **curp, *cur, *next; if (ht == NULL || fn == NULL) return 0; for (i = 0; i <= ht->buckets; i++) { curp = &ht->ht[i]; while ( (cur = *curp) != NULL) { next = *(void **)((char *)cur + ht->ofs); ret = fn(cur, arg); if (ret & DNHT_SCAN_DEL) { found++; ht->entries--; *curp = next; } else { curp = (void **)((char *)cur + ht->ofs); } if (ret & DNHT_SCAN_END) return found; } } return found; } /* * Similar to dn_ht_scan(), except that the scan is performed only * in the bucket 'bucket'. The function returns a correct bucket number if * the original is invalid. * If the callback returns DNHT_SCAN_END, the function move the ht->ht[i] * pointer to the last entry processed.
Moreover, the bucket number passed * by caller is decremented, because usually the caller increment it. */ int dn_ht_scan_bucket(struct dn_ht *ht, int *bucket, int (*fn)(void *, void *), void *arg) { int i, ret, found = 0; void **curp, *cur, *next; if (ht == NULL || fn == NULL) return 0; if (*bucket > ht->buckets) *bucket = 0; i = *bucket; curp = &ht->ht[i]; while ( (cur = *curp) != NULL) { next = *(void **)((char *)cur + ht->ofs); ret = fn(cur, arg); if (ret & DNHT_SCAN_DEL) { found++; ht->entries--; *curp = next; } else { curp = (void **)((char *)cur + ht->ofs); } if (ret & DNHT_SCAN_END) return found; } return found; } ipfw-user/sys/netinet/ipfw/dn_sched_prio.c000644 000423 000000 00000014337 12006744005 021440 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD: head/sys/netinet/ipfw/dn_sched_prio.c 205417 2010-03-21 16:30:32Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #define DN_SCHED_PRIO 5 //XXX #if !defined(_KERNEL) || !defined(__linux__) #define test_bit(ix, pData) ((*pData) & (1<<(ix))) #define __set_bit(ix, pData) (*pData) |= (1<<(ix)) #define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) #endif #ifdef __MIPSEL__ #define __clear_bit(ix, pData) (*pData) &= ~(1<<(ix)) #endif /* Size of the array of queues pointers. */ #define BITMAP_T unsigned long #define MAXPRIO (sizeof(BITMAP_T) * 8) /* * The scheduler instance contains an array of pointers to queues, * one for each priority, and a bitmap listing backlogged queues. */ struct prio_si { BITMAP_T bitmap; /* array bitmap */ struct dn_queue *q_array[MAXPRIO]; /* Array of queues pointers */ }; /* * If a queue with the same priority is already backlogged, use * that one instead of the queue passed as argument. 
*/ static int prio_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct prio_si *si = (struct prio_si *)(_si + 1); int prio = q->fs->fs.par[0]; if (test_bit(prio, &si->bitmap) == 0) { /* No queue with this priority, insert */ __set_bit(prio, &si->bitmap); si->q_array[prio] = q; } else { /* use the existing queue */ q = si->q_array[prio]; } if (dn_enqueue(q, m, 0)) return 1; return 0; } /* * Packets are dequeued only from the highest priority queue. * The function ffs() return the lowest bit in the bitmap that rapresent * the array index (-1) which contains the pointer to the highest priority * queue. * After the dequeue, if this queue become empty, it is index is removed * from the bitmap. * Scheduler is idle if the bitmap is empty * * NOTE: highest priority is 0, lowest is sched->max_prio_q */ static struct mbuf * prio_dequeue(struct dn_sch_inst *_si) { struct prio_si *si = (struct prio_si *)(_si + 1); struct mbuf *m; struct dn_queue *q; int prio; if (si->bitmap == 0) /* scheduler idle */ return NULL; prio = ffs(si->bitmap) - 1; /* Take the highest priority queue in the scheduler */ q = si->q_array[prio]; // assert(q) m = dn_dequeue(q); if (q->mq.head == NULL) { /* Queue is now empty, remove from scheduler * and mark it */ si->q_array[prio] = NULL; __clear_bit(prio, &si->bitmap); } return m; } static int prio_new_sched(struct dn_sch_inst *_si) { struct prio_si *si = (struct prio_si *)(_si + 1); bzero(si->q_array, sizeof(si->q_array)); si->bitmap = 0; return 0; } static int prio_new_fsk(struct dn_fsk *fs) { /* Check if the prioritiy is between 0 and MAXPRIO-1 */ ipdn_bound_var(&fs->fs.par[0], 0, 0, MAXPRIO - 1, "PRIO priority"); return 0; } static int prio_new_queue(struct dn_queue *q) { struct prio_si *si = (struct prio_si *)(q->_si + 1); int prio = q->fs->fs.par[0]; struct dn_queue *oldq; q->ni.oid.subtype = DN_SCHED_PRIO; if (q->mq.head == NULL) return 0; /* Queue already full, must insert in the scheduler or append * mbufs to existing 
queue. This partly duplicates prio_enqueue */ if (test_bit(prio, &si->bitmap) == 0) { /* No queue with this priority, insert */ __set_bit(prio, &si->bitmap); si->q_array[prio] = q; } else if ( (oldq = si->q_array[prio]) != q) { /* must append to the existing queue. * can simply append q->mq.head to q2->... * and add the counters to those of q2 */ oldq->mq.tail->m_nextpkt = q->mq.head; oldq->mq.tail = q->mq.tail; oldq->ni.length += q->ni.length; q->ni.length = 0; oldq->ni.len_bytes += q->ni.len_bytes; q->ni.len_bytes = 0; q->mq.tail = q->mq.head = NULL; } return 0; } static int prio_free_queue(struct dn_queue *q) { int prio = q->fs->fs.par[0]; struct prio_si *si = (struct prio_si *)(q->_si + 1); if (si->q_array[prio] == q) { si->q_array[prio] = NULL; __clear_bit(prio, &si->bitmap); } return 0; } static struct dn_alg prio_desc = { _SI( .type = ) DN_SCHED_PRIO, _SI( .name = ) "PRIO", _SI( .flags = ) DN_MULTIQUEUE, /* we need extra space in the si and the queue */ _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct prio_si), _SI( .q_datalen = ) 0, _SI( .enqueue = ) prio_enqueue, _SI( .dequeue = ) prio_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) prio_new_sched, _SI( .free_sched = ) NULL, _SI( .new_fsk = ) prio_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) prio_new_queue, _SI( .free_queue = ) prio_free_queue, }; DECLARE_DNSCHED_MODULE(dn_prio, &prio_desc); ipfw-user/sys/netinet/ipfw/ip_fw_private.h000644 000423 000000 00000024067 12007435564 021514 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD: head/sys/netinet/ipfw/ip_fw_private.h 234946 2012-05-03 08:56:43Z melifaro $ */ #ifndef _IPFW2_PRIVATE_H #define _IPFW2_PRIVATE_H /* * Internal constants and data structures used by ipfw components * and not meant to be exported outside the kernel. */ #ifdef _KERNEL /* * For platforms that do not have SYSCTL support, we wrap the * SYSCTL_* into a function (one per file) to collect the values * into an array at module initialization. The wrapping macros, * SYSBEGIN() and SYSEND, are empty in the default case. 
*/ #ifndef SYSBEGIN #define SYSBEGIN(x) #endif #ifndef SYSEND #define SYSEND #endif /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, IP_FW_DENY, IP_FW_DIVERT, IP_FW_TEE, IP_FW_DUMMYNET, IP_FW_NETGRAPH, IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, }; /* * Structure for collecting parameters to dummynet for ip6_output forwarding */ struct _ip6dn_args { struct ip6_pktopts *opt_or; struct route_in6 ro_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; struct route_in6 ro_pmtu_or; }; /* * Arguments for calling ipfw_chk() and dummynet_io(). We put them * all into a structure because this way it is easier and more * efficient to pass variables around and extend the interface. */ struct ip_fw_args { struct mbuf *m; /* the mbuf chain */ struct ifnet *oif; /* output interface */ struct sockaddr_in *next_hop; /* forward address */ struct sockaddr_in6 *next_hop6; /* ipv6 forward address */ /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and * contains the starting rule for an ipfw search. * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ struct ipfw_rule_ref rule; /* match/restart info */ struct ether_header *eh; /* for bridged packets */ struct ipfw_flow_id f_id; /* grabbed from IP header */ //uint32_t cookie; /* a cookie depending on rule action */ struct inpcb *inp; struct _ip6dn_args dummypar; /* dummynet->ip6_output */ struct sockaddr_in hopstore; /* store here if cannot use a pointer */ }; MALLOC_DECLARE(M_IPFW); /* * Hooks sometimes need to know the direction of the packet * (divert, dummynet, netgraph, ...)
* We use a generic definition here, with bit0-1 indicating the * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the * specific protocol (if necessary) */ enum { DIR_MASK = 0x3, DIR_OUT = 0, DIR_IN = 1, DIR_FWD = 2, DIR_DROP = 3, PROTO_LAYER2 = 0x4, /* set for layer 2 */ /* PROTO_DEFAULT = 0, */ PROTO_IPV4 = 0x08, PROTO_IPV6 = 0x10, PROTO_IFB = 0x0c, /* layer2 + ifbridge */ /* PROTO_OLDBDG = 0x14, unused, old bridge */ }; /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) #define FREE_PKT(m) netisr_dispatch(-1, m) #else #define FREE_PKT(m) m_freem(m) #endif #endif /* !FREE_PKT */ /* * Function declarations. */ /* attach (arg = 1) or detach (arg = 0) hooks */ int ipfw_attach_hooks(int); #ifdef NOTYET void ipfw_nat_destroy(void); #endif /* In ip_fw_log.c */ struct ip; void ipfw_log_bpf(int); void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); #define V_verbose_limit VNET(verbose_limit) /* In ip_fw_dynamic.c */ enum { /* result for matching dynamic rules */ MATCH_REVERSE = 0, MATCH_FORWARD, MATCH_NONE, MATCH_UNKNOWN, }; /* * The lock for dynamic rules is only used once outside the file, * and only to release the result of lookup_dyn_rule(). * Eventually we may implement it with a callback on the function.
*/ void ipfw_dyn_unlock(void); struct tcphdr; struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg); ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp); void ipfw_remove_dyn_children(struct ip_fw *rule); void ipfw_get_dynamic(char **bp, const char *ep); void ipfw_dyn_attach(void); /* uma_zcreate .... */ void ipfw_dyn_detach(void); /* uma_zdestroy ... */ void ipfw_dyn_init(void); /* per-vnet initialization */ void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ int ipfw_dyn_len(void); /* common variables */ VNET_DECLARE(int, fw_one_pass); #define V_fw_one_pass VNET(fw_one_pass) VNET_DECLARE(int, fw_verbose); #define V_fw_verbose VNET(fw_verbose) VNET_DECLARE(struct ip_fw_chain, layer3_chain); #define V_layer3_chain VNET(layer3_chain) VNET_DECLARE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DECLARE(int, autoinc_step); #define V_autoinc_step VNET(autoinc_step) VNET_DECLARE(unsigned int, fw_tables_max); #define V_fw_tables_max VNET(fw_tables_max) struct ip_fw_chain { struct ip_fw *rules; /* list of rules */ struct ip_fw *reap; /* list of rules to reap */ struct ip_fw *default_rule; int n_rules; /* number of static rules */ int static_len; /* total len of static rules */ struct ip_fw **map; /* array of rule ptrs to ease lookup */ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ struct radix_node_head **tables; /* IPv4 tables */ struct radix_node_head **xtables; /* extended tables */ uint8_t *tabletype; /* Array of table types */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; spinlock_t uh_lock; #else struct rwlock rwmtx; struct rwlock uh_lock; /* lock for upper half */ #endif uint32_t id; /* ruleset id */ uint32_t gencnt; /* generation count */ }; struct sockopt; /* used by tcp_var.h */ /* * The lock is heavily
used by ip_fw2.c (the main file) and ip_fw_nat.c * so the variable and the macros must be here. * The IPFW_* macros operate on chain->rwmtx, the IPFW_UH_* macros on * chain->uh_lock. */ #define IPFW_LOCK_INIT(_chain) do { \ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rw_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) #define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) /* In ip_fw_sockopt.c */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule); int ipfw_ctl(struct sockopt *sopt); int ipfw_chk(struct ip_fw_args *args); void ipfw_reap_rules(struct ip_fw *head); /* In ip_fw_pfil */ int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp); /* In ip_fw_table.c */ struct radix_node; int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val); int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, uint32_t *val, int type); int ipfw_init_tables(struct ip_fw_chain *ch); void ipfw_destroy_tables(struct ip_fw_chain *ch); int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl); int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value); int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, uint8_t plen, uint8_t mlen, uint8_t type); /* NOTE(review): the two count functions take a uint32_t table index while the lookup/flush/add/del prototypes use uint16_t -- confirm this is intended */ int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); int ipfw_dump_table_entry(struct
radix_node *rn, void *arg); int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl); int ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt); int ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl); int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); typedef int ipfw_nat_cfg_t(struct sockopt *); extern ipfw_nat_t *ipfw_nat_ptr; #define IPFW_NAT_LOADED (ipfw_nat_ptr != NULL) extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */ ipfw-user/sys/netinet/ipfw/dn_heap.h000644 000423 000000 00000017001 12006744005 020232 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1998-2010 Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Binary heap and hash tables, header file * * $FreeBSD: head/sys/netinet/ipfw/dn_heap.h 204865 2010-03-08 11:27:08Z luigi $ */ #ifndef _IP_DN_HEAP_H #define _IP_DN_HEAP_H #define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) #define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) /* * This module implements a binary heap supporting random extraction. * * A heap entry contains an uint64_t key and a pointer to object. * DN_KEY_LT(a,b) returns true if key 'a' is smaller than 'b' * (the test is done on the sign of the 64-bit difference a-b). * * The heap is a struct dn_heap plus a dynamically allocated * array of dn_heap_entry entries. 'size' represents the size of * the array, 'elements' count entries in use. The topmost * element has the smallest key. * The heap supports ordered insert, and extract from the top. * To extract an object from the middle of the heap, the object * must reserve an 'int32_t' to store the position of the object * in the heap itself, and the location of this field must be * passed as an argument to heap_init() -- use -1 if the feature * is not used.
*/ struct dn_heap_entry { uint64_t key; /* sorting key, smallest comes first */ void *object; /* object pointer */ }; struct dn_heap { int size; /* the size of the array */ int elements; /* elements in use */ int ofs; /* offset in the object of heap index */ struct dn_heap_entry *p; /* array of "size" entries */ }; enum { HEAP_SCAN_DEL = 1, HEAP_SCAN_END = 2, }; /* * heap_init() reinitializes the heap setting the size and the offset * of the index for random extraction (use -1 if not used). * The 'elements' counter is set to 0. * * SET_HEAP_OFS() indicates where, in the object, is stored the index * for random extractions from the heap. * * heap_free() frees the memory associated to a heap. * * heap_insert() adds a key-pointer pair to the heap * * HEAP_TOP() returns a pointer to the top element of the heap, * but makes no checks on its existence (XXX should we change ?) * * heap_extract() removes the entry at the top, returning the pointer. * (the key should have been read before). * NOTE(review): the prototype below returns void and takes an * 'obj' argument, so the description above looks outdated -- * confirm against dn_heap.c. * * heap_scan() invokes a callback on each entry of the heap. * The callback can return a combination of HEAP_SCAN_DEL and * HEAP_SCAN_END. HEAP_SCAN_DEL means the current element must * be removed, and HEAP_SCAN_END means to terminate the scan. * heap_scan() returns the number of elements removed. * Because the order is not guaranteed, we should use heap_scan() * only as a last resort mechanism. */ #define HEAP_TOP(h) ((h)->p) #define SET_HEAP_OFS(h, n) do { (h)->ofs = n; } while (0) int heap_init(struct dn_heap *h, int size, int ofs); int heap_insert(struct dn_heap *h, uint64_t key1, void *p); void heap_extract(struct dn_heap *h, void *obj); void heap_free(struct dn_heap *h); int heap_scan(struct dn_heap *, int (*)(void *, uintptr_t), uintptr_t); /*------------------------------------------------------ * This module implements a generic hash table with support for * running callbacks on the entire table.
To avoid allocating * memory during hash table operations, objects must reserve * space for a link field. XXX if the table is moderately full, * an SLIST suffices, and we can tolerate the cost of a hash * computation on each removal. * * dn_ht_init() initializes the table, setting the number of * buckets, the offset of the link field, the main callbacks. * Callbacks are: * * hash(key, flags, arg) called to return a bucket index. * match(obj, key, flags, arg) called to determine if key * matches the current 'obj' in the table * newh(key, flags, arg) optional, used to allocate a new * object during insertions. * * dn_ht_free() frees the table or unlinks elements. * DNHT_REMOVE unlinks elements, 0 frees the table. * You need two calls to do both. * * dn_ht_find() is the main lookup function, which can also be * used to insert or delete elements in the hash table. * The final 'arg' is passed to all callbacks. * * dn_ht_scan() is used to invoke a callback on all entries of * the table, or possibly on just one bucket. The callback * is invoked with a pointer to the object, and returns 0 or * a combination of DNHT_SCAN_DEL and DNHT_SCAN_END to request the * removal of the object from the table and the end of the * scan, respectively. * * dn_ht_scan_bucket() is similar to dn_ht_scan(), except that it scans * only the specific bucket of the table. The bucket is an in-out * parameter and is reset to a valid bucket number if the original * is invalid. * * A combination of flags can be used to modify the operation * of the dn_ht_find(), and of the callbacks: * * DNHT_KEY_IS_OBJ means the key is the object pointer. * It is usually of interest for the hash and match functions. * * DNHT_MATCH_PTR during a lookup, match pointers instead * of calling match(). Normally used when removing specific * entries. Does not imply KEY_IS_OBJ as the latter _is_ used * by the match function. * * DNHT_INSERT insert the element if not found. * Calls newh() to allocate a new object; when no newh() * was supplied the key is taken to be the object * (see DNHT_KEY_IS_OBJ).
* * DNHT_UNIQUE only insert if object not found. * XXX should it imply DNHT_INSERT ? * * DNHT_REMOVE remove objects if we find them. */ struct dn_ht; /* should be opaque */ struct dn_ht *dn_ht_init(struct dn_ht *, int buckets, int ofs, uint32_t (*hash)(uintptr_t, int, void *), int (*match)(void *, uintptr_t, int, void *), void *(*newh)(uintptr_t, int, void *)); void dn_ht_free(struct dn_ht *, int flags); void *dn_ht_find(struct dn_ht *, uintptr_t, int, void *); int dn_ht_scan(struct dn_ht *, int (*)(void *, void *), void *); int dn_ht_scan_bucket(struct dn_ht *, int * , int (*)(void *, void *), void *); int dn_ht_entries(struct dn_ht *); enum { /* flags values. * first two are returned by the scan callback to indicate * to delete the matching element or to end the scan */ DNHT_SCAN_DEL = 0x0001, DNHT_SCAN_END = 0x0002, DNHT_KEY_IS_OBJ = 0x0004, /* key is the obj pointer */ DNHT_MATCH_PTR = 0x0008, /* match by pointer, not match() */ DNHT_INSERT = 0x0010, /* insert if not found */ DNHT_UNIQUE = 0x0020, /* report error if already there; NOTE(review): dn_ht_find() in dn_heap.c does not appear to test this flag -- confirm */ DNHT_REMOVE = 0x0040, /* remove on find or dn_ht_free */ }; #endif /* _IP_DN_HEAP_H */ ipfw-user/sys/netinet/ipfw/ip_dn_glue.c000644 000423 000000 00000054411 12006744005 020742 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
*
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $FreeBSD: head/sys/netinet/ipfw/ip_dn_glue.c 221521 2011-05-06 07:13:34Z ae $
 *
 * Binary compatibility support for /sbin/ipfw RELENG_7 and RELENG_8
 */

#include "opt_inet6.h"

/*
 * NOTE(review): the <header> names of the directives below are missing in
 * this copy of the file; restore them from the upstream source.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
#include
#include /* ip_output(), IP_FORWARDING */
#include
#include
#include
#include
#include
#include

/* FREEBSD7.2 ip_dummynet.h r191715*/

/* One heap slot as laid out by the old (7.2) userland interface. */
struct dn_heap_entry7 {
	int64_t key;        /* sorting key. Topmost element is smallest one */
	void *object;      /* object pointer */
};

/* Heap descriptor embedded in the old pipe structures below. */
struct dn_heap7 {
	int size;
	int elements;
	int offset; /* XXX if > 0 this is the offset of direct ptr to obj */
	struct dn_heap_entry7 *p;   /* really an array of "size" entries */
};

/* Common to 7.2 and 8 */
/*
 * Old-style flowset. It is embedded at the end of both dn_pipe7 and
 * dn_pipe8 and is also exchanged on its own for "queue" requests.
 */
struct dn_flow_set {
	SLIST_ENTRY(dn_flow_set)    next;   /* linked list in a hash slot */

	u_short fs_nr ;             /* flow_set number       */
	u_short flags_fs;
/* Old flag values; translated by convertflags2new()/convertflags2old(). */
#define DNOLD_HAVE_FLOW_MASK   0x0001
#define DNOLD_IS_RED       0x0002
#define DNOLD_IS_GENTLE_RED    0x0004
#define DNOLD_QSIZE_IS_BYTES   0x0008  /* queue size is measured in bytes */
#define DNOLD_NOERROR      0x0010  /* do not report ENOBUFS on drops  */
#define DNOLD_HAS_PROFILE      0x0020  /* the pipe has a delay profile. */
#define DNOLD_IS_PIPE      0x4000
#define DNOLD_IS_QUEUE     0x8000

	struct dn_pipe7 *pipe ;  /* pointer to parent pipe */
	u_short parent_nr ;     /* parent pipe#, 0 if local to a pipe */

	int weight ;        /* WFQ queue weight */
	int qsize ;         /* queue size in slots or bytes */
	int plr ;           /* pkt loss rate (2^31-1 means 100%) */

	struct ipfw_flow_id flow_mask ;

	/* hash table of queues onto this flow_set */
	int rq_size ;       /* number of slots */
	int rq_elements ;       /* active elements */
	struct dn_flow_queue7 **rq;  /* array of rq_size entries */

	u_int32_t last_expired ;    /* do not expire too frequently */
	int backlogged ;        /* #active queues for this flowset */

	/* RED parameters */
#define SCALE_RED               16
#define SCALE(x)                ( (x) << SCALE_RED )
#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
	int w_q ;           /* queue weight (scaled) */
	int max_th ;        /* maximum threshold for queue (scaled) */
	int min_th ;        /* minimum threshold for queue (scaled) */
	int max_p ;         /* maximum value for p_b (scaled) */
	u_int c_1 ;         /* max_p/(max_th-min_th) (scaled) */
	u_int c_2 ;         /* max_p*min_th/(max_th-min_th) (scaled) */
	u_int c_3 ;         /* for GRED, (1-max_p)/max_th (scaled) */
	u_int c_4 ;         /* for GRED, 1 - 2*max_p (scaled) */
	u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
	u_int lookup_depth ;    /* depth of lookup table */
	int lookup_step ;       /* granularity inside the lookup table */
	int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
	int avg_pkt_size ;      /* medium packet size */
	int max_pkt_size ;      /* max packet size */
};
SLIST_HEAD(dn_flow_set_head, dn_flow_set);

/*
 * Markers stored in the 'next' link of exported records so that userland
 * can tell pipes from queues (see dn_c_copy_pipe() and dn_c_copy_fs()).
 */
#define DN_IS_PIPE		0x4000
#define DN_IS_QUEUE		0x8000

/* Per-flow queue record in the FreeBSD 7.2 layout. */
struct dn_flow_queue7 {
	struct dn_flow_queue7 *next ;
	struct ipfw_flow_id id ;

	struct mbuf *head, *tail ;  /* queue of packets */
	u_int len ;
	u_int len_bytes ;

	u_long numbytes;

	u_int64_t tot_pkts ;    /* statistics counters  */
	u_int64_t tot_bytes ;
	u_int32_t drops ;

	int hash_slot ;     /* debugging/diagnostic */

	/* RED parameters */
	int avg ;                   /* average queue length est. (scaled) */
	int count ;                 /* arrivals since last RED drop */
	int random ;                /* random value (scaled) */
	u_int32_t q_time;      /* start of queue idle time */

	/* WF2Q+ support */
	struct dn_flow_set *fs ;    /* parent flow set */
	int heap_pos ;      /* position (index) of struct in heap */
	int64_t sched_time ;     /* current time when queue enters ready_heap */

	int64_t S,F ;        /* start time, finish time */
};

/* Pipe record in the FreeBSD 7.2 layout. */
struct dn_pipe7 {        /* a pipe */
	SLIST_ENTRY(dn_pipe7)    next;   /* linked list in a hash slot */

	int pipe_nr ;       /* number   */
	int bandwidth;      /* really, bytes/tick.  */
	int delay ;         /* really, ticks    */

	struct  mbuf *head, *tail ; /* packets in delay line */

	/* WF2Q+ */
	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
	struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */

	int64_t V ;          /* virtual time */
	int sum;            /* sum of weights of all active sessions */

	int numbytes;

	int64_t sched_time ;     /* time pipe was scheduled in ready_heap */

	/*
	 * When the tx clock come from an interface (if_name[0] != '\0'), its name
	 * is stored below, whereas the ifp is filled when the rule is configured.
	 */
	char if_name[IFNAMSIZ];
	struct ifnet *ifp ;
	int ready ; /* set if ifp != NULL and we got a signal from it */

	struct dn_flow_set fs ; /* used with fixed-rate flows */
};
SLIST_HEAD(dn_pipe_head7, dn_pipe7);


/* FREEBSD8 ip_dummynet.h r196045 */

/* Per-flow queue record in the FreeBSD 8 layout. */
struct dn_flow_queue8 {
	struct dn_flow_queue8 *next ;
	struct ipfw_flow_id id ;

	struct mbuf *head, *tail ;  /* queue of packets */
	u_int len ;
	u_int len_bytes ;

	uint64_t numbytes ;     /* credit for transmission (dynamic queues) */
	int64_t extra_bits;     /* extra bits simulating unavailable channel */

	u_int64_t tot_pkts ;    /* statistics counters  */
	u_int64_t tot_bytes ;
	u_int32_t drops ;

	int hash_slot ;     /* debugging/diagnostic */

	/* RED parameters */
	int avg ;                   /* average queue length est. (scaled) */
	int count ;                 /* arrivals since last RED drop */
	int random ;                /* random value (scaled) */
	int64_t idle_time;       /* start of queue idle time */

	/* WF2Q+ support */
	struct dn_flow_set *fs ;    /* parent flow set */
	int heap_pos ;      /* position (index) of struct in heap */
	int64_t sched_time ;     /* current time when queue enters ready_heap */

	int64_t S,F ;        /* start time, finish time */
};

/* Pipe record in the FreeBSD 8 layout (see the list of changes below). */
struct dn_pipe8 {        /* a pipe */
	SLIST_ENTRY(dn_pipe8)    next;   /* linked list in a hash slot */

	int pipe_nr ;       /* number   */
	int bandwidth;      /* really, bytes/tick.  */
	int delay ;         /* really, ticks    */

	struct  mbuf *head, *tail ; /* packets in delay line */

	/* WF2Q+ */
	struct dn_heap7 scheduler_heap ; /* top extract - key Finish time*/
	struct dn_heap7 not_eligible_heap; /* top extract- key Start time */
	struct dn_heap7 idle_heap ; /* random extract - key Start=Finish time */

	int64_t V ;          /* virtual time */
	int sum;            /* sum of weights of all active sessions */

	/* Same as in dn_flow_queue, numbytes can become large */
	int64_t numbytes;       /* bits I can transmit (more or less). */
	uint64_t burst;     /* burst size, scaled: bits * hz */

	int64_t sched_time ;     /* time pipe was scheduled in ready_heap */
	int64_t idle_time;       /* start of pipe idle time */

	char if_name[IFNAMSIZ];
	struct ifnet *ifp ;
	int ready ; /* set if ifp != NULL and we got a signal from it */

	struct dn_flow_set fs ; /* used with fixed-rate flows */

	/* fields to simulate a delay profile */
#define ED_MAX_NAME_LEN     32
	char name[ED_MAX_NAME_LEN];
	int loss_level;
	int samples_no;
	int *samples;
};

#define ED_MAX_SAMPLES_NO   1024
/*
 * A dn_pipe8 immediately followed by room for the profile samples;
 * dn_compat_config_profile() reads the samples from this trailing array.
 */
struct dn_pipe_max8 {
	struct dn_pipe8 pipe;
	int samples[ED_MAX_SAMPLES_NO];
};
SLIST_HEAD(dn_pipe_head8, dn_pipe8);

/*
 * Changes from 7.2 to 8:
 * dn_pipe:
 *      numbytes from int to int64_t
 *      add burst (int64_t)
 *      add idle_time (int64_t)
 *      add profile
 *      add struct dn_pipe_max
 *      add flag DN_HAS_PROFILE
 *
 * dn_flow_queue
 *      numbytes from u_long to int64_t
 *      add extra_bits (int64_t)
 *      q_time from u_int32_t to int64_t and name idle_time
 *
 * dn_flow_set unchanged
 *
 */

/* NOTE:XXX copied from dummynet.c */
/* Address 'len' bytes past p, as an untyped pointer. */
#define O_NEXT(p, len) ((void *)((char *)p + len))

/* Initialize an object header: length, type and id are set, subtype is 0. */
static void
oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
{
	oid->len = len;
	oid->type = type;
	oid->subtype = 0;
	oid->id = id;
}
/* make room in the buffer and move the pointer forward */
/*
 * Returns the object that starts at the old *o (header filled with len and
 * type, id 0) and advances *o by len bytes. No bounds check is done here;
 * the caller must have sized the buffer.
 */
static void *
o_next(struct dn_id **o, int len, int type)
{
	struct dn_id *ret = *o;
	oid_fill(ret, len, type, 0);
	*o = O_NEXT(*o, len);
	return ret;
}


/* Request sizes used by ip_dummynet_compat() to recognize the ipfw version. */
static size_t pipesize7 = sizeof(struct dn_pipe7);
static size_t pipesize8 = sizeof(struct dn_pipe8);
static size_t pipesizemax8 = sizeof(struct dn_pipe_max8);

/* Indicate 'ipfw' version
 * 1: from FreeBSD 7.2
 * 0: from FreeBSD 8
 * -1: unknown (for now is unused)
 *
 * It is updated when an IP_DUMMYNET_DEL or IP_DUMMYNET_CONFIGURE request arrives
 * NOTE: if an IP_DUMMYNET_GET arrives and the 'ipfw' version is unknown,
 *       it is supposed to be the FreeBSD 8 version.
*/ static int is7 = 0; static int convertflags2new(int src) { int dst = 0; if (src & DNOLD_HAVE_FLOW_MASK) dst |= DN_HAVE_MASK; if (src & DNOLD_QSIZE_IS_BYTES) dst |= DN_QSIZE_BYTES; if (src & DNOLD_NOERROR) dst |= DN_NOERROR; if (src & DNOLD_IS_RED) dst |= DN_IS_RED; if (src & DNOLD_IS_GENTLE_RED) dst |= DN_IS_GENTLE_RED; if (src & DNOLD_HAS_PROFILE) dst |= DN_HAS_PROFILE; return dst; } static int convertflags2old(int src) { int dst = 0; if (src & DN_HAVE_MASK) dst |= DNOLD_HAVE_FLOW_MASK; if (src & DN_IS_RED) dst |= DNOLD_IS_RED; if (src & DN_IS_GENTLE_RED) dst |= DNOLD_IS_GENTLE_RED; if (src & DN_NOERROR) dst |= DNOLD_NOERROR; if (src & DN_HAS_PROFILE) dst |= DNOLD_HAS_PROFILE; if (src & DN_QSIZE_BYTES) dst |= DNOLD_QSIZE_IS_BYTES; return dst; } static int dn_compat_del(void *v) { struct dn_pipe7 *p = (struct dn_pipe7 *) v; struct dn_pipe8 *p8 = (struct dn_pipe8 *) v; struct { struct dn_id oid; uintptr_t a[1]; /* add more if we want a list */ } cmd; /* XXX DN_API_VERSION ??? */ oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); if (is7) { if (p->pipe_nr == 0 && p->fs.fs_nr == 0) return EINVAL; if (p->pipe_nr != 0 && p->fs.fs_nr != 0) return EINVAL; } else { if (p8->pipe_nr == 0 && p8->fs.fs_nr == 0) return EINVAL; if (p8->pipe_nr != 0 && p8->fs.fs_nr != 0) return EINVAL; } if (p->pipe_nr != 0) { /* pipe x delete */ cmd.a[0] = p->pipe_nr; cmd.oid.subtype = DN_LINK; } else { /* queue x delete */ cmd.oid.subtype = DN_FS; cmd.a[0] = (is7) ? 
p->fs.fs_nr : p8->fs.fs_nr; } return do_config(&cmd, cmd.oid.len); } static int dn_compat_config_queue(struct dn_fs *fs, void* v) { struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; struct dn_flow_set *f; if (is7) f = &p7->fs; else f = &p8->fs; fs->fs_nr = f->fs_nr; fs->sched_nr = f->parent_nr; fs->flow_mask = f->flow_mask; fs->buckets = f->rq_size; fs->qsize = f->qsize; fs->plr = f->plr; fs->par[0] = f->weight; fs->flags = convertflags2new(f->flags_fs); if (fs->flags & DN_IS_GENTLE_RED || fs->flags & DN_IS_RED) { fs->w_q = f->w_q; fs->max_th = f->max_th; fs->min_th = f->min_th; fs->max_p = f->max_p; } return 0; } static int dn_compat_config_pipe(struct dn_sch *sch, struct dn_link *p, struct dn_fs *fs, void* v) { struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; int i = p7->pipe_nr; sch->sched_nr = i; sch->oid.subtype = 0; p->link_nr = i; fs->fs_nr = i + 2*DN_MAX_ID; fs->sched_nr = i + DN_MAX_ID; /* Common to 7 and 8 */ p->bandwidth = p7->bandwidth; p->delay = p7->delay; if (!is7) { /* FreeBSD 8 has burst */ p->burst = p8->burst; } /* fill the fifo flowset */ dn_compat_config_queue(fs, v); fs->fs_nr = i + 2*DN_MAX_ID; fs->sched_nr = i + DN_MAX_ID; /* Move scheduler related parameter from fs to sch */ sch->buckets = fs->buckets; /*XXX*/ fs->buckets = 0; if (fs->flags & DN_HAVE_MASK) { sch->flags |= DN_HAVE_MASK; fs->flags &= ~DN_HAVE_MASK; sch->sched_mask = fs->flow_mask; bzero(&fs->flow_mask, sizeof(struct ipfw_flow_id)); } return 0; } static int dn_compat_config_profile(struct dn_profile *pf, struct dn_link *p, void *v) { struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; p8->samples = &(((struct dn_pipe_max8 *)p8)->samples[0]); pf->link_nr = p->link_nr; pf->loss_level = p8->loss_level; // pf->bandwidth = p->bandwidth; //XXX bandwidth redundant? 
pf->samples_no = p8->samples_no; strncpy(pf->name, p8->name,sizeof(pf->name)); bcopy(p8->samples, pf->samples, sizeof(pf->samples)); return 0; } /* * If p->pipe_nr != 0 the command is 'pipe x config', so need to create * the three main struct, else only a flowset is created */ static int dn_compat_configure(void *v) { struct dn_id *buf = NULL, *base; struct dn_sch *sch = NULL; struct dn_link *p = NULL; struct dn_fs *fs = NULL; struct dn_profile *pf = NULL; int lmax; int error; struct dn_pipe7 *p7 = (struct dn_pipe7 *)v; struct dn_pipe8 *p8 = (struct dn_pipe8 *)v; int i; /* number of object to configure */ lmax = sizeof(struct dn_id); /* command header */ lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + sizeof(struct dn_fs) + sizeof(struct dn_profile); base = buf = malloc(lmax, M_DUMMYNET, M_WAIT|M_ZERO); o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); base->id = DN_API_VERSION; /* pipe_nr is the same in p7 and p8 */ i = p7->pipe_nr; if (i != 0) { /* pipe config */ sch = o_next(&buf, sizeof(*sch), DN_SCH); p = o_next(&buf, sizeof(*p), DN_LINK); fs = o_next(&buf, sizeof(*fs), DN_FS); error = dn_compat_config_pipe(sch, p, fs, v); if (error) { free(buf, M_DUMMYNET); return error; } if (!is7 && p8->samples_no > 0) { /* Add profiles*/ pf = o_next(&buf, sizeof(*pf), DN_PROFILE); error = dn_compat_config_profile(pf, p, v); if (error) { free(buf, M_DUMMYNET); return error; } } } else { /* queue config */ fs = o_next(&buf, sizeof(*fs), DN_FS); error = dn_compat_config_queue(fs, v); if (error) { free(buf, M_DUMMYNET); return error; } } error = do_config(base, (char *)buf - (char *)base); if (buf) free(buf, M_DUMMYNET); return error; } int dn_compat_calc_size(void) { int need = 0; /* XXX use FreeBSD 8 struct size */ /* NOTE: * - half scheduler: schk_count/2 * - all flowset: fsk_count * - all flowset queues: queue_count * - all pipe queue: si_count */ need += dn_cfg.schk_count * sizeof(struct dn_pipe8) / 2; need += dn_cfg.fsk_count * sizeof(struct dn_flow_set); need 
+= dn_cfg.si_count * sizeof(struct dn_flow_queue8); need += dn_cfg.queue_count * sizeof(struct dn_flow_queue8); return need; } int dn_c_copy_q (void *_ni, void *arg) { struct copy_args *a = arg; struct dn_flow_queue7 *fq7 = (struct dn_flow_queue7 *)*a->start; struct dn_flow_queue8 *fq8 = (struct dn_flow_queue8 *)*a->start; struct dn_flow *ni = (struct dn_flow *)_ni; int size = 0; /* XXX hash slot not set */ /* No difference between 7.2/8 */ fq7->len = ni->length; fq7->len_bytes = ni->len_bytes; fq7->id = ni->fid; if (is7) { size = sizeof(struct dn_flow_queue7); fq7->tot_pkts = ni->tot_pkts; fq7->tot_bytes = ni->tot_bytes; fq7->drops = ni->drops; } else { size = sizeof(struct dn_flow_queue8); fq8->tot_pkts = ni->tot_pkts; fq8->tot_bytes = ni->tot_bytes; fq8->drops = ni->drops; } *a->start += size; return 0; } int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq) { struct dn_link *l = &s->link; struct dn_fsk *f = s->fs; struct dn_pipe7 *pipe7 = (struct dn_pipe7 *)*a->start; struct dn_pipe8 *pipe8 = (struct dn_pipe8 *)*a->start; struct dn_flow_set *fs; int size = 0; if (is7) { fs = &pipe7->fs; size = sizeof(struct dn_pipe7); } else { fs = &pipe8->fs; size = sizeof(struct dn_pipe8); } /* These 4 field are the same in pipe7 and pipe8 */ pipe7->next.sle_next = (struct dn_pipe7 *)DN_IS_PIPE; pipe7->bandwidth = l->bandwidth; pipe7->delay = l->delay * 1000 / hz; pipe7->pipe_nr = l->link_nr - DN_MAX_ID; if (!is7) { if (s->profile) { struct dn_profile *pf = s->profile; strncpy(pipe8->name, pf->name, sizeof(pf->name)); pipe8->loss_level = pf->loss_level; pipe8->samples_no = pf->samples_no; } pipe8->burst = div64(l->burst , 8 * hz); } fs->flow_mask = s->sch.sched_mask; fs->rq_size = s->sch.buckets ? 
s->sch.buckets : 1; fs->parent_nr = l->link_nr - DN_MAX_ID; fs->qsize = f->fs.qsize; fs->plr = f->fs.plr; fs->w_q = f->fs.w_q; fs->max_th = f->max_th; fs->min_th = f->min_th; fs->max_p = f->fs.max_p; fs->rq_elements = nq; fs->flags_fs = convertflags2old(f->fs.flags); *a->start += size; return 0; } int dn_compat_copy_pipe(struct copy_args *a, void *_o) { int have = a->end - *a->start; int need = 0; int pipe_size = sizeof(struct dn_pipe8); int queue_size = sizeof(struct dn_flow_queue8); int n_queue = 0; /* number of queues */ struct dn_schk *s = (struct dn_schk *)_o; /* calculate needed space: * - struct dn_pipe * - if there are instances, dn_queue * n_instances */ n_queue = (s->sch.flags & DN_HAVE_MASK ? dn_ht_entries(s->siht) : (s->siht ? 1 : 0)); need = pipe_size + queue_size * n_queue; if (have < need) { D("have %d < need %d", have, need); return 1; } /* copy pipe */ dn_c_copy_pipe(s, a, n_queue); /* copy queues */ if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, dn_c_copy_q, a); else if (s->siht) dn_c_copy_q(s->siht, a); return 0; } int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq) { struct dn_flow_set *fs = (struct dn_flow_set *)*a->start; fs->next.sle_next = (struct dn_flow_set *)DN_IS_QUEUE; fs->fs_nr = f->fs.fs_nr; fs->qsize = f->fs.qsize; fs->plr = f->fs.plr; fs->w_q = f->fs.w_q; fs->max_th = f->max_th; fs->min_th = f->min_th; fs->max_p = f->fs.max_p; fs->flow_mask = f->fs.flow_mask; fs->rq_elements = nq; fs->rq_size = (f->fs.buckets ? f->fs.buckets : 1); fs->parent_nr = f->fs.sched_nr; fs->weight = f->fs.par[0]; fs->flags_fs = convertflags2old(f->fs.flags); *a->start += sizeof(struct dn_flow_set); return 0; } int dn_compat_copy_queue(struct copy_args *a, void *_o) { int have = a->end - *a->start; int need = 0; int fs_size = sizeof(struct dn_flow_set); int queue_size = sizeof(struct dn_flow_queue8); struct dn_fsk *fs = (struct dn_fsk *)_o; int n_queue = 0; /* number of queues */ n_queue = (fs->fs.flags & DN_HAVE_MASK ? 
dn_ht_entries(fs->qht) : (fs->qht ? 1 : 0)); need = fs_size + queue_size * n_queue; if (have < need) { D("have < need"); return 1; } /* copy flowset */ dn_c_copy_fs(fs, a, n_queue); /* copy queues */ if (fs->fs.flags & DN_HAVE_MASK) dn_ht_scan(fs->qht, dn_c_copy_q, a); else if (fs->qht) dn_c_copy_q(fs->qht, a); return 0; } int copy_data_helper_compat(void *_o, void *_arg) { struct copy_args *a = _arg; if (a->type == DN_COMPAT_PIPE) { struct dn_schk *s = _o; if (s->sch.oid.subtype != 1 || s->sch.sched_nr <= DN_MAX_ID) { return 0; /* not old type */ } /* copy pipe parameters, and if instance exists, copy * other parameters and eventually queues. */ if(dn_compat_copy_pipe(a, _o)) return DNHT_SCAN_END; } else if (a->type == DN_COMPAT_QUEUE) { struct dn_fsk *fs = _o; if (fs->fs.fs_nr >= DN_MAX_ID) return 0; if (dn_compat_copy_queue(a, _o)) return DNHT_SCAN_END; } return 0; } /* Main function to manage old requests */ int ip_dummynet_compat(struct sockopt *sopt) { int error=0; void *v = NULL; struct dn_id oid; /* Lenght of data, used to found ipfw version... 
*/ int len = sopt->sopt_valsize; /* len can be 0 if command was dummynet_flush */ if (len == pipesize7) { D("setting compatibility with FreeBSD 7.2"); is7 = 1; } else if (len == pipesize8 || len == pipesizemax8) { D("setting compatibility with FreeBSD 8"); is7 = 0; } switch (sopt->sopt_name) { default: printf("dummynet: -- unknown option %d", sopt->sopt_name); error = EINVAL; break; case IP_DUMMYNET_FLUSH: oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); do_config(&oid, oid.len); break; case IP_DUMMYNET_DEL: v = malloc(len, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, v, len, len); if (error) break; error = dn_compat_del(v); free(v, M_TEMP); break; case IP_DUMMYNET_CONFIGURE: v = malloc(len, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, v, len, len); if (error) break; error = dn_compat_configure(v); free(v, M_TEMP); break; case IP_DUMMYNET_GET: { void *buf; int ret; int original_size = sopt->sopt_valsize; int size; ret = dummynet_get(sopt, &buf); if (ret) return 0;//XXX ? size = sopt->sopt_valsize; sopt->sopt_valsize = original_size; D("size=%d, buf=%p", size, buf); ret = sooptcopyout(sopt, buf, size); if (ret) printf(" %s ERROR sooptcopyout\n", __FUNCTION__); if (buf) free(buf, M_DUMMYNET); } } return error; } ipfw-user/sys/netinet/ipfw/ip_fw_log.c000644 000423 000000 00000033126 12007461121 020576 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_log.c 238978 2012-08-01 18:52:07Z luigi $"); /* * Logging support for ipfw */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include /* for IFT_ETHER */ #include /* for BPF */ #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include /* ip6_sprintf() */ #endif #ifdef MAC #include #endif /* * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) #ifdef __APPLE__ #undef snprintf #define snprintf sprintf #define SNPARGS(buf, len) buf + len #define SNP(buf) buf #else /* !__APPLE__ */ #define SNPARGS(buf, len) buf + len, sizeof(buf) > len ? 
sizeof(buf) - len : 0 #define SNP(buf) buf, sizeof(buf) #endif /* !__APPLE__ */ #ifdef WITHOUT_BPF void ipfw_log_bpf(int onoff) { } #else /* !WITHOUT_BPF */ static struct ifnet *log_if; /* hook to attach to bpf */ static struct rwlock log_if_lock; #define LOGIF_LOCK_INIT(x) rw_init(&log_if_lock, "ipfw log_if lock") #define LOGIF_LOCK_DESTROY(x) rw_destroy(&log_if_lock) #define LOGIF_RLOCK(x) rw_rlock(&log_if_lock) #define LOGIF_RUNLOCK(x) rw_runlock(&log_if_lock) #define LOGIF_WLOCK(x) rw_wlock(&log_if_lock) #define LOGIF_WUNLOCK(x) rw_wunlock(&log_if_lock) #define IPFWNAME "ipfw" /* we use this dummy function for all ifnet callbacks */ static int log_dummy(struct ifnet *ifp, u_long cmd, caddr_t addr) { return EINVAL; } static int ipfw_log_output(struct ifnet *ifp, struct mbuf *m, struct sockaddr *dst, struct route *ro) { if (m != NULL) FREE_PKT(m); return EINVAL; } static void ipfw_log_start(struct ifnet* ifp) { panic("ipfw_log_start() must not be called"); } static const u_char ipfwbroadcastaddr[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; static int ipfw_log_clone_match(struct if_clone *ifc, const char *name) { return (strncmp(name, IPFWNAME, sizeof(IPFWNAME) - 1) == 0); } static int ipfw_log_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params) { int error; int unit; struct ifnet *ifp; error = ifc_name2unit(name, &unit); if (error) return (error); error = ifc_alloc_unit(ifc, &unit); if (error) return (error); ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { ifc_free_unit(ifc, unit); return (ENOSPC); } ifp->if_dname = IPFWNAME; ifp->if_dunit = unit; snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", IPFWNAME, unit); strlcpy(name, ifp->if_xname, len); ifp->if_mtu = 65536; ifp->if_flags = IFF_UP | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_init = (void *)log_dummy; ifp->if_ioctl = log_dummy; ifp->if_start = ipfw_log_start; ifp->if_output = ipfw_log_output; ifp->if_addrlen = 6; ifp->if_hdrlen = 14; ifp->if_broadcastaddr = ipfwbroadcastaddr; ifp->if_baudrate = 
IF_Mbps(10); LOGIF_WLOCK(); if (log_if == NULL) log_if = ifp; else { LOGIF_WUNLOCK(); if_free(ifp); ifc_free_unit(ifc, unit); return (EEXIST); } LOGIF_WUNLOCK(); if_attach(ifp); bpfattach(ifp, DLT_EN10MB, 14); return (0); } static int ipfw_log_clone_destroy(struct if_clone *ifc, struct ifnet *ifp) { int unit; if (ifp == NULL) return (0); LOGIF_WLOCK(); if (log_if != NULL && ifp == log_if) log_if = NULL; else { LOGIF_WUNLOCK(); return (EINVAL); } LOGIF_WUNLOCK(); unit = ifp->if_dunit; bpfdetach(ifp); if_detach(ifp); if_free(ifp); ifc_free_unit(ifc, unit); return (0); } static struct if_clone ipfw_log_cloner = IFC_CLONE_INITIALIZER( IPFWNAME, NULL, IF_MAXUNIT, NULL, ipfw_log_clone_match, ipfw_log_clone_create, ipfw_log_clone_destroy); void ipfw_log_bpf(int onoff) { if (onoff) { LOGIF_LOCK_INIT(); if_clone_attach(&ipfw_log_cloner); } else { if_clone_detach(&ipfw_log_cloner); LOGIF_LOCK_DESTROY(); } } #endif /* !WITHOUT_BPF */ /* * We enter here when we have a rule with O_LOG. * XXX this function alone takes about 2Kbytes of code! */ void ipfw_log(struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip) { char *action; int limit_reached = 0; char action2[92], proto[128], fragment[32]; if (V_fw_verbose == 0) { #ifndef WITHOUT_BPF LOGIF_RLOCK(); if (log_if == NULL || log_if->if_bpf == NULL) { LOGIF_RUNLOCK(); return; } if (args->eh) /* layer2, use orig hdr */ BPF_MTAP2(log_if, args->eh, ETHER_HDR_LEN, m); else /* Add fake header. Later we will store * more info in the header. 
*/ BPF_MTAP2(log_if, "DDDDDDSSSSSS\x08\x00", ETHER_HDR_LEN, m); LOGIF_RUNLOCK(); #endif /* !WITHOUT_BPF */ return; } /* the old 'log' function */ fragment[0] = '\0'; proto[0] = '\0'; if (f == NULL) { /* bogus pkt */ if (V_verbose_limit != 0 && V_norule_counter >= V_verbose_limit) return; V_norule_counter++; if (V_norule_counter == V_verbose_limit) limit_reached = V_verbose_limit; action = "Refuse"; } else { /* O_LOG is the first action, find the real one */ ipfw_insn *cmd = ACTION_PTR(f); ipfw_insn_log *l = (ipfw_insn_log *)cmd; if (l->max_log != 0 && l->log_left == 0) return; l->log_left--; if (l->log_left == 0) limit_reached = l->max_log; cmd += F_LEN(cmd); /* point to first action */ if (cmd->opcode == O_ALTQ) { ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; snprintf(SNPARGS(action2, 0), "Altq %d", altq->qid); cmd += F_LEN(cmd); } if (cmd->opcode == O_PROB) cmd += F_LEN(cmd); if (cmd->opcode == O_TAG) cmd += F_LEN(cmd); action = action2; switch (cmd->opcode) { case O_DENY: action = "Deny"; break; case O_REJECT: if (cmd->arg1==ICMP_REJECT_RST) action = "Reset"; else if (cmd->arg1==ICMP_UNREACH_HOST) action = "Reject"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_UNREACH6: if (cmd->arg1==ICMP6_UNREACH_RST) action = "Reset"; else snprintf(SNPARGS(action2, 0), "Unreach %d", cmd->arg1); break; case O_ACCEPT: action = "Accept"; break; case O_COUNT: action = "Count"; break; case O_DIVERT: snprintf(SNPARGS(action2, 0), "Divert %d", cmd->arg1); break; case O_TEE: snprintf(SNPARGS(action2, 0), "Tee %d", cmd->arg1); break; case O_SETFIB: snprintf(SNPARGS(action2, 0), "SetFib %d", cmd->arg1); break; case O_SKIPTO: snprintf(SNPARGS(action2, 0), "SkipTo %d", cmd->arg1); break; case O_PIPE: snprintf(SNPARGS(action2, 0), "Pipe %d", cmd->arg1); break; case O_QUEUE: snprintf(SNPARGS(action2, 0), "Queue %d", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *sa = (ipfw_insn_sa *)cmd; int len; struct in_addr dummyaddr; if (sa->sa.sin_addr.s_addr == 
INADDR_ANY) dummyaddr.s_addr = htonl(tablearg); else dummyaddr.s_addr = sa->sa.sin_addr.s_addr; len = snprintf(SNPARGS(action2, 0), "Forward to %s", inet_ntoa(dummyaddr)); if (sa->sa.sin_port) snprintf(SNPARGS(action2, len), ":%d", sa->sa.sin_port); } break; #ifdef INET6 case O_FORWARD_IP6: { char buf[INET6_ADDRSTRLEN]; ipfw_insn_sa6 *sa = (ipfw_insn_sa6 *)cmd; int len; len = snprintf(SNPARGS(action2, 0), "Forward to [%s]", ip6_sprintf(buf, &sa->sa.sin6_addr)); if (sa->sa.sin6_port) snprintf(SNPARGS(action2, len), ":%u", sa->sa.sin6_port); } break; #endif case O_NETGRAPH: snprintf(SNPARGS(action2, 0), "Netgraph %d", cmd->arg1); break; case O_NGTEE: snprintf(SNPARGS(action2, 0), "Ngtee %d", cmd->arg1); break; case O_NAT: action = "Nat"; break; case O_REASS: action = "Reass"; break; case O_CALLRETURN: if (cmd->len & F_NOT) action = "Return"; else snprintf(SNPARGS(action2, 0), "Call %d", cmd->arg1); break; default: action = "UNKNOWN"; break; } } if (hlen == 0) { /* non-ip */ snprintf(SNPARGS(proto, 0), "MAC"); } else { int len; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif struct icmphdr *icmp; struct tcphdr *tcp; struct udphdr *udp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; struct icmp6_hdr *icmp6; u_short ip6f_mf; #endif src[0] = '\0'; dst[0] = '\0'; #ifdef INET6 ip6f_mf = offset & IP6F_MORE_FRAG; offset &= IP6F_OFF_MASK; if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); ip6 = (struct ip6_hdr *)ip; tcp = (struct tcphdr *)(((char *)ip) + hlen); udp = (struct udphdr *)(((char *)ip) + hlen); } else #endif { tcp = L3HDR(struct tcphdr, ip); udp = L3HDR(struct udphdr, ip); inet_ntop(AF_INET, &ip->ip_src, src, sizeof(src)); inet_ntop(AF_INET, &ip->ip_dst, dst, sizeof(dst)); } switch (args->f_id.proto) { case 
IPPROTO_TCP: len = snprintf(SNPARGS(proto, 0), "TCP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(tcp->th_sport), dst, ntohs(tcp->th_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_UDP: len = snprintf(SNPARGS(proto, 0), "UDP %s", src); if (offset == 0) snprintf(SNPARGS(proto, len), ":%d %s:%d", ntohs(udp->uh_sport), dst, ntohs(udp->uh_dport)); else snprintf(SNPARGS(proto, len), " %s", dst); break; case IPPROTO_ICMP: icmp = L3HDR(struct icmphdr, ip); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMP:%u.%u ", icmp->icmp_type, icmp->icmp_code); else len = snprintf(SNPARGS(proto, 0), "ICMP "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #ifdef INET6 case IPPROTO_ICMPV6: icmp6 = (struct icmp6_hdr *)(((char *)ip) + hlen); if (offset == 0) len = snprintf(SNPARGS(proto, 0), "ICMPv6:%u.%u ", icmp6->icmp6_type, icmp6->icmp6_code); else len = snprintf(SNPARGS(proto, 0), "ICMPv6 "); len += snprintf(SNPARGS(proto, len), "%s", src); snprintf(SNPARGS(proto, len), " %s", dst); break; #endif default: len = snprintf(SNPARGS(proto, 0), "P:%d %s", args->f_id.proto, src); snprintf(SNPARGS(proto, len), " %s", dst); break; } #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { if (offset & (IP6F_OFF_MASK | IP6F_MORE_FRAG)) snprintf(SNPARGS(fragment, 0), " (frag %08x:%d@%d%s)", args->f_id.extra, ntohs(ip6->ip6_plen) - hlen, ntohs(offset) << 3, ip6f_mf ? "+" : ""); } else #endif { int ipoff, iplen; ipoff = ntohs(ip->ip_off); iplen = ntohs(ip->ip_len); if (ipoff & (IP_MF | IP_OFFMASK)) snprintf(SNPARGS(fragment, 0), " (frag %d:%d@%d%s)", ntohs(ip->ip_id), iplen - (ip->ip_hl << 2), offset << 3, (ipoff & IP_MF) ? "+" : ""); } } #ifdef __FreeBSD__ if (oif || m->m_pkthdr.rcvif) log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s %s via %s%s\n", f ? f->rulenum : -1, action, proto, oif ? "out" : "in", oif ? 
oif->if_xname : m->m_pkthdr.rcvif->if_xname, fragment); else #endif log(LOG_SECURITY | LOG_INFO, "ipfw: %d %s %s [no if info]%s\n", f ? f->rulenum : -1, action, proto, fragment); if (limit_reached) log(LOG_SECURITY | LOG_NOTICE, "ipfw: limit %d reached on entry %d\n", limit_reached, f ? f->rulenum : -1); } /* end of file */ ipfw-user/sys/netinet/ipfw/ip_fw_sockopt.c000644 000423 000000 00000106504 12007435564 021514 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Supported by: Valeria Paoli * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_sockopt.c 233745 2012-03-31 11:20:48Z glebius $"); /* * Sockopt support for ipfw. 
The routines here implement * the upper half of the ipfw code. */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include /* struct m_tag used by nested headers */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* hooks */ #include #include #ifdef MAC #include #endif MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's"); /* * static variables followed by global ones (none in this file) */ /* * Find the smallest rule >= key, id. * We could use bsearch but it is so simple that we code it directly */ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id) { int i, lo, hi; struct ip_fw *r; for (lo = 0, hi = chain->n_rules - 1; lo < hi;) { i = (lo + hi) / 2; r = chain->map[i]; if (r->rulenum < key) lo = i + 1; /* continue from the next one */ else if (r->rulenum > key) hi = i; /* this might be good */ else if (r->id < id) lo = i + 1; /* continue from the next one */ else /* r->id >= id */ hi = i; /* this might be good */ }; return hi; } /* * allocate a new map, returns the chain locked. extra is the number * of entries to add or delete. */ static struct ip_fw ** get_map(struct ip_fw_chain *chain, int extra, int locked) { for (;;) { struct ip_fw **map; int i; i = chain->n_rules + extra; map = malloc(i * sizeof(struct ip_fw *), M_IPFW, locked ? M_NOWAIT : M_WAITOK); if (map == NULL) { printf("%s: cannot allocate map\n", __FUNCTION__); return NULL; } if (!locked) IPFW_UH_WLOCK(chain); if (i >= chain->n_rules + extra) /* good */ return map; /* otherwise we lost the race, free and retry */ if (!locked) IPFW_UH_WUNLOCK(chain); free(map, M_IPFW); } } /* * swap the maps. 
It is supposed to be called with IPFW_UH_WLOCK */ static struct ip_fw ** swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len) { struct ip_fw **old_map; IPFW_WLOCK(chain); chain->id++; chain->n_rules = new_len; old_map = chain->map; chain->map = new_map; IPFW_WUNLOCK(chain); return old_map; } /* * Add a new rule to the list. Copy the rule into a malloc'ed area, then * possibly create a rule number and add the rule to the list. * Update the rule_number in the input struct so the caller knows it as well. * XXX DO NOT USE FOR THE DEFAULT RULE. * Must be called without IPFW_UH held */ int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule) { struct ip_fw *rule; int i, l, insert_before; struct ip_fw **map; /* the new array of pointers */ if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1) return (EINVAL); l = RULESIZE(input_rule); rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO); /* get_map returns with IPFW_UH_WLOCK if successful */ map = get_map(chain, 1, 0 /* not locked */); if (map == NULL) { free(rule, M_IPFW); return ENOSPC; } bcopy(input_rule, rule, l); /* clear fields not settable from userland */ rule->x_next = NULL; rule->next_rule = NULL; rule->pcnt = 0; rule->bcnt = 0; rule->timestamp = 0; if (V_autoinc_step < 1) V_autoinc_step = 1; else if (V_autoinc_step > 1000) V_autoinc_step = 1000; /* find the insertion point, we will insert before */ insert_before = rule->rulenum ? rule->rulenum + 1 : IPFW_DEFAULT_RULE; i = ipfw_find_rule(chain, insert_before, 0); /* duplicate first part */ if (i > 0) bcopy(chain->map, map, i * sizeof(struct ip_fw *)); map[i] = rule; /* duplicate remaining part, we always have the default rule */ bcopy(chain->map + i, map + i + 1, sizeof(struct ip_fw *) *(chain->n_rules - i)); if (rule->rulenum == 0) { /* write back the number */ rule->rulenum = i > 0 ? 
map[i-1]->rulenum : 0; if (rule->rulenum < IPFW_DEFAULT_RULE - V_autoinc_step) rule->rulenum += V_autoinc_step; input_rule->rulenum = rule->rulenum; } rule->id = chain->id + 1; map = swap_map(chain, map, chain->n_rules + 1); chain->static_len += l; IPFW_UH_WUNLOCK(chain); if (map) free(map, M_IPFW); return (0); } /* * Reclaim storage associated with a list of rules. This is * typically the list created using remove_rule. * A NULL pointer on input is handled correctly. */ void ipfw_reap_rules(struct ip_fw *head) { struct ip_fw *rule; while ((rule = head) != NULL) { head = head->x_next; free(rule, M_IPFW); } } /* * Used by del_entry() to check if a rule should be kept. * Returns 1 if the rule must be kept, 0 otherwise. * * Called with cmd = {0,1,5}. * cmd == 0 matches on rule numbers, excludes rules in RESVD_SET if n == 0 ; * cmd == 1 matches on set numbers only, rule numbers are ignored; * cmd == 5 matches on rule and set numbers. * * n == 0 is a wildcard for rule numbers, there is no wildcard for sets. * * Rules to keep are * (default || reserved || !match_set || !match_number) * where * default ::= (rule->rulenum == IPFW_DEFAULT_RULE) * // the default rule is always protected * * reserved ::= (cmd == 0 && n == 0 && rule->set == RESVD_SET) * // RESVD_SET is protected only if cmd == 0 and n == 0 ("ipfw flush") * * match_set ::= (cmd == 0 || rule->set == set) * // set number is ignored for cmd == 0 * * match_number ::= (cmd == 1 || n == 0 || n == rule->rulenum) * // number is ignored for cmd == 1 or n == 0 * */ static int keep_rule(struct ip_fw *rule, uint8_t cmd, uint8_t set, uint32_t n) { return (rule->rulenum == IPFW_DEFAULT_RULE) || (cmd == 0 && n == 0 && rule->set == RESVD_SET) || !(cmd == 0 || rule->set == set) || !(cmd == 1 || n == 0 || n == rule->rulenum); } /** * Remove all rules with given number, or do set manipulation. * Assumes chain != NULL && *chain != NULL. * * The argument is an uint32_t. 
The low 16 bit are the rule or set number; * the next 8 bits are the new set; the top 8 bits indicate the command: * * 0 delete rules numbered "rulenum" * 1 delete rules in set "rulenum" * 2 move rules "rulenum" to set "new_set" * 3 move rules from set "rulenum" to set "new_set" * 4 swap sets "rulenum" and "new_set" * 5 delete rules "rulenum" and set "new_set" */ static int del_entry(struct ip_fw_chain *chain, uint32_t arg) { struct ip_fw *rule; uint32_t num; /* rule number or old_set */ uint8_t cmd, new_set; int start, end, i, ofs, n; struct ip_fw **map = NULL; int error = 0; num = arg & 0xffff; cmd = (arg >> 24) & 0xff; new_set = (arg >> 16) & 0xff; if (cmd > 5 || new_set > RESVD_SET) return EINVAL; if (cmd == 0 || cmd == 2 || cmd == 5) { if (num >= IPFW_DEFAULT_RULE) return EINVAL; } else { if (num > RESVD_SET) /* old_set */ return EINVAL; } IPFW_UH_WLOCK(chain); /* arbitrate writers */ chain->reap = NULL; /* prepare for deletions */ switch (cmd) { case 0: /* delete rules "num" (num == 0 matches all) */ case 1: /* delete all rules in set N */ case 5: /* delete rules with number N and set "new_set". */ /* * Locate first rule to delete (start), the rule after * the last one to delete (end), and count how many * rules to delete (n). Always use keep_rule() to * determine which rules to keep. */ n = 0; if (cmd == 1) { /* look for a specific set including RESVD_SET. * Must scan the entire range, ignore num. */ new_set = num; for (start = -1, end = i = 0; i < chain->n_rules; i++) { if (keep_rule(chain->map[i], cmd, new_set, 0)) continue; if (start < 0) start = i; end = i; n++; } end++; /* first non-matching */ } else { /* Optimized search on rule numbers */ start = ipfw_find_rule(chain, num, 0); for (end = start; end < chain->n_rules; end++) { rule = chain->map[end]; if (num > 0 && rule->rulenum != num) break; if (!keep_rule(rule, cmd, new_set, num)) n++; } } if (n == 0) { /* A flush request (arg == 0 or cmd == 1) on empty * ruleset returns with no error. 
On the contrary, * if there is no match on a specific request, * we return EINVAL. */ if (arg != 0 && cmd != 1) error = EINVAL; break; } /* We have something to delete. Allocate the new map */ map = get_map(chain, -n, 1 /* locked */); if (map == NULL) { error = EINVAL; break; } /* 1. bcopy the initial part of the map */ if (start > 0) bcopy(chain->map, map, start * sizeof(struct ip_fw *)); /* 2. copy active rules between start and end */ for (i = ofs = start; i < end; i++) { rule = chain->map[i]; if (keep_rule(rule, cmd, new_set, num)) map[ofs++] = rule; } /* 3. copy the final part of the map */ bcopy(chain->map + end, map + ofs, (chain->n_rules - end) * sizeof(struct ip_fw *)); /* 4. swap the maps (under BH_LOCK) */ map = swap_map(chain, map, chain->n_rules - n); /* 5. now remove the rules deleted from the old map */ for (i = start; i < end; i++) { int l; rule = map[i]; if (keep_rule(rule, cmd, new_set, num)) continue; l = RULESIZE(rule); chain->static_len -= l; ipfw_remove_dyn_children(rule); rule->x_next = chain->reap; chain->reap = rule; } break; /* * In the next 3 cases the loop stops at (n_rules - 1) * because the default rule is never eligible.. */ case 2: /* move rules with given RULE number to new set */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->rulenum == num) rule->set = new_set; } break; case 3: /* move rules with given SET number to new set */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == num) rule->set = new_set; } break; case 4: /* swap two sets */ for (i = 0; i < chain->n_rules - 1; i++) { rule = chain->map[i]; if (rule->set == num) rule->set = new_set; else if (rule->set == new_set) rule->set = num; } break; } rule = chain->reap; chain->reap = NULL; IPFW_UH_WUNLOCK(chain); ipfw_reap_rules(rule); if (map) free(map, M_IPFW); return error; } /* * Clear counters for a specific rule. 
* Normally run under IPFW_UH_RLOCK, but these are idempotent ops * so we only care that rules do not disappear. */ static void clear_counters(struct ip_fw *rule, int log_only) { ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule); if (log_only == 0) { rule->bcnt = rule->pcnt = 0; rule->timestamp = 0; } if (l->o.opcode == O_LOG) l->log_left = l->max_log; } /** * Reset some or all counters on firewall rules. * The argument `arg' is an u_int32_t. The low 16 bit are the rule number, * the next 8 bits are the set number, the top 8 bits are the command: * 0 work with rules from all set's; * 1 work with rules only from specified set. * Specified rule number is zero if we want to clear all entries. * log_only is 1 if we only want to reset logs, zero otherwise. */ static int zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only) { struct ip_fw *rule; char *msg; int i; uint16_t rulenum = arg & 0xffff; uint8_t set = (arg >> 16) & 0xff; uint8_t cmd = (arg >> 24) & 0xff; if (cmd > 1) return (EINVAL); if (cmd == 1 && set > RESVD_SET) return (EINVAL); IPFW_UH_RLOCK(chain); if (rulenum == 0) { V_norule_counter = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; /* Skip rules not in our set. */ if (cmd == 1 && rule->set != set) continue; clear_counters(rule, log_only); } msg = log_only ? "All logging counts reset" : "Accounting cleared"; } else { int cleared = 0; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (rule->rulenum == rulenum) { if (cmd == 0 || rule->set == set) clear_counters(rule, log_only); cleared = 1; } if (rule->rulenum > rulenum) break; } if (!cleared) { /* we did not find any matching rules */ IPFW_UH_RUNLOCK(chain); return (EINVAL); } msg = log_only ? 
"logging count reset" : "cleared"; } IPFW_UH_RUNLOCK(chain); if (V_fw_verbose) { int lev = LOG_SECURITY | LOG_NOTICE; if (rulenum) log(lev, "ipfw: Entry %d %s.\n", rulenum, msg); else log(lev, "ipfw: %s.\n", msg); } return (0); } /* * Check validity of the structure before insert. * Rules are simple, so this mostly need to check rule sizes. */ static int check_ipfw_struct(struct ip_fw *rule, int size) { int l, cmdlen = 0; int have_action=0; ipfw_insn *cmd; if (size < sizeof(*rule)) { printf("ipfw: rule too short\n"); return (EINVAL); } /* first, check for valid size */ l = RULESIZE(rule); if (l != size) { printf("ipfw: size mismatch (have %d want %d)\n", size, l); return (EINVAL); } if (rule->act_ofs >= rule->cmd_len) { printf("ipfw: bogus action offset (%u > %u)\n", rule->act_ofs, rule->cmd_len - 1); return (EINVAL); } /* * Now go for the individual checks. Very simple ones, basically only * instruction sizes. */ for (l = rule->cmd_len, cmd = rule->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (cmdlen > l) { printf("ipfw: opcode %d size truncated\n", cmd->opcode); return EINVAL; } switch (cmd->opcode) { case O_PROBE_STATE: case O_KEEP_STATE: case O_PROTO: case O_IP_SRC_ME: case O_IP_DST_ME: case O_LAYER2: case O_IN: case O_FRAG: case O_DIVERTED: case O_IPOPT: case O_IPTOS: case O_IPPRECEDENCE: case O_IPVER: case O_SOCKARG: case O_TCPFLAGS: case O_TCPOPTS: case O_ESTAB: case O_VERREVPATH: case O_VERSRCREACH: case O_ANTISPOOF: case O_IPSEC: #ifdef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: #endif case O_IP4: case O_TAG: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if (cmd->arg1 >= rt_numfibs) { printf("ipfw: invalid fib number %d\n", cmd->arg1); return EINVAL; } break; case O_SETFIB: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; if ((cmd->arg1 != IP_FW_TABLEARG) && (cmd->arg1 >= rt_numfibs)) { printf("ipfw: invalid fib number 
%d\n", cmd->arg1); return EINVAL; } goto check_action; case O_UID: case O_GID: case O_JAIL: case O_IP_SRC: case O_IP_DST: case O_TCPSEQ: case O_TCPACK: case O_PROB: case O_ICMPTYPE: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_LIMIT: if (cmdlen != F_INSN_SIZE(ipfw_insn_limit)) goto bad_size; break; case O_LOG: if (cmdlen != F_INSN_SIZE(ipfw_insn_log)) goto bad_size; ((ipfw_insn_log *)cmd)->log_left = ((ipfw_insn_log *)cmd)->max_log; break; case O_IP_SRC_MASK: case O_IP_DST_MASK: /* only odd command lengths */ if ( !(cmdlen & 1) || cmdlen > 31) goto bad_size; break; case O_IP_SRC_SET: case O_IP_DST_SET: if (cmd->arg1 == 0 || cmd->arg1 > 256) { printf("ipfw: invalid set size %d\n", cmd->arg1); return EINVAL; } if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + (cmd->arg1+31)/32 ) goto bad_size; break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (cmd->arg1 >= IPFW_TABLES_MAX) { printf("ipfw: invalid table number %d\n", cmd->arg1); return (EINVAL); } if (cmdlen != F_INSN_SIZE(ipfw_insn) && cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 && cmdlen != F_INSN_SIZE(ipfw_insn_u32)) goto bad_size; break; case O_MACADDR2: if (cmdlen != F_INSN_SIZE(ipfw_insn_mac)) goto bad_size; break; case O_NOP: case O_IPID: case O_IPTTL: case O_IPLEN: case O_TCPDATALEN: case O_TCPWIN: case O_TAGGED: if (cmdlen < 1 || cmdlen > 31) goto bad_size; break; case O_MAC_TYPE: case O_IP_SRCPORT: case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */ if (cmdlen < 2 || cmdlen > 31) goto bad_size; break; case O_RECV: case O_XMIT: case O_VIA: if (cmdlen != F_INSN_SIZE(ipfw_insn_if)) goto bad_size; break; case O_ALTQ: if (cmdlen != F_INSN_SIZE(ipfw_insn_altq)) goto bad_size; break; case O_PIPE: case O_QUEUE: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; goto check_action; case O_FORWARD_IP: #ifdef IPFIREWALL_FORWARD if (cmdlen != F_INSN_SIZE(ipfw_insn_sa)) goto bad_size; goto check_action; #else return EINVAL; #endif #ifdef INET6 case O_FORWARD_IP6: #ifdef IPFIREWALL_FORWARD if 
(cmdlen != F_INSN_SIZE(ipfw_insn_sa6)) goto bad_size; goto check_action; #else return (EINVAL); #endif #endif /* INET6 */ case O_DIVERT: case O_TEE: if (ip_divert_ptr == NULL) return EINVAL; else goto check_size; case O_NETGRAPH: case O_NGTEE: if (ng_ipfw_input_p == NULL) return EINVAL; else goto check_size; case O_NAT: if (!IPFW_NAT_LOADED) return EINVAL; if (cmdlen != F_INSN_SIZE(ipfw_insn_nat)) goto bad_size; goto check_action; case O_FORWARD_MAC: /* XXX not implemented yet */ case O_CHECK_STATE: case O_COUNT: case O_ACCEPT: case O_DENY: case O_REJECT: #ifdef INET6 case O_UNREACH6: #endif case O_SKIPTO: case O_REASS: case O_CALLRETURN: check_size: if (cmdlen != F_INSN_SIZE(ipfw_insn)) goto bad_size; check_action: if (have_action) { printf("ipfw: opcode %d, multiple actions" " not allowed\n", cmd->opcode); return EINVAL; } have_action = 1; if (l != cmdlen) { printf("ipfw: opcode %d, action must be" " last opcode\n", cmd->opcode); return EINVAL; } break; #ifdef INET6 case O_IP6_SRC: case O_IP6_DST: if (cmdlen != F_INSN_SIZE(struct in6_addr) + F_INSN_SIZE(ipfw_insn)) goto bad_size; break; case O_FLOW6ID: if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + ((ipfw_insn_u32 *)cmd)->o.arg1) goto bad_size; break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if ( !(cmdlen & 1) || cmdlen > 127) goto bad_size; break; case O_ICMP6TYPE: if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) ) goto bad_size; break; #endif default: switch (cmd->opcode) { #ifndef INET6 case O_IP6_SRC_ME: case O_IP6_DST_ME: case O_EXT_HDR: case O_IP6: case O_UNREACH6: case O_IP6_SRC: case O_IP6_DST: case O_FLOW6ID: case O_IP6_SRC_MASK: case O_IP6_DST_MASK: case O_ICMP6TYPE: printf("ipfw: no IPv6 support in kernel\n"); return EPROTONOSUPPORT; #endif default: printf("ipfw: opcode %d, unknown opcode\n", cmd->opcode); return EINVAL; } } } if (have_action == 0) { printf("ipfw: missing action\n"); return EINVAL; } return 0; bad_size: printf("ipfw: opcode %d size %d wrong\n", cmd->opcode, cmdlen); return EINVAL; } /* * 
Translation of requests for compatibility with FreeBSD 7.2/8. * a static variable tells us if we have an old client from userland, * and if necessary we translate requests and responses between the * two formats. */ static int is7 = 0; struct ip_fw7 { struct ip_fw7 *next; /* linked list of rules */ struct ip_fw7 *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ // #define RESVD_SET 31 /* set for default and persistent rules */ uint8_t _pad; /* padding */ // uint32_t id; /* rule id, only in v.8 */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; int convert_rule_to_7(struct ip_fw *rule); int convert_rule_to_8(struct ip_fw *rule); #ifndef RULESIZE7 #define RULESIZE7(rule) (sizeof(struct ip_fw7) + \ ((struct ip_fw7 *)(rule))->cmd_len * 4 - 4) #endif /* * Copy the static and dynamic rules to the supplied buffer * and return the amount of space actually used. * Must be run under IPFW_UH_RLOCK */ static size_t ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space) { char *bp = buf; char *ep = bp + space; struct ip_fw *rule, *dst; int l, i; time_t boot_seconds; boot_seconds = boottime.tv_sec; for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; if (is7) { /* Convert rule to FreeBSd 7.2 format */ l = RULESIZE7(rule); if (bp + l + sizeof(uint32_t) <= ep) { int error; bcopy(rule, bp, l + sizeof(uint32_t)); error = convert_rule_to_7((struct ip_fw *) bp); if (error) return 0; /*XXX correct? */ /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? 
*/ bcopy(&V_set_disable, &(((struct ip_fw7 *)bp)->next_rule), sizeof(V_set_disable)); if (((struct ip_fw7 *)bp)->timestamp) ((struct ip_fw7 *)bp)->timestamp += boot_seconds; bp += l; } continue; /* go to next rule */ } /* normal mode, don't touch rules */ l = RULESIZE(rule); if (bp + l > ep) { /* should not happen */ printf("overflow dumping static rules\n"); break; } dst = (struct ip_fw *)bp; bcopy(rule, dst, l); /* * XXX HACK. Store the disable mask in the "next" * pointer in a wild attempt to keep the ABI the same. * Why do we do this on EVERY rule? */ bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable)); if (dst->timestamp) dst->timestamp += boot_seconds; bp += l; } ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */ return (bp - (char *)buf); } #define IP_FW3_OPLENGTH(x) ((x)->sopt_valsize - sizeof(ip_fw3_opheader)) /** * {set|get}sockopt parser. */ int ipfw_ctl(struct sockopt *sopt) { #define RULE_MAXSIZE (256*sizeof(u_int32_t)) int error; size_t size, len, valsize; struct ip_fw *buf, *rule; struct ip_fw_chain *chain; u_int32_t rulenum[2]; uint32_t opt; char xbuf[128]; ip_fw3_opheader *op3 = NULL; error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW); if (error) return (error); /* * Disallow modifications in really-really secure mode, but still allow * the logging counters to be reset. */ if (sopt->sopt_name == IP_FW_ADD || (sopt->sopt_dir == SOPT_SET && sopt->sopt_name != IP_FW_RESETLOG)) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } chain = &V_layer3_chain; error = 0; /* Save original valsize before it is altered via sooptcopyin() */ valsize = sopt->sopt_valsize; if ((opt = sopt->sopt_name) == IP_FW3) { /* * Copy not less than sizeof(ip_fw3_opheader). * We hope any IP_FW3 command will fit into 128-byte buffer. 
*/ if ((error = sooptcopyin(sopt, xbuf, sizeof(xbuf), sizeof(ip_fw3_opheader))) != 0) return (error); op3 = (ip_fw3_opheader *)xbuf; opt = op3->opcode; } switch (opt) { case IP_FW_GET: /* * pass up a copy of the current rules. Static rules * come first (the last of which has number IPFW_DEFAULT_RULE), * followed by a possibly empty list of dynamic rule. * The last dynamic rule has NULL in the "next" field. * * Note that the calculated size is used to bound the * amount of data returned to the user. The rule set may * change between calculating the size and returning the * data in which case we'll just return what fits. */ for (;;) { int len = 0, want; size = chain->static_len; size += ipfw_dyn_len(); if (size >= sopt->sopt_valsize) break; buf = malloc(size, M_TEMP, M_WAITOK); IPFW_UH_RLOCK(chain); /* check again how much space we need */ want = chain->static_len + ipfw_dyn_len(); if (size >= want) len = ipfw_getrules(chain, buf, size); IPFW_UH_RUNLOCK(chain); if (size >= want) error = sooptcopyout(sopt, buf, len); free(buf, M_TEMP); if (size >= want) break; } break; case IP_FW_FLUSH: /* locking is done within del_entry() */ error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */ break; case IP_FW_ADD: rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, rule, RULE_MAXSIZE, sizeof(struct ip_fw7) ); /* * If the size of commands equals RULESIZE7 then we assume * a FreeBSD7.2 binary is talking to us (set is7=1). * is7 is persistent so the next 'ipfw list' command * will use this format. * NOTE: If wrong version is guessed (this can happen if * the first ipfw command is 'ipfw [pipe] list') * the ipfw binary may crash or loop infinitly... 
*/ if (sopt->sopt_valsize == RULESIZE7(rule)) { is7 = 1; error = convert_rule_to_8(rule); if (error) return error; if (error == 0) error = check_ipfw_struct(rule, RULESIZE(rule)); } else { is7 = 0; if (error == 0) error = check_ipfw_struct(rule, sopt->sopt_valsize); } if (error == 0) { /* locking is done within ipfw_add_rule() */ error = ipfw_add_rule(chain, rule); size = RULESIZE(rule); if (!error && sopt->sopt_dir == SOPT_GET) { if (is7) { error = convert_rule_to_7(rule); size = RULESIZE7(rule); if (error) return error; } error = sooptcopyout(sopt, rule, size); } } free(rule, M_TEMP); break; case IP_FW_DEL: /* * IP_FW_DEL is used for deleting single rules or sets, * and (ab)used to atomically manipulate sets. Argument size * is used to distinguish between the two: * sizeof(u_int32_t) * delete single rule or set of rules, * or reassign rules (or sets) to a different set. * 2*sizeof(u_int32_t) * atomic disable/enable sets. * first u_int32_t contains sets to be disabled, * second u_int32_t contains sets to be enabled. 
*/ error = sooptcopyin(sopt, rulenum, 2*sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; size = sopt->sopt_valsize; if (size == sizeof(u_int32_t) && rulenum[0] != 0) { /* delete or reassign, locking done in del_entry() */ error = del_entry(chain, rulenum[0]); } else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */ IPFW_UH_WLOCK(chain); V_set_disable = (V_set_disable | rulenum[0]) & ~rulenum[1] & ~(1<sopt_val != 0) { error = sooptcopyin(sopt, rulenum, sizeof(u_int32_t), sizeof(u_int32_t)); if (error) break; } error = zero_entry(chain, rulenum[0], sopt->sopt_name == IP_FW_RESETLOG); break; /*--- TABLE manipulations are protected by the IPFW_LOCK ---*/ case IP_FW_TABLE_ADD: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = ipfw_add_table_entry(chain, ent.tbl, &ent.addr, sizeof(ent.addr), ent.masklen, IPFW_TABLE_CIDR, ent.value); } break; case IP_FW_TABLE_DEL: { ipfw_table_entry ent; error = sooptcopyin(sopt, &ent, sizeof(ent), sizeof(ent)); if (error) break; error = ipfw_del_table_entry(chain, ent.tbl, &ent.addr, sizeof(ent.addr), ent.masklen, IPFW_TABLE_CIDR); } break; case IP_FW_TABLE_XADD: /* IP_FW3 */ case IP_FW_TABLE_XDEL: /* IP_FW3 */ { ipfw_table_xentry *xent = (ipfw_table_xentry *)(op3 + 1); /* Check minimum header size */ if (IP_FW3_OPLENGTH(sopt) < offsetof(ipfw_table_xentry, k)) { error = EINVAL; break; } /* Check if len field is valid */ if (xent->len > sizeof(ipfw_table_xentry)) { error = EINVAL; break; } len = xent->len - offsetof(ipfw_table_xentry, k); error = (opt == IP_FW_TABLE_XADD) ? 
ipfw_add_table_entry(chain, xent->tbl, &xent->k, len, xent->masklen, xent->type, xent->value) : ipfw_del_table_entry(chain, xent->tbl, &xent->k, len, xent->masklen, xent->type); } break; case IP_FW_TABLE_FLUSH: { u_int16_t tbl; error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)); if (error) break; error = ipfw_flush_table(chain, tbl); } break; case IP_FW_TABLE_GETSIZE: { u_int32_t tbl, cnt; if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl), sizeof(tbl)))) break; IPFW_RLOCK(chain); error = ipfw_count_table(chain, tbl, &cnt); IPFW_RUNLOCK(chain); if (error) break; error = sooptcopyout(sopt, &cnt, sizeof(cnt)); } break; case IP_FW_TABLE_LIST: { ipfw_table *tbl; if (sopt->sopt_valsize < sizeof(*tbl)) { error = EINVAL; break; } size = sopt->sopt_valsize; tbl = malloc(size, M_TEMP, M_WAITOK); error = sooptcopyin(sopt, tbl, size, sizeof(*tbl)); if (error) { free(tbl, M_TEMP); break; } tbl->size = (size - sizeof(*tbl)) / sizeof(ipfw_table_entry); IPFW_RLOCK(chain); error = ipfw_dump_table(chain, tbl); IPFW_RUNLOCK(chain); if (error) { free(tbl, M_TEMP); break; } error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; case IP_FW_TABLE_XGETSIZE: /* IP_FW3 */ { uint32_t *tbl; if (IP_FW3_OPLENGTH(sopt) < sizeof(uint32_t)) { error = EINVAL; break; } tbl = (uint32_t *)(op3 + 1); IPFW_RLOCK(chain); error = ipfw_count_xtable(chain, *tbl, tbl); IPFW_RUNLOCK(chain); if (error) break; error = sooptcopyout(sopt, op3, sopt->sopt_valsize); } break; case IP_FW_TABLE_XLIST: /* IP_FW3 */ { ipfw_xtable *tbl; if ((size = valsize) < sizeof(ipfw_xtable)) { error = EINVAL; break; } tbl = malloc(size, M_TEMP, M_ZERO | M_WAITOK); memcpy(tbl, op3, sizeof(ipfw_xtable)); /* Get maximum number of entries we can store */ tbl->size = (size - sizeof(ipfw_xtable)) / sizeof(ipfw_table_xentry); IPFW_RLOCK(chain); error = ipfw_dump_xtable(chain, tbl); IPFW_RUNLOCK(chain); if (error) { free(tbl, M_TEMP); break; } /* Revert size field back to bytes */ tbl->size = tbl->size * 
sizeof(ipfw_table_xentry) + sizeof(ipfw_table); /* * Since we call sooptcopyin() with small buffer, sopt_valsize is * decreased to reflect supplied buffer size. Set it back to original value */ sopt->sopt_valsize = valsize; error = sooptcopyout(sopt, tbl, size); free(tbl, M_TEMP); } break; /*--- NAT operations are protected by the IPFW_LOCK ---*/ case IP_FW_NAT_CFG: if (IPFW_NAT_LOADED) error = ipfw_nat_cfg_ptr(sopt); else { printf("IP_FW_NAT_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_DEL: if (IPFW_NAT_LOADED) error = ipfw_nat_del_ptr(sopt); else { printf("IP_FW_NAT_DEL: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_CONFIG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_cfg_ptr(sopt); else { printf("IP_FW_NAT_GET_CFG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; case IP_FW_NAT_GET_LOG: if (IPFW_NAT_LOADED) error = ipfw_nat_get_log_ptr(sopt); else { printf("IP_FW_NAT_GET_LOG: %s\n", "ipfw_nat not present, please load it"); error = EINVAL; } break; default: printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name); error = EINVAL; } return (error); #undef RULE_MAXSIZE } #define RULE_MAXSIZE (256*sizeof(u_int32_t)) /* Functions to convert rules 7.2 <==> 8.0 */ int convert_rule_to_7(struct ip_fw *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *)rule; /* copy of original rule, version 8 */ struct ip_fw *tmp; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule, tmp, RULE_MAXSIZE); /* Copy fields */ rule7->_pad = tmp->_pad; rule7->set = tmp->set; rule7->rulenum = tmp->rulenum; rule7->cmd_len = tmp->cmd_len; rule7->act_ofs = tmp->act_ofs; rule7->next_rule = (struct ip_fw7 *)tmp->next_rule; rule7->next = (struct ip_fw7 *)tmp->x_next; rule7->cmd_len = tmp->cmd_len; rule7->pcnt = tmp->pcnt; 
rule7->bcnt = tmp->bcnt; rule7->timestamp = tmp->timestamp; /* Copy commands */ for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * decrement opcode if it is after O_REASS */ dst->opcode--; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } free(tmp, M_TEMP); return 0; } int convert_rule_to_8(struct ip_fw *rule) { /* Used to modify original rule */ struct ip_fw7 *rule7 = (struct ip_fw7 *) rule; /* Used to copy commands */ ipfw_insn *ccmd, *dst; int ll = 0, ccmdlen = 0; /* Copy of original rule */ struct ip_fw7 *tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO); if (tmp == NULL) { return 1; //XXX error } bcopy(rule7, tmp, RULE_MAXSIZE); for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ; ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) { ccmdlen = F_LEN(ccmd); bcopy(ccmd, dst, F_LEN(ccmd)*sizeof(uint32_t)); if (dst->opcode > O_NAT) /* O_REASS doesn't exists in 7.2 version, so * increment opcode if it is after O_REASS */ dst->opcode++; if (ccmdlen > ll) { printf("ipfw: opcode %d size truncated\n", ccmd->opcode); return EINVAL; } } rule->_pad = tmp->_pad; rule->set = tmp->set; rule->rulenum = tmp->rulenum; rule->cmd_len = tmp->cmd_len; rule->act_ofs = tmp->act_ofs; rule->next_rule = (struct ip_fw *)tmp->next_rule; rule->x_next = (struct ip_fw *)tmp->next; rule->cmd_len = tmp->cmd_len; rule->id = 0; /* XXX see if is ok = 0 */ rule->pcnt = tmp->pcnt; rule->bcnt = tmp->bcnt; rule->timestamp = tmp->timestamp; free (tmp, M_TEMP); return 0; } /* end of file */ ipfw-user/sys/netinet/ipfw/dn_sched_wf2q.c000644 000423 000000 00000027446 12006744005 021353 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa 
* All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD: head/sys/netinet/ipfw/dn_sched_wf2q.c 213267 2010-09-29 09:40:20Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #ifndef MAX64 #define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) #endif /* * timestamps are computed on 64 bit using fixed point arithmetic. * LMAX_BITS, WMAX_BITS are the max number of bits for the packet len * and sum of weights, respectively. FRAC_BITS is the number of * fractional bits. 
We want FRAC_BITS >> WMAX_BITS to avoid too large * errors when computing the inverse, FRAC_BITS < 32 so we can do 1/w * using an unsigned 32-bit division, and to avoid wraparounds we need * LMAX_BITS + WMAX_BITS + FRAC_BITS << 64 * As an example * FRAC_BITS = 26, LMAX_BITS=14, WMAX_BITS = 19 */ #ifndef FRAC_BITS #define FRAC_BITS 28 /* shift for fixed point arithmetic */ #define ONE_FP (1UL << FRAC_BITS) #endif /* * Private information for the scheduler instance: * sch_heap (key is Finish time) returns the next queue to serve * ne_heap (key is Start time) stores not-eligible queues * idle_heap (key=start/finish time) stores idle flows. It must * support extract-from-middle. * A flow is only in 1 of the three heaps. * XXX todo: use a more efficient data structure, e.g. a tree sorted * by F with min_subtree(S) in each node */ struct wf2qp_si { struct dn_heap sch_heap; /* top extract - key Finish time */ struct dn_heap ne_heap; /* top extract - key Start time */ struct dn_heap idle_heap; /* random extract - key Start=Finish time */ uint64_t V; /* virtual time */ uint32_t inv_wsum; /* inverse of sum of weights */ uint32_t wsum; /* sum of weights */ }; struct wf2qp_queue { struct dn_queue _q; uint64_t S, F; /* start time, finish time */ uint32_t inv_w; /* ONE_FP / weight */ int32_t heap_pos; /* position (index) of struct in heap */ }; /* * This file implements a WF2Q+ scheduler as it has been in dummynet * since 2000. * The scheduler supports per-flow queues and has O(log N) complexity. * * WF2Q+ needs to drain entries from the idle heap so that we * can keep the sum of weights up to date. We can do it whenever * we get a chance, or periodically, or following some other * strategy. The function idle_check() drains at most N elements * from the idle heap. 
*/
/*
 * si:    scheduler instance whose idle heap is drained.
 * n:     maximum number of entries to remove.
 * force: when non-zero, remove entries regardless of their key;
 *        otherwise only entries whose key is before the virtual time V.
 * Each removed queue is marked invalid (S = F + 1) and its weight
 * (fs.par[0]) is subtracted from the sum of weights.
 */
static void
idle_check(struct wf2qp_si *si, int n, int force)
{
	struct dn_heap *h = &si->idle_heap;
	while (n-- > 0 && h->elements > 0 &&
	    (force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
		struct dn_queue *q = HEAP_TOP(h)->object;
		struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;

		heap_extract(h, NULL);
		/* XXX to let the flowset delete the queue we should
		 * mark it as 'unused' by the scheduler.
		 */
		alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
		si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
		/* inv_wsum is left stale when wsum drops to 0 (avoids 1/0) */
		if (si->wsum > 0)
			si->inv_wsum = ONE_FP/si->wsum;
	}
}

/*
 * Enqueue packet m on queue q of scheduler instance _si.
 * Returns 1 if dn_enqueue() dropped the packet, 0 otherwise.
 * If m is already the head of q (as when called from
 * wf2qp_new_queue()) the packet is not enqueued again and only the
 * scheduling state is set up.
 * The private wf2qp_si area is located right after the generic
 * dn_sch_inst, hence the (_si + 1) cast.
 */
static int
wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
{
	struct dn_fsk *fs = q->fs;
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct wf2qp_queue *alg_fq;
	uint64_t len = m->m_pkthdr.len;

	if (m != q->mq.head) {
		if (dn_enqueue(q, m, 0)) /* packet was dropped */
			return 1;
		if (m != q->mq.head)	/* queue was already busy */
			return 0;
	}

	/* If we reach this point, queue q was idle */
	alg_fq = (struct wf2qp_queue *)q;

	if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
		/* brand new queue (S > F is the "invalid" marker). */
		alg_fq->S = si->V;		/* init start time */
		si->wsum += fs->fs.par[0];	/* add weight of new queue. */
		si->inv_wsum = ONE_FP/si->wsum;
	} else { /* if it was idle then it was in the idle heap */
		heap_extract(&si->idle_heap, q);
		alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
	}
	alg_fq->F = alg_fq->S + len * alg_fq->inv_w;

	/* if nothing is backlogged, make sure this flow is eligible */
	if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
		si->V = MAX64(alg_fq->S, si->V);

	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
	 * this happens, it means that there is some other flow already
	 * scheduled for the same pipe, so the sch_heap cannot be
	 * empty). If the flow is not eligible we just store it in the
	 * ne_heap. Otherwise, we store in the sch_heap.
	 * Note that for all flows in sch_heap (SCH), S_i <= V,
	 * and for all flows in ne_heap (NEH), S_i > V.
	 * So when we need to compute max(V, min(S_i)) forall i in
	 * SCH+NEH, we only need to look into NEH.
	 */
	if (DN_KEY_LT(si->V, alg_fq->S)) {
		/* S>V means flow Not eligible. */
		if (si->sch_heap.elements == 0)
			D("++ ouch! not eligible but empty scheduler!");
		heap_insert(&si->ne_heap, alg_fq->S, q);
	} else {
		heap_insert(&si->sch_heap, alg_fq->F, q);
	}
	return 0;
}

/*
 * Return the next packet to transmit for scheduler instance _si,
 * or NULL when neither the scheduler heap nor the not-eligible heap
 * holds a queue.
 */
/* XXX invariant: sch > 0 || V >= min(S in neh) */
static struct mbuf *
wf2qp_dequeue(struct dn_sch_inst *_si)
{
	/* Access scheduler instance private data */
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	struct mbuf *m;
	struct dn_queue *q;
	struct dn_heap *sch = &si->sch_heap;
	struct dn_heap *neh = &si->ne_heap;
	struct wf2qp_queue *alg_fq;

	if (sch->elements == 0 && neh->elements == 0) {
		/* we have nothing to do. We could kill the idle heap
		 * altogether and reset V
		 */
		idle_check(si, 0x7fffffff, 1);
		si->V = 0;
		si->wsum = 0;	/* should be set already */
		return NULL; /* quick return if nothing to do */
	}
	idle_check(si, 1, 0);	/* drain something from the idle heap */

	/* make sure at least one element is eligible, bumping V
	 * and moving entries that have become eligible.
	 * We need to repeat the first part twice, before and
	 * after extracting the candidate, or enqueue() will
	 * find the data structure in a wrong state.
	 */
	m = NULL;
	for(;;) {
		/*
		 * Compute V = max(V, min(S_i)). Remember that all elements
		 * in sch have by definition S_i <= V so if sch is not empty,
		 * V is surely the max and we must not update it. Conversely,
		 * if sch is empty we only need to look at neh.
		 * We don't need to move the queues, as it will be done at the
		 * next enqueue
		 */
		if (sch->elements == 0 && neh->elements > 0) {
			si->V = MAX64(si->V, HEAP_TOP(neh)->key);
		}
		while (neh->elements > 0 &&
		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
			q = HEAP_TOP(neh)->object;
			alg_fq = (struct wf2qp_queue *)q;
			heap_extract(neh, NULL);
			heap_insert(sch, alg_fq->F, q);
		}
		if (m) /* pkt found in previous iteration */
			break;
		/* ok we have at least one eligible pkt */
		q = HEAP_TOP(sch)->object;
		alg_fq = (struct wf2qp_queue *)q;
		m = dn_dequeue(q);
		heap_extract(sch, NULL); /* Remove queue from heap. */
		/* advance virtual time by len / wsum (fixed point) */
		si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
		alg_fq->S = alg_fq->F;  /* Update start time. */
		if (q->mq.head == 0) {	/* not backlogged any more. */
			heap_insert(&si->idle_heap, alg_fq->F, q);
		} else {			/* Still backlogged. */
			/* Update F, store in neh or sch */
			uint64_t len = q->mq.head->m_pkthdr.len;
			alg_fq->F += len * alg_fq->inv_w;
			if (DN_KEY_LEQ(alg_fq->S, si->V)) {
				heap_insert(sch, alg_fq->F, q);
			} else {
				heap_insert(neh, alg_fq->S, q);
			}
		}
	}
	return m;
}

/*
 * Initialize the three heaps of a new scheduler instance.
 * Returns 0 on success, ENOMEM if any heap_init() call fails (in
 * which case all three heaps are released).
 */
static int
wf2qp_new_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
	/* offset where each heap records an element's index */
	int ofs = offsetof(struct wf2qp_queue, heap_pos);

	/* all heaps support extract from middle */
	if (heap_init(&si->idle_heap, 16, ofs) ||
	    heap_init(&si->sch_heap, 16, ofs) ||
	    heap_init(&si->ne_heap, 16, ofs)) {
		heap_free(&si->ne_heap);
		heap_free(&si->sch_heap);
		heap_free(&si->idle_heap);
		return ENOMEM;
	}
	return 0;
}

/* Release the three heaps of a scheduler instance. Always returns 0. */
static int
wf2qp_free_sched(struct dn_sch_inst *_si)
{
	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);

	heap_free(&si->sch_heap);
	heap_free(&si->ne_heap);
	heap_free(&si->idle_heap);

	return 0;
}

/*
 * Validate the flowset weight stored in fs.par[0].
 * NOTE(review): ipdn_bound_var() is defined elsewhere; the arguments
 * suggest a default of 1 and an allowed range of 1..100 -- confirm.
 */
static int
wf2qp_new_fsk(struct dn_fsk *fs)
{
	ipdn_bound_var(&fs->fs.par[0], 1,
	    1, 100, "WF2Q+ weight");
	return 0;
}

/*
 * Initialize the scheduler-private part of a new queue.  If the
 * queue already holds packets it is handed to wf2qp_enqueue() so
 * that it enters one of the heaps.  Always returns 0.
 */
static int
wf2qp_new_queue(struct dn_queue *_q)
{
	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;

	_q->ni.oid.subtype = DN_SCHED_WF2QP;
	q->F = 0;	/* not strictly necessary */
	q->S = q->F + 1;    /* mark timestamp as invalid. */
	q->inv_w = ONE_FP / _q->fs->fs.par[0];
	if (_q->mq.head != NULL) {
		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
	}
	return 0;
}

/*
 * Called when the infrastructure removes a queue (e.g. flowset
 * is reconfigured). Nothing to do if we did not 'own' the queue,
 * otherwise remove it from the right heap and adjust the sum
 * of weights.
 */
static int
wf2qp_free_queue(struct dn_queue *q)
{
	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);

	if (alg_fq->S >= alg_fq->F + 1)
		return 0;	/* nothing to do, not in any heap */
	si->wsum -= q->fs->fs.par[0];
	if (si->wsum > 0)
		si->inv_wsum = ONE_FP/si->wsum;

	/* extract from the heap. XXX TODO we may need to adjust V
	 * to make sure the invariants hold.
	 */
	if (q->mq.head == NULL) {
		/* no packets: the queue sits in the idle heap */
		heap_extract(&si->idle_heap, q);
	} else if (DN_KEY_LT(si->V, alg_fq->S)) {
		/* S > V: not eligible */
		heap_extract(&si->ne_heap, q);
	} else {
		heap_extract(&si->sch_heap, q);
	}
	return 0;
}

/*
 * WF2Q+ scheduler descriptor
 * contains the type of the scheduler, the name, the size of the
 * structures and function pointers.
*/ static struct dn_alg wf2qp_desc = { _SI( .type = ) DN_SCHED_WF2QP, _SI( .name = ) "WF2Q+", _SI( .flags = ) DN_MULTIQUEUE, /* we need extra space in the si and the queue */ _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct wf2qp_si), _SI( .q_datalen = ) sizeof(struct wf2qp_queue) - sizeof(struct dn_queue), _SI( .enqueue = ) wf2qp_enqueue, _SI( .dequeue = ) wf2qp_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) wf2qp_new_sched, _SI( .free_sched = ) wf2qp_free_sched, _SI( .new_fsk = ) wf2qp_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) wf2qp_new_queue, _SI( .free_queue = ) wf2qp_free_queue, }; DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc); ipfw-user/sys/netinet/ipfw/ip_fw_dynamic.c000644 000423 000000 00000101064 12007435564 021452 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2002 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_dynamic.c 238978 2012-08-01 18:52:07Z luigi $"); #define DEB(x) #define DDB(x) x /* * Dynamic rule support for ipfw */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include /* ip_defttl */ #include #include #include #include #include /* IN6_ARE_ADDR_EQUAL */ #ifdef INET6 #include #include #endif #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * Description of dynamic rules. * * Dynamic rules are stored in lists accessed through a hash table * (ipfw_dyn_v) whose size is curr_dyn_buckets. This value can * be modified through the sysctl variable dyn_buckets which is * updated when the table becomes empty. * * XXX currently there is only one list, ipfw_dyn. * * When a packet is received, its address fields are first masked * with the mask defined for the rule, then hashed, then matched * against the entries in the corresponding list. * Dynamic rules can be used for different purposes: * + stateful rules; * + enforcing limits on the number of sessions; * + in-kernel NAT (not implemented yet) * * The lifetime of dynamic rules is regulated by dyn_*_lifetime, * measured in seconds and depending on the flags. 
* * The total number of dynamic rules is stored in dyn_count. * The max number of dynamic rules is dyn_max. When we reach * the maximum number of rules we do not create anymore. This is * done to avoid consuming too much memory, but also too much * time when searching on each packet (ideally, we should try instead * to put a limit on the length of the list on each bucket...). * * Each dynamic rule holds a pointer to the parent ipfw rule so * we know what action to perform. Dynamic rules are removed when * the parent rule is deleted. XXX we should make them survive. * * There are some limitations with dynamic rules -- we do not * obey the 'randomized match', and we do not do multiple * passes through the firewall. XXX check the latter!!! */ /* * Static variables followed by global ones */ static VNET_DEFINE(ipfw_dyn_rule **, ipfw_dyn_v); static VNET_DEFINE(u_int32_t, dyn_buckets); static VNET_DEFINE(u_int32_t, curr_dyn_buckets); static VNET_DEFINE(struct callout, ipfw_timeout); #define V_ipfw_dyn_v VNET(ipfw_dyn_v) #define V_dyn_buckets VNET(dyn_buckets) #define V_curr_dyn_buckets VNET(curr_dyn_buckets) #define V_ipfw_timeout VNET(ipfw_timeout) static uma_zone_t ipfw_dyn_rule_zone; #ifndef __FreeBSD__ DEFINE_SPINLOCK(ipfw_dyn_mtx); #else static struct mtx ipfw_dyn_mtx; /* mutex guarding dynamic rules */ #endif #define IPFW_DYN_LOCK_INIT() \ mtx_init(&ipfw_dyn_mtx, "IPFW dynamic rules", NULL, MTX_DEF) #define IPFW_DYN_LOCK_DESTROY() mtx_destroy(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK() mtx_lock(&ipfw_dyn_mtx) #define IPFW_DYN_UNLOCK() mtx_unlock(&ipfw_dyn_mtx) #define IPFW_DYN_LOCK_ASSERT() mtx_assert(&ipfw_dyn_mtx, MA_OWNED) void ipfw_dyn_unlock(void) { IPFW_DYN_UNLOCK(); } /* * Timeouts for various events in handing dynamic rules. 
*/ static VNET_DEFINE(u_int32_t, dyn_ack_lifetime); static VNET_DEFINE(u_int32_t, dyn_syn_lifetime); static VNET_DEFINE(u_int32_t, dyn_fin_lifetime); static VNET_DEFINE(u_int32_t, dyn_rst_lifetime); static VNET_DEFINE(u_int32_t, dyn_udp_lifetime); static VNET_DEFINE(u_int32_t, dyn_short_lifetime); #define V_dyn_ack_lifetime VNET(dyn_ack_lifetime) #define V_dyn_syn_lifetime VNET(dyn_syn_lifetime) #define V_dyn_fin_lifetime VNET(dyn_fin_lifetime) #define V_dyn_rst_lifetime VNET(dyn_rst_lifetime) #define V_dyn_udp_lifetime VNET(dyn_udp_lifetime) #define V_dyn_short_lifetime VNET(dyn_short_lifetime) /* * Keepalives are sent if dyn_keepalive is set. They are sent every * dyn_keepalive_period seconds, in the last dyn_keepalive_interval * seconds of lifetime of a rule. * dyn_rst_lifetime and dyn_fin_lifetime should be strictly lower * than dyn_keepalive_period. */ static VNET_DEFINE(u_int32_t, dyn_keepalive_interval); static VNET_DEFINE(u_int32_t, dyn_keepalive_period); static VNET_DEFINE(u_int32_t, dyn_keepalive); #define V_dyn_keepalive_interval VNET(dyn_keepalive_interval) #define V_dyn_keepalive_period VNET(dyn_keepalive_period) #define V_dyn_keepalive VNET(dyn_keepalive) static VNET_DEFINE(u_int32_t, dyn_count); /* # of dynamic rules */ static VNET_DEFINE(u_int32_t, dyn_max); /* max # of dynamic rules */ #define V_dyn_count VNET(dyn_count) #define V_dyn_max VNET(dyn_max) #ifdef SYSCTL_NODE SYSBEGIN(f2) SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_buckets, CTLFLAG_RW, &VNET_NAME(dyn_buckets), 0, "Number of dyn. buckets"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, curr_dyn_buckets, CTLFLAG_RD, &VNET_NAME(curr_dyn_buckets), 0, "Current Number of dyn. buckets"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_count, CTLFLAG_RD, &VNET_NAME(dyn_count), 0, "Number of dyn. rules"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_max, CTLFLAG_RW, &VNET_NAME(dyn_max), 0, "Max number of dyn. 
rules"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_ack_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_ack_lifetime), 0, "Lifetime of dyn. rules for acks"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_syn_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_syn_lifetime), 0, "Lifetime of dyn. rules for syn"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_fin_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_fin_lifetime), 0, "Lifetime of dyn. rules for fin"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_rst_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_rst_lifetime), 0, "Lifetime of dyn. rules for rst"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_udp_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_udp_lifetime), 0, "Lifetime of dyn. rules for UDP"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_short_lifetime, CTLFLAG_RW, &VNET_NAME(dyn_short_lifetime), 0, "Lifetime of dyn. rules for other situations"); SYSCTL_VNET_UINT(_net_inet_ip_fw, OID_AUTO, dyn_keepalive, CTLFLAG_RW, &VNET_NAME(dyn_keepalive), 0, "Enable keepalives for dyn. rules"); SYSEND #endif /* SYSCTL_NODE */ static __inline int hash_packet6(struct ipfw_flow_id *id) { u_int32_t i; i = (id->dst_ip6.__u6_addr.__u6_addr32[2]) ^ (id->dst_ip6.__u6_addr.__u6_addr32[3]) ^ (id->src_ip6.__u6_addr.__u6_addr32[2]) ^ (id->src_ip6.__u6_addr.__u6_addr32[3]) ^ (id->dst_port) ^ (id->src_port); return i; } /* * IMPORTANT: the hash function for dynamic rules must be commutative * in source and destination (ip,port), because rules are bidirectional * and we want to find both in the same bucket. 
*/ static __inline int hash_packet(struct ipfw_flow_id *id) { u_int32_t i; #ifdef INET6 if (IS_IP6_FLOW_ID(id)) i = hash_packet6(id); else #endif /* INET6 */ i = (id->dst_ip) ^ (id->src_ip) ^ (id->dst_port) ^ (id->src_port); i &= (V_curr_dyn_buckets - 1); return i; } static __inline void unlink_dyn_rule_print(struct ipfw_flow_id *id) { struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN], dst[INET6_ADDRSTRLEN]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif #ifdef INET6 if (IS_IP6_FLOW_ID(id)) { ip6_sprintf(src, &id->src_ip6); ip6_sprintf(dst, &id->dst_ip6); } else #endif { da.s_addr = htonl(id->src_ip); inet_ntop(AF_INET, &da, src, sizeof(src)); da.s_addr = htonl(id->dst_ip); inet_ntop(AF_INET, &da, dst, sizeof(dst)); } printf("ipfw: unlink entry %s %d -> %s %d, %d left\n", src, id->src_port, dst, id->dst_port, V_dyn_count - 1); } /** * unlink a dynamic rule from a chain. prev is a pointer to * the previous one, q is a pointer to the rule to delete, * head is a pointer to the head of the queue. * Modifies q and potentially also head. */ #define UNLINK_DYN_RULE(prev, head, q) { \ ipfw_dyn_rule *old_q = q; \ \ /* remove a refcount to the parent */ \ if (q->dyn_type == O_LIMIT) \ q->parent->count--; \ DEB(unlink_dyn_rule_print(&q->id);) \ if (prev != NULL) \ prev->next = q = q->next; \ else \ head = q = q->next; \ V_dyn_count--; \ uma_zfree(ipfw_dyn_rule_zone, old_q); } #define TIME_LEQ(a,b) ((int)((a)-(b)) <= 0) /** * Remove dynamic rules pointing to "rule", or all of them if rule == NULL. * * If keep_me == NULL, rules are deleted even if not expired, * otherwise only expired rules are removed. * * The value of the second parameter is also used to point to identify * a rule we absolutely do not want to remove (e.g. because we are * holding a reference to it -- this is the case with O_LIMIT_PARENT * rules). The pointer is only used for comparison, so any non-null * value will do. 
*/ static void remove_dyn_rule(struct ip_fw *rule, ipfw_dyn_rule *keep_me) { static u_int32_t last_remove = 0; #define FORCE (keep_me == NULL) ipfw_dyn_rule *prev, *q; int i, pass = 0, max_pass = 0; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v == NULL || V_dyn_count == 0) return; /* do not expire more than once per second, it is useless */ if (!FORCE && last_remove == time_uptime) return; last_remove = time_uptime; /* * because O_LIMIT refer to parent rules, during the first pass only * remove child and mark any pending LIMIT_PARENT, and remove * them in a second pass. */ next_pass: for (i = 0 ; i < V_curr_dyn_buckets ; i++) { for (prev=NULL, q = V_ipfw_dyn_v[i] ; q ; ) { /* * Logic can become complex here, so we split tests. */ if (q == keep_me) goto next; if (rule != NULL && rule != q->rule) goto next; /* not the one we are looking for */ if (q->dyn_type == O_LIMIT_PARENT) { /* * handle parent in the second pass, * record we need one. */ max_pass = 1; if (pass == 0) goto next; if (FORCE && q->count != 0 ) { /* XXX should not happen! */ printf("ipfw: OUCH! cannot remove rule," " count %d\n", q->count); } } else { if (!FORCE && !TIME_LEQ( q->expire, time_uptime )) goto next; } if (q->dyn_type != O_LIMIT_PARENT || !q->count) { UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } next: prev=q; q=q->next; } } if (pass++ < max_pass) goto next_pass; } void ipfw_remove_dyn_children(struct ip_fw *rule) { IPFW_DYN_LOCK(); remove_dyn_rule(rule, NULL /* force removal */); IPFW_DYN_UNLOCK(); } /* * Lookup a dynamic rule, locked version. */ static ipfw_dyn_rule * lookup_dyn_rule_locked(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { /* * Stateful ipfw extensions. * Lookup into dynamic session queue. 
*/ #define MATCH_REVERSE 0 #define MATCH_FORWARD 1 #define MATCH_NONE 2 #define MATCH_UNKNOWN 3 int i, dir = MATCH_NONE; ipfw_dyn_rule *prev, *q = NULL; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v == NULL) goto done; /* not found */ i = hash_packet(pkt); for (prev = NULL, q = V_ipfw_dyn_v[i]; q != NULL;) { if (q->dyn_type == O_LIMIT_PARENT && q->count) goto next; if (TIME_LEQ(q->expire, time_uptime)) { /* expire entry */ UNLINK_DYN_RULE(prev, V_ipfw_dyn_v[i], q); continue; } if (pkt->proto != q->id.proto || q->dyn_type == O_LIMIT_PARENT) goto next; if (IS_IP6_FLOW_ID(pkt)) { if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.src_ip6) && IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.dst_ip6) && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port) { dir = MATCH_FORWARD; break; } if (IN6_ARE_ADDR_EQUAL(&pkt->src_ip6, &q->id.dst_ip6) && IN6_ARE_ADDR_EQUAL(&pkt->dst_ip6, &q->id.src_ip6) && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port) { dir = MATCH_REVERSE; break; } } else { if (pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port) { dir = MATCH_FORWARD; break; } if (pkt->src_ip == q->id.dst_ip && pkt->dst_ip == q->id.src_ip && pkt->src_port == q->id.dst_port && pkt->dst_port == q->id.src_port) { dir = MATCH_REVERSE; break; } } next: prev = q; q = q->next; } if (q == NULL) goto done; /* q = NULL, not found */ if (prev != NULL) { /* found and not in front */ prev->next = q->next; q->next = V_ipfw_dyn_v[i]; V_ipfw_dyn_v[i] = q; } if (pkt->proto == IPPROTO_TCP) { /* update state according to flags */ uint32_t ack; u_char flags = pkt->_flags & (TH_FIN | TH_SYN | TH_RST); #define BOTH_SYN (TH_SYN | (TH_SYN << 8)) #define BOTH_FIN (TH_FIN | (TH_FIN << 8)) #define TCP_FLAGS (TH_FLAGS | (TH_FLAGS << 8)) #define ACK_FWD 0x10000 /* fwd ack seen */ #define ACK_REV 0x20000 /* rev ack seen */ q->state |= (dir == MATCH_FORWARD) ? 
flags : (flags << 8); switch (q->state & TCP_FLAGS) { case TH_SYN: /* opening */ q->expire = time_uptime + V_dyn_syn_lifetime; break; case BOTH_SYN: /* move to established */ case BOTH_SYN | TH_FIN: /* one side tries to close */ case BOTH_SYN | (TH_FIN << 8): #define _SEQ_GE(a,b) ((int)(a) - (int)(b) >= 0) if (tcp == NULL) break; ack = ntohl(tcp->th_ack); if (dir == MATCH_FORWARD) { if (q->ack_fwd == 0 || _SEQ_GE(ack, q->ack_fwd)) { q->ack_fwd = ack; q->state |= ACK_FWD; } } else { if (q->ack_rev == 0 || _SEQ_GE(ack, q->ack_rev)) { q->ack_rev = ack; q->state |= ACK_REV; } } if ((q->state & (ACK_FWD | ACK_REV)) == (ACK_FWD | ACK_REV)) { q->expire = time_uptime + V_dyn_ack_lifetime; q->state &= ~(ACK_FWD | ACK_REV); } break; case BOTH_SYN | BOTH_FIN: /* both sides closed */ if (V_dyn_fin_lifetime >= V_dyn_keepalive_period) V_dyn_fin_lifetime = V_dyn_keepalive_period - 1; q->expire = time_uptime + V_dyn_fin_lifetime; break; default: #if 0 /* * reset or some invalid combination, but can also * occur if we use keep-state the wrong way. */ if ( (q->state & ((TH_RST << 8)|TH_RST)) == 0) printf("invalid state: 0x%x\n", q->state); #endif if (V_dyn_rst_lifetime >= V_dyn_keepalive_period) V_dyn_rst_lifetime = V_dyn_keepalive_period - 1; q->expire = time_uptime + V_dyn_rst_lifetime; break; } } else if (pkt->proto == IPPROTO_UDP) { q->expire = time_uptime + V_dyn_udp_lifetime; } else { /* other protocols */ q->expire = time_uptime + V_dyn_short_lifetime; } done: if (match_direction != NULL) *match_direction = dir; return (q); } ipfw_dyn_rule * ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp) { ipfw_dyn_rule *q; IPFW_DYN_LOCK(); q = lookup_dyn_rule_locked(pkt, match_direction, tcp); if (q == NULL) IPFW_DYN_UNLOCK(); /* NB: return table locked when q is not NULL */ return q; } static void realloc_dynamic_table(void) { IPFW_DYN_LOCK_ASSERT(); /* * Try reallocation, make sure we have a power of 2 and do * not allow more than 64k entries. 
In case of overflow, * default to 1024. */ if (V_dyn_buckets > 65536) V_dyn_buckets = 1024; if ((V_dyn_buckets & (V_dyn_buckets-1)) != 0) { /* not a power of 2 */ V_dyn_buckets = V_curr_dyn_buckets; /* reset */ return; } V_curr_dyn_buckets = V_dyn_buckets; if (V_ipfw_dyn_v != NULL) free(V_ipfw_dyn_v, M_IPFW); for (;;) { V_ipfw_dyn_v = malloc(V_curr_dyn_buckets * sizeof(ipfw_dyn_rule *), M_IPFW, M_NOWAIT | M_ZERO); if (V_ipfw_dyn_v != NULL || V_curr_dyn_buckets <= 2) break; V_curr_dyn_buckets /= 2; } } /** * Install state of type 'type' for a dynamic session. * The hash table contains two type of rules: * - regular rules (O_KEEP_STATE) * - rules for sessions with limited number of sess per user * (O_LIMIT). When they are created, the parent is * increased by 1, and decreased on delete. In this case, * the third parameter is the parent rule and not the chain. * - "parent" rules for the above (O_LIMIT_PARENT). */ static ipfw_dyn_rule * add_dyn_rule(struct ipfw_flow_id *id, u_int8_t dyn_type, struct ip_fw *rule) { ipfw_dyn_rule *r; int i; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v == NULL || (V_dyn_count == 0 && V_dyn_buckets != V_curr_dyn_buckets)) { realloc_dynamic_table(); if (V_ipfw_dyn_v == NULL) return NULL; /* failed ! 
*/ } i = hash_packet(id); r = uma_zalloc(ipfw_dyn_rule_zone, M_NOWAIT | M_ZERO); if (r == NULL) { printf ("ipfw: sorry cannot allocate state\n"); return NULL; } /* increase refcount on parent, and set pointer */ if (dyn_type == O_LIMIT) { ipfw_dyn_rule *parent = (ipfw_dyn_rule *)rule; if ( parent->dyn_type != O_LIMIT_PARENT) panic("invalid parent"); parent->count++; r->parent = parent; rule = parent->rule; } r->id = *id; r->expire = time_uptime + V_dyn_syn_lifetime; r->rule = rule; r->dyn_type = dyn_type; r->pcnt = r->bcnt = 0; r->count = 0; r->bucket = i; r->next = V_ipfw_dyn_v[i]; V_ipfw_dyn_v[i] = r; V_dyn_count++; DEB({ struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN]; char dst[INET6_ADDRSTRLEN]; #else char src[INET_ADDRSTRLEN]; char dst[INET_ADDRSTRLEN]; #endif #ifdef INET6 if (IS_IP6_FLOW_ID(&(r->id))) { ip6_sprintf(src, &r->id.src_ip6); ip6_sprintf(dst, &r->id.dst_ip6); } else #endif { da.s_addr = htonl(r->id.src_ip); inet_ntop(AF_INET, &da, src, sizeof(src)); da.s_addr = htonl(r->id.dst_ip); inet_ntop(AF_INET, &da, dst, sizeof(dst)); } printf("ipfw: add dyn entry ty %d %s %d -> %s %d, total %d\n", dyn_type, src, r->id.src_port, dst, r->id.dst_port, V_dyn_count); }) return r; } /** * lookup dynamic parent rule using pkt and rule as search keys. * If the lookup fails, then install one. 
*/ static ipfw_dyn_rule * lookup_dyn_parent(struct ipfw_flow_id *pkt, struct ip_fw *rule) { ipfw_dyn_rule *q; int i; IPFW_DYN_LOCK_ASSERT(); if (V_ipfw_dyn_v) { int is_v6 = IS_IP6_FLOW_ID(pkt); i = hash_packet( pkt ); for (q = V_ipfw_dyn_v[i] ; q != NULL ; q=q->next) if (q->dyn_type == O_LIMIT_PARENT && rule== q->rule && pkt->proto == q->id.proto && pkt->src_port == q->id.src_port && pkt->dst_port == q->id.dst_port && ( (is_v6 && IN6_ARE_ADDR_EQUAL(&(pkt->src_ip6), &(q->id.src_ip6)) && IN6_ARE_ADDR_EQUAL(&(pkt->dst_ip6), &(q->id.dst_ip6))) || (!is_v6 && pkt->src_ip == q->id.src_ip && pkt->dst_ip == q->id.dst_ip) ) ) { q->expire = time_uptime + V_dyn_short_lifetime; DEB(printf("ipfw: lookup_dyn_parent found 0x%p\n",q);) return q; } } return add_dyn_rule(pkt, O_LIMIT_PARENT, rule); } /** * Install dynamic state for rule type cmd->o.opcode * * Returns 1 (failure) if state is not installed because of errors or because * session limitations are enforced. */ int ipfw_install_state(struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg) { static int last_log; ipfw_dyn_rule *q; struct in_addr da; #ifdef INET6 char src[INET6_ADDRSTRLEN + 2], dst[INET6_ADDRSTRLEN + 2]; #else char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN]; #endif src[0] = '\0'; dst[0] = '\0'; IPFW_DYN_LOCK(); DEB( #ifdef INET6 if (IS_IP6_FLOW_ID(&(args->f_id))) { ip6_sprintf(src, &args->f_id.src_ip6); ip6_sprintf(dst, &args->f_id.dst_ip6); } else #endif { da.s_addr = htonl(args->f_id.src_ip); inet_ntop(AF_INET, &da, src, sizeof(src)); da.s_addr = htonl(args->f_id.dst_ip); inet_ntop(AF_INET, &da, dst, sizeof(dst)); } printf("ipfw: %s: type %d %s %u -> %s %u\n", __func__, cmd->o.opcode, src, args->f_id.src_port, dst, args->f_id.dst_port); src[0] = '\0'; dst[0] = '\0'; ) q = lookup_dyn_rule_locked(&args->f_id, NULL, NULL); if (q != NULL) { /* should never occur */ DEB( if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: entry already present, done\n", 
__func__); }) IPFW_DYN_UNLOCK(); return (0); } if (V_dyn_count >= V_dyn_max) /* Run out of slots, try to remove any expired rule. */ remove_dyn_rule(NULL, (ipfw_dyn_rule *)1); if (V_dyn_count >= V_dyn_max) { if (last_log != time_uptime) { last_log = time_uptime; printf("ipfw: %s: Too many dynamic rules\n", __func__); } IPFW_DYN_UNLOCK(); return (1); /* cannot install, notify caller */ } switch (cmd->o.opcode) { case O_KEEP_STATE: /* bidir rule */ add_dyn_rule(&args->f_id, O_KEEP_STATE, rule); break; case O_LIMIT: { /* limit number of sessions */ struct ipfw_flow_id id; ipfw_dyn_rule *parent; uint32_t conn_limit; uint16_t limit_mask = cmd->limit_mask; conn_limit = (cmd->conn_limit == IP_FW_TABLEARG) ? tablearg : cmd->conn_limit; DEB( if (cmd->conn_limit == IP_FW_TABLEARG) printf("ipfw: %s: O_LIMIT rule, conn_limit: %u " "(tablearg)\n", __func__, conn_limit); else printf("ipfw: %s: O_LIMIT rule, conn_limit: %u\n", __func__, conn_limit); ) id.dst_ip = id.src_ip = id.dst_port = id.src_port = 0; id.proto = args->f_id.proto; id.addr_type = args->f_id.addr_type; id.fib = M_GETFIB(args->m); if (IS_IP6_FLOW_ID (&(args->f_id))) { if (limit_mask & DYN_SRC_ADDR) id.src_ip6 = args->f_id.src_ip6; if (limit_mask & DYN_DST_ADDR) id.dst_ip6 = args->f_id.dst_ip6; } else { if (limit_mask & DYN_SRC_ADDR) id.src_ip = args->f_id.src_ip; if (limit_mask & DYN_DST_ADDR) id.dst_ip = args->f_id.dst_ip; } if (limit_mask & DYN_SRC_PORT) id.src_port = args->f_id.src_port; if (limit_mask & DYN_DST_PORT) id.dst_port = args->f_id.dst_port; if ((parent = lookup_dyn_parent(&id, rule)) == NULL) { printf("ipfw: %s: add parent failed\n", __func__); IPFW_DYN_UNLOCK(); return (1); } if (parent->count >= conn_limit) { /* See if we can remove some expired rule. */ remove_dyn_rule(rule, parent); if (parent->count >= conn_limit) { if (V_fw_verbose && last_log != time_uptime) { last_log = time_uptime; #ifdef INET6 /* * XXX IPv6 flows are not * supported yet. 
*/ if (IS_IP6_FLOW_ID(&(args->f_id))) { char ip6buf[INET6_ADDRSTRLEN]; snprintf(src, sizeof(src), "[%s]", ip6_sprintf(ip6buf, &args->f_id.src_ip6)); snprintf(dst, sizeof(dst), "[%s]", ip6_sprintf(ip6buf, &args->f_id.dst_ip6)); } else #endif { da.s_addr = htonl(args->f_id.src_ip); inet_ntop(AF_INET, &da, src, sizeof(src)); da.s_addr = htonl(args->f_id.dst_ip); inet_ntop(AF_INET, &da, dst, sizeof(dst)); } log(LOG_SECURITY | LOG_DEBUG, "ipfw: %d %s %s:%u -> %s:%u, %s\n", parent->rule->rulenum, "drop session", src, (args->f_id.src_port), dst, (args->f_id.dst_port), "too many entries"); } IPFW_DYN_UNLOCK(); return (1); } } add_dyn_rule(&args->f_id, O_LIMIT, (struct ip_fw *)parent); break; } default: printf("ipfw: %s: unknown dynamic rule type %u\n", __func__, cmd->o.opcode); IPFW_DYN_UNLOCK(); return (1); } /* XXX just set lifetime */ lookup_dyn_rule_locked(&args->f_id, NULL, NULL); IPFW_DYN_UNLOCK(); return (0); } /* * Generate a TCP packet, containing either a RST or a keepalive. * When flags & TH_RST, we are sending a RST packet, because of a * "reset" action matched the packet. * Otherwise we are sending a keepalive, and flags & TH_ * The 'replyto' mbuf is the mbuf being replied to, if any, and is required * so that MAC can label the reply appropriately. 
 */
struct mbuf *
ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
    u_int32_t ack, int flags)
{
	struct mbuf *m = NULL;		/* stupid compiler */
	int len, dir;
	struct ip *h = NULL;		/* stupid compiler */
#ifdef INET6
	struct ip6_hdr *h6 = NULL;
#endif
	struct tcphdr *th = NULL;

	/* Returns NULL when no mbuf header can be obtained. */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (NULL);

	M_SETFIB(m, id->fib);
#ifdef MAC
	if (replyto != NULL)
		mac_netinet_firewall_reply(replyto, m);
	else
		mac_netinet_firewall_send(m);
#else
	(void)replyto;		/* don't warn about unused arg */
#endif

	/* Total length: network header plus a TCP header, no payload. */
	switch (id->addr_type) {
	case 4:
		len = sizeof(struct ip) + sizeof(struct tcphdr);
		break;
#ifdef INET6
	case 6:
		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		break;
#endif
	default:
		/* XXX: log me?!? */
		FREE_PKT(m);
		return (NULL);
	}

	/*
	 * dir is 1 only when TH_SYN is set and TH_RST is clear; the packet
	 * then keeps the src -> dst orientation of the flow id. In every
	 * other case addresses and ports are swapped.
	 */
	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);

	m->m_data += max_linkhdr;
	m->m_flags |= M_SKIP_FIREWALL;
	m->m_pkthdr.len = m->m_len = len;
	m->m_pkthdr.rcvif = NULL;
	bzero(m->m_data, len);

	switch (id->addr_type) {
	case 4:
		h = mtod(m, struct ip *);

		/*
		 * prepare for checksum
		 * NOTE(review): only protocol, TCP length and addresses are
		 * set in the zeroed header here, presumably so that it acts
		 * as the pseudo-header for in_cksum() below -- confirm.
		 */
		h->ip_p = IPPROTO_TCP;
		h->ip_len = htons(sizeof(struct tcphdr));
		if (dir) {
			h->ip_src.s_addr = htonl(id->src_ip);
			h->ip_dst.s_addr = htonl(id->dst_ip);
		} else {
			h->ip_src.s_addr = htonl(id->dst_ip);
			h->ip_dst.s_addr = htonl(id->src_ip);
		}

		th = (struct tcphdr *)(h + 1);
		break;
#ifdef INET6
	case 6:
		h6 = mtod(m, struct ip6_hdr *);

		/* prepare for checksum */
		h6->ip6_nxt = IPPROTO_TCP;
		h6->ip6_plen = htons(sizeof(struct tcphdr));
		if (dir) {
			h6->ip6_src = id->src_ip6;
			h6->ip6_dst = id->dst_ip6;
		} else {
			h6->ip6_src = id->dst_ip6;
			h6->ip6_dst = id->src_ip6;
		}

		th = (struct tcphdr *)(h6 + 1);
		break;
#endif
	}

	if (dir) {
		th->th_sport = htons(id->src_port);
		th->th_dport = htons(id->dst_port);
	} else {
		th->th_sport = htons(id->dst_port);
		th->th_dport = htons(id->src_port);
	}
	th->th_off = sizeof(struct tcphdr) >> 2;

	if (flags & TH_RST) {
		if (flags & TH_ACK) {
			/* RST carrying the caller's ack value as sequence number */
			th->th_seq = htonl(ack);
			th->th_flags = TH_RST;
		} else {
			/* RST|ACK acknowledging seq (plus one if TH_SYN is set) */
			if (flags & TH_SYN)
				seq++;
			th->th_ack = htonl(seq);
			th->th_flags = TH_RST | TH_ACK;
		}
	} else {
		/*
		 * Keepalive - use caller provided sequence numbers
		 */
		th->th_seq = htonl(seq);
		th->th_ack = htonl(ack);
		th->th_flags = TH_ACK;
	}

	/* The checksum must be computed before the header is finalized. */
	switch (id->addr_type) {
	case 4:
		th->th_sum = in_cksum(m, len);

		/* finish the ip header */
		h->ip_v = 4;
		h->ip_hl = sizeof(*h) >> 2;
		h->ip_tos = IPTOS_LOWDELAY;
		h->ip_off = 0;
		/* ip_len must be in host format for ip_output */
		h->ip_len = len;
		h->ip_ttl = V_ip_defttl;
		h->ip_sum = 0;
		break;
#ifdef INET6
	case 6:
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
		    sizeof(struct tcphdr));

		/* finish the ip6 header */
		h6->ip6_vfc |= IPV6_VERSION;
		h6->ip6_hlim = IPV6_DEFHLIM;
		break;
#endif
	}

	return (m);
}

/*
 * This procedure is only used to handle keepalives. It is invoked
 * every dyn_keepalive_period seconds and always re-arms its own callout
 * before returning.
 */
static void
ipfw_tick(void * vnetx)
{
	struct mbuf *m0, *m, *mnext, **mtailp;
#ifdef INET6
	struct mbuf *m6, **m6_tailp;
#endif
	int i;
	ipfw_dyn_rule *q;
#ifdef VIMAGE
	struct vnet *vp = vnetx;
#endif

	CURVNET_SET(vp);
	/* Nothing to do if keepalives are off or there are no dynamic rules. */
	if (V_dyn_keepalive == 0 || V_ipfw_dyn_v == NULL || V_dyn_count == 0)
		goto done;

	/*
	 * We make a chain of packets to go out here -- not deferring
	 * until after we drop the IPFW dynamic rule lock would result
	 * in a lock order reversal with the normal packet input -> ipfw
	 * call stack.
	 */
	m0 = NULL;
	mtailp = &m0;
#ifdef INET6
	m6 = NULL;
	m6_tailp = &m6;
#endif
	IPFW_DYN_LOCK();
	for (i = 0 ; i < V_curr_dyn_buckets ; i++) {
		for (q = V_ipfw_dyn_v[i] ; q ; q = q->next ) {
			/* Only established TCP flows close to expiry qualify. */
			if (q->dyn_type == O_LIMIT_PARENT)
				continue;
			if (q->id.proto != IPPROTO_TCP)
				continue;
			if ( (q->state & BOTH_SYN) != BOTH_SYN)
				continue;
			if (TIME_LEQ(time_uptime + V_dyn_keepalive_interval,
			    q->expire))
				continue;	/* too early */
			if (TIME_LEQ(q->expire, time_uptime))
				continue;	/* too late, rule expired */

			/* One keepalive per direction, skipped when the flag is set. */
			m = (q->state & ACK_REV) ? NULL :
			    ipfw_send_pkt(NULL, &(q->id), q->ack_rev - 1,
				q->ack_fwd, TH_SYN);
			mnext = (q->state & ACK_FWD) ? NULL :
			    ipfw_send_pkt(NULL, &(q->id), q->ack_fwd - 1,
				q->ack_rev, 0);

			/* Append to the per-family chain through m_nextpkt. */
			switch (q->id.addr_type) {
			case 4:
				if (m != NULL) {
					*mtailp = m;
					mtailp = &(*mtailp)->m_nextpkt;
				}
				if (mnext != NULL) {
					*mtailp = mnext;
					mtailp = &(*mtailp)->m_nextpkt;
				}
				break;
#ifdef INET6
			case 6:
				if (m != NULL) {
					*m6_tailp = m;
					m6_tailp = &(*m6_tailp)->m_nextpkt;
				}
				if (mnext != NULL) {
					*m6_tailp = mnext;
					m6_tailp = &(*m6_tailp)->m_nextpkt;
				}
				break;
#endif
			}
		}
	}
	IPFW_DYN_UNLOCK();
	/* Lock dropped: now hand the queued packets to the output routines. */
	for (m = m0; m != NULL; m = mnext) {
		mnext = m->m_nextpkt;
		m->m_nextpkt = NULL;
		ip_output(m, NULL, NULL, 0, NULL, NULL);
	}
#ifdef INET6
	for (m = m6; m != NULL; m = mnext) {
		mnext = m->m_nextpkt;
		m->m_nextpkt = NULL;
		ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
	}
#endif
done:
	callout_reset_on(&V_ipfw_timeout, V_dyn_keepalive_period * hz,
	    ipfw_tick, vnetx, 0);
	CURVNET_RESTORE();
}

/* Create the allocation zone for dynamic rules and initialize their lock. */
void
ipfw_dyn_attach(void)
{
	ipfw_dyn_rule_zone = uma_zcreate("IPFW dynamic rule",
	    sizeof(ipfw_dyn_rule), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);

	IPFW_DYN_LOCK_INIT();
}

/* Undo ipfw_dyn_attach(): destroy the zone and the lock. */
void
ipfw_dyn_detach(void)
{
	uma_zdestroy(ipfw_dyn_rule_zone);
	IPFW_DYN_LOCK_DESTROY();
}

/* Set the default tunables and start the keepalive callout. */
void
ipfw_dyn_init(void)
{
	V_ipfw_dyn_v = NULL;
	V_dyn_buckets = 256;		/* must be power of 2 */
	V_curr_dyn_buckets = 256;	/* must be power of 2 */

	V_dyn_ack_lifetime = 300;
	V_dyn_syn_lifetime = 20;
	V_dyn_fin_lifetime = 1;
	V_dyn_rst_lifetime = 1;
	V_dyn_udp_lifetime = 10;
	V_dyn_short_lifetime = 5;

	V_dyn_keepalive_interval = 20;
	V_dyn_keepalive_period = 5;
	V_dyn_keepalive = 1;		/* do send keepalives */

	V_dyn_max = 4096;		/* max # of dynamic rules */
	callout_init(&V_ipfw_timeout, CALLOUT_MPSAFE);
	callout_reset_on(&V_ipfw_timeout, hz, ipfw_tick, curvnet, 0);
}

/*
 * Two-pass teardown: pass 0 drains the keepalive callout, any other
 * pass frees the bucket array if one was allocated.
 */
void
ipfw_dyn_uninit(int pass)
{
	if (pass == 0)
		callout_drain(&V_ipfw_timeout);
	else {
		if (V_ipfw_dyn_v != NULL)
			free(V_ipfw_dyn_v, M_IPFW);
	}
}

/*
 * Return V_dyn_count * sizeof(ipfw_dyn_rule), or 0 when the bucket
 * array has not been allocated.
 */
int
ipfw_dyn_len(void)
{
	return (V_ipfw_dyn_v == NULL) ? 0 :
		(V_dyn_count * sizeof(ipfw_dyn_rule));
}

/*
 * Copy the dynamic rules into the buffer starting at *pbp and bounded
 * by ep; rules that do not fit are skipped. On return *pbp points just
 * past the last rule written. Nothing is done (and *pbp is untouched)
 * when the bucket array is not allocated.
 */
void
ipfw_get_dynamic(char **pbp, const char *ep)
{
	ipfw_dyn_rule *p, *last = NULL;
	char *bp;
	int i;

	if (V_ipfw_dyn_v == NULL)
		return;
	bp = *pbp;

	IPFW_DYN_LOCK();
	for (i = 0 ; i < V_curr_dyn_buckets; i++)
		for (p = V_ipfw_dyn_v[i] ; p != NULL; p = p->next) {
			if (bp + sizeof *p <= ep) {
				ipfw_dyn_rule *dst =
					(ipfw_dyn_rule *)bp;
				bcopy(p, dst, sizeof *p);
				/* overwrite the rule pointer with the rule number */
				bcopy(&(p->rule->rulenum), &(dst->rule),
				    sizeof(p->rule->rulenum));
				/*
				 * store set number into high word of
				 * dst->rule pointer.
				 */
				bcopy(&(p->rule->set),
				    (char *)&dst->rule +
				    sizeof(p->rule->rulenum),
				    sizeof(p->rule->set));
				/*
				 * store a non-null value in "next".
				 * The userland code will interpret a
				 * NULL here as a marker
				 * for the last dynamic rule.
				 */
				bcopy(&dst, &dst->next, sizeof(dst));
				last = dst;
				/* export the remaining lifetime, 0 if expired */
				dst->expire =
				    TIME_LEQ(dst->expire, time_uptime) ?
					0 : dst->expire - time_uptime ;
				bp += sizeof(ipfw_dyn_rule);
			}
		}
	IPFW_DYN_UNLOCK();
	if (last != NULL) /* mark last dynamic rule */
		bzero(&last->next, sizeof(last));
	*pbp = bp;
}
/* end of file */
ipfw-user/sys/netinet/ipfw/dn_sched_fifo.c000644 000423 000000 00000007266 12006744005 021415 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution.
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD: head/sys/netinet/ipfw/dn_sched_fifo.c 204591 2010-03-02 17:40:48Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif /* * This file implements a FIFO scheduler for a single queue. * The queue is allocated as part of the scheduler instance, * and there is a single flowset is in the template which stores * queue size and policy. * Enqueue and dequeue use the default library functions. */ static int fifo_enqueue(struct dn_sch_inst *si, struct dn_queue *q, struct mbuf *m) { /* XXX if called with q != NULL and m=NULL, this is a * re-enqueue from an existing scheduler, which we should * handle. 
*/ return dn_enqueue((struct dn_queue *)(si+1), m, 0); } static struct mbuf * fifo_dequeue(struct dn_sch_inst *si) { return dn_dequeue((struct dn_queue *)(si + 1)); } static int fifo_new_sched(struct dn_sch_inst *si) { /* This scheduler instance contains the queue */ struct dn_queue *q = (struct dn_queue *)(si + 1); set_oid(&q->ni.oid, DN_QUEUE, sizeof(*q)); q->_si = si; q->fs = si->sched->fs; return 0; } static int fifo_free_sched(struct dn_sch_inst *si) { struct dn_queue *q = (struct dn_queue *)(si + 1); dn_free_pkts(q->mq.head); bzero(q, sizeof(*q)); return 0; } /* * FIFO scheduler descriptor * contains the type of the scheduler, the name, the size of extra * data structures, and function pointers. */ static struct dn_alg fifo_desc = { _SI( .type = ) DN_SCHED_FIFO, _SI( .name = ) "FIFO", _SI( .flags = ) 0, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct dn_queue), _SI( .q_datalen = ) 0, _SI( .enqueue = ) fifo_enqueue, _SI( .dequeue = ) fifo_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) fifo_new_sched, _SI( .free_sched = ) fifo_free_sched, _SI( .new_fsk = ) NULL, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) NULL, _SI( .free_queue = ) NULL, }; DECLARE_DNSCHED_MODULE(dn_fifo, &fifo_desc); ipfw-user/sys/netinet/ipfw/dn_sched_rr.c000644 000423 000000 00000016176 12006744005 021115 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD: head/sys/netinet/ipfw/dn_sched_rr.c 206845 2010-04-19 16:17:30Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #define DN_SCHED_RR 3 // XXX Where? 
struct rr_queue { struct dn_queue q; /* Standard queue */ int status; /* 1: queue is in the list */ int credit; /* Number of bytes to transmit */ int quantum; /* quantum * C */ struct rr_queue *qnext; /* */ }; /* struct rr_schk contains global config parameters * and is right after dn_schk */ struct rr_schk { int min_q; /* Min quantum */ int max_q; /* Max quantum */ int q_bytes; /* Bytes per quantum */ }; /* per-instance round robin list, right after dn_sch_inst */ struct rr_si { struct rr_queue *head, *tail; /* Pointer to current queue */ }; /* Append a queue to the rr list */ static inline void rr_append(struct rr_queue *q, struct rr_si *si) { q->status = 1; /* mark as in-rr_list */ q->credit = q->quantum; /* initialize credit */ /* append to the tail */ if (si->head == NULL) si->head = q; else si->tail->qnext = q; si->tail = q; /* advance the tail pointer */ q->qnext = si->head; /* make it circular */ } /* Remove the head queue from circular list. */ static inline void rr_remove_head(struct rr_si *si) { if (si->head == NULL) return; /* empty queue */ si->head->status = 0; if (si->head == si->tail) { si->head = si->tail = NULL; return; } si->head = si->head->qnext; si->tail->qnext = si->head; } /* Remove a queue from circular list. 
* XXX see if ti can be merge with remove_queue() */ static inline void remove_queue_q(struct rr_queue *q, struct rr_si *si) { struct rr_queue *prev; if (q->status != 1) return; if (q == si->head) { rr_remove_head(si); return; } for (prev = si->head; prev; prev = prev->qnext) { if (prev->qnext != q) continue; prev->qnext = q->qnext; if (q == si->tail) si->tail = prev; q->status = 0; break; } } static inline void next_pointer(struct rr_si *si) { if (si->head == NULL) return; /* empty queue */ si->head = si->head->qnext; si->tail = si->tail->qnext; } static int rr_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m) { struct rr_si *si; struct rr_queue *rrq; if (m != q->mq.head) { if (dn_enqueue(q, m, 0)) /* packet was dropped */ return 1; if (m != q->mq.head) return 0; } /* If reach this point, queue q was idle */ si = (struct rr_si *)(_si + 1); rrq = (struct rr_queue *)q; if (rrq->status == 1) /* Queue is already in the queue list */ return 0; /* Insert the queue in the queue list */ rr_append(rrq, si); return 0; } static struct mbuf * rr_dequeue(struct dn_sch_inst *_si) { /* Access scheduler instance private data */ struct rr_si *si = (struct rr_si *)(_si + 1); struct rr_queue *rrq; uint64_t len; while ( (rrq = si->head) ) { struct mbuf *m = rrq->q.mq.head; if ( m == NULL) { /* empty queue, remove from list */ rr_remove_head(si); continue; } len = m->m_pkthdr.len; if (len > rrq->credit) { /* Packet too big */ rrq->credit += rrq->quantum; /* Try next queue */ next_pointer(si); } else { rrq->credit -= len; return dn_dequeue(&rrq->q); } } /* no packet to dequeue*/ return NULL; } static int rr_config(struct dn_schk *_schk) { struct rr_schk *schk = (struct rr_schk *)(_schk + 1); ND("called"); /* use reasonable quantums (64..2k bytes, default 1500) */ schk->min_q = 64; schk->max_q = 2048; schk->q_bytes = 1500; /* quantum */ return 0; } static int rr_new_sched(struct dn_sch_inst *_si) { struct rr_si *si = (struct rr_si *)(_si + 1); ND("called"); si->head = 
si->tail = NULL; return 0; } static int rr_free_sched(struct dn_sch_inst *_si) { ND("called"); /* Nothing to do? */ return 0; } static int rr_new_fsk(struct dn_fsk *fs) { struct rr_schk *schk = (struct rr_schk *)(fs->sched + 1); /* par[0] is the weight, par[1] is the quantum step */ ipdn_bound_var(&fs->fs.par[0], 1, 1, 65536, "RR weight"); ipdn_bound_var(&fs->fs.par[1], schk->q_bytes, schk->min_q, schk->max_q, "RR quantum"); return 0; } static int rr_new_queue(struct dn_queue *_q) { struct rr_queue *q = (struct rr_queue *)_q; _q->ni.oid.subtype = DN_SCHED_RR; q->quantum = _q->fs->fs.par[0] * _q->fs->fs.par[1]; ND("called, q->quantum %d", q->quantum); q->credit = q->quantum; q->status = 0; if (_q->mq.head != NULL) { /* Queue NOT empty, insert in the queue list */ rr_append(q, (struct rr_si *)(_q->_si + 1)); } return 0; } static int rr_free_queue(struct dn_queue *_q) { struct rr_queue *q = (struct rr_queue *)_q; ND("called"); if (q->status == 1) { struct rr_si *si = (struct rr_si *)(_q->_si + 1); remove_queue_q(q, si); } return 0; } /* * RR scheduler descriptor * contains the type of the scheduler, the name, the size of the * structures and function pointers. */ static struct dn_alg rr_desc = { _SI( .type = ) DN_SCHED_RR, _SI( .name = ) "RR", _SI( .flags = ) DN_MULTIQUEUE, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct rr_si), _SI( .q_datalen = ) sizeof(struct rr_queue) - sizeof(struct dn_queue), _SI( .enqueue = ) rr_enqueue, _SI( .dequeue = ) rr_dequeue, _SI( .config = ) rr_config, _SI( .destroy = ) NULL, _SI( .new_sched = ) rr_new_sched, _SI( .free_sched = ) rr_free_sched, _SI( .new_fsk = ) rr_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) rr_new_queue, _SI( .free_queue = ) rr_free_queue, }; DECLARE_DNSCHED_MODULE(dn_rr, &rr_desc); ipfw-user/sys/netinet/ipfw/ip_fw_table.c000644 000423 000000 00000046557 12006744005 021124 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_table.c 238265 2012-07-08 21:13:04Z melifaro $"); /* * Lookup table support for ipfw * * Lookup tables are implemented (at the moment) using the radix * tree used for routing tables. Tables store key-value entries, where * keys are network prefixes (addr/masklen), and values are integers. * As a degenerate case we can interpret keys as 32-bit integers * (with a /32 mask). * * The table is protected by the IPFW lock even for manipulation coming * from userland, because operations are typically fast. */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. 
#endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include #include #include /* struct ipfw_rule_ref */ #include #include /* LIST_HEAD */ #include #ifdef MAC #include #endif static MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); struct table_entry { struct radix_node rn[2]; struct sockaddr_in addr, mask; u_int32_t value; }; struct xaddr_iface { uint8_t if_len; /* length of this struct */ uint8_t pad[7]; /* Align name */ char ifname[IF_NAMESIZE]; /* Interface name */ }; struct table_xentry { struct radix_node rn[2]; union { #ifdef INET6 struct sockaddr_in6 addr6; #endif struct xaddr_iface iface; } a; union { #ifdef INET6 struct sockaddr_in6 mask6; #endif struct xaddr_iface ifmask; } m; u_int32_t value; }; /* * The radix code expects addr and mask to be array of bytes, * with the first byte being the length of the array. rn_inithead * is called with the offset in bits of the lookup key within the * array. If we use a sockaddr_in as the underlying type, * sin_len is conveniently located at offset 0, sin_addr is at * offset 4 and normally aligned. 
* But for portability, let's avoid assumption and make the code explicit */ #define KEY_LEN(v) *((uint8_t *)&(v)) #define KEY_OFS (8*offsetof(struct sockaddr_in, sin_addr)) /* * Do not require radix to compare more than actual IPv4/IPv6 address */ #define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) #define KEY_LEN_INET6 (offsetof(struct sockaddr_in6, sin6_addr) + sizeof(struct in6_addr)) #define KEY_LEN_IFACE (offsetof(struct xaddr_iface, ifname)) #define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) #define OFF_LEN_INET6 (8 * offsetof(struct sockaddr_in6, sin6_addr)) #define OFF_LEN_IFACE (8 * offsetof(struct xaddr_iface, ifname)) static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask) { uint32_t *cp; for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) *cp++ = 0xFFFFFFFF; *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); } int ipfw_add_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, uint8_t plen, uint8_t mlen, uint8_t type, uint32_t value) { struct radix_node_head *rnh, **rnh_ptr; struct table_entry *ent; struct table_xentry *xent; struct radix_node *rn; in_addr_t addr; int offset; void *ent_ptr; struct sockaddr *addr_ptr, *mask_ptr; char c; if (tbl >= V_fw_tables_max) return (EINVAL); switch (type) { case IPFW_TABLE_CIDR: if (plen == sizeof(in_addr_t)) { #ifdef INET /* IPv4 case */ if (mlen > 32) return (EINVAL); ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); ent->value = value; /* Set 'total' structure length */ KEY_LEN(ent->addr) = KEY_LEN_INET; KEY_LEN(ent->mask) = KEY_LEN_INET; /* Set offset of IPv4 address in bits */ offset = OFF_LEN_INET; ent->mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); addr = *((in_addr_t *)paddr); ent->addr.sin_addr.s_addr = addr & ent->mask.sin_addr.s_addr; /* Set pointers */ rnh_ptr = &ch->tables[tbl]; ent_ptr = ent; addr_ptr = (struct sockaddr *)&ent->addr; mask_ptr = (struct sockaddr *)&ent->mask; #endif #ifdef INET6 } else if (plen == sizeof(struct in6_addr)) { /* IPv6 case */ if (mlen > 128) return (EINVAL); xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); xent->value = value; /* Set 'total' structure length */ KEY_LEN(xent->a.addr6) = KEY_LEN_INET6; KEY_LEN(xent->m.mask6) = KEY_LEN_INET6; /* Set offset of IPv6 address in bits */ offset = OFF_LEN_INET6; ipv6_writemask(&xent->m.mask6.sin6_addr, mlen); memcpy(&xent->a.addr6.sin6_addr, paddr, sizeof(struct in6_addr)); APPLY_MASK(&xent->a.addr6.sin6_addr, &xent->m.mask6.sin6_addr); /* Set pointers */ rnh_ptr = &ch->xtables[tbl]; ent_ptr = xent; addr_ptr = (struct sockaddr *)&xent->a.addr6; mask_ptr = (struct sockaddr *)&xent->m.mask6; #endif } else { /* Unknown CIDR type */ return (EINVAL); } break; case IPFW_TABLE_INTERFACE: /* Check if string is terminated */ c = ((char *)paddr)[IF_NAMESIZE - 1]; ((char *)paddr)[IF_NAMESIZE - 1] = '\0'; if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0')) return (EINVAL); /* Include last \0 into comparison */ mlen++; xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); xent->value = value; /* Set 'total' structure length */ KEY_LEN(xent->a.iface) = KEY_LEN_IFACE + mlen; KEY_LEN(xent->m.ifmask) = KEY_LEN_IFACE + mlen; /* Set offset of interface name in bits */ offset = OFF_LEN_IFACE; memcpy(xent->a.iface.ifname, paddr, mlen); /* Assume direct match */ /* TODO: Add interface pattern matching */ #if 0 memset(xent->m.ifmask.ifname, 0xFF, IF_NAMESIZE); mask_ptr = (struct sockaddr *)&xent->m.ifmask; #endif /* Set pointers */ rnh_ptr = &ch->xtables[tbl]; ent_ptr = xent; addr_ptr = (struct sockaddr *)&xent->a.iface; mask_ptr = NULL; break; default: return (EINVAL); } 
IPFW_WLOCK(ch); /* Check if tabletype is valid */ if ((ch->tabletype[tbl] != 0) && (ch->tabletype[tbl] != type)) { IPFW_WUNLOCK(ch); free(ent_ptr, M_IPFW_TBL); return (EINVAL); } /* Check if radix tree exists */ if ((rnh = *rnh_ptr) == NULL) { IPFW_WUNLOCK(ch); /* Create radix for a new table */ if (!rn_inithead((void **)&rnh, offset)) { free(ent_ptr, M_IPFW_TBL); return (ENOMEM); } IPFW_WLOCK(ch); if (*rnh_ptr != NULL) { /* Tree is already attached by other thread */ rn_detachhead((void **)&rnh); rnh = *rnh_ptr; /* Check table type another time */ if (ch->tabletype[tbl] != type) { IPFW_WUNLOCK(ch); free(ent_ptr, M_IPFW_TBL); return (EINVAL); } } else { *rnh_ptr = rnh; /* * Set table type. It can be set already * (if we have IPv6-only table) but setting * it another time does not hurt */ ch->tabletype[tbl] = type; } } rn = rnh->rnh_addaddr(addr_ptr, mask_ptr, rnh, ent_ptr); IPFW_WUNLOCK(ch); if (rn == NULL) { free(ent_ptr, M_IPFW_TBL); return (EEXIST); } return (0); } int ipfw_del_table_entry(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, uint8_t plen, uint8_t mlen, uint8_t type) { struct radix_node_head *rnh, **rnh_ptr; struct table_entry *ent; in_addr_t addr; struct sockaddr_in sa, mask; struct sockaddr *sa_ptr, *mask_ptr; char c; if (tbl >= V_fw_tables_max) return (EINVAL); switch (type) { case IPFW_TABLE_CIDR: if (plen == sizeof(in_addr_t)) { /* Set 'total' structure length */ KEY_LEN(sa) = KEY_LEN_INET; KEY_LEN(mask) = KEY_LEN_INET; mask.sin_addr.s_addr = htonl(mlen ? 
~((1 << (32 - mlen)) - 1) : 0); addr = *((in_addr_t *)paddr); sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr; rnh_ptr = &ch->tables[tbl]; sa_ptr = (struct sockaddr *)&sa; mask_ptr = (struct sockaddr *)&mask; #ifdef INET6 } else if (plen == sizeof(struct in6_addr)) { /* IPv6 case */ if (mlen > 128) return (EINVAL); struct sockaddr_in6 sa6, mask6; memset(&sa6, 0, sizeof(struct sockaddr_in6)); memset(&mask6, 0, sizeof(struct sockaddr_in6)); /* Set 'total' structure length */ KEY_LEN(sa6) = KEY_LEN_INET6; KEY_LEN(mask6) = KEY_LEN_INET6; ipv6_writemask(&mask6.sin6_addr, mlen); memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr)); APPLY_MASK(&sa6.sin6_addr, &mask6.sin6_addr); rnh_ptr = &ch->xtables[tbl]; sa_ptr = (struct sockaddr *)&sa6; mask_ptr = (struct sockaddr *)&mask6; #endif } else { /* Unknown CIDR type */ return (EINVAL); } break; case IPFW_TABLE_INTERFACE: /* Check if string is terminated */ c = ((char *)paddr)[IF_NAMESIZE - 1]; ((char *)paddr)[IF_NAMESIZE - 1] = '\0'; if (((mlen = strlen((char *)paddr)) == IF_NAMESIZE - 1) && (c != '\0')) return (EINVAL); struct xaddr_iface ifname, ifmask; memset(&ifname, 0, sizeof(ifname)); /* Include last \0 into comparison */ mlen++; /* Set 'total' structure length */ KEY_LEN(ifname) = KEY_LEN_IFACE + mlen; KEY_LEN(ifmask) = KEY_LEN_IFACE + mlen; /* Assume direct match */ /* FIXME: Add interface pattern matching */ #if 0 memset(ifmask.ifname, 0xFF, IF_NAMESIZE); mask_ptr = (struct sockaddr *)&ifmask; #endif mask_ptr = NULL; memcpy(ifname.ifname, paddr, mlen); /* Set pointers */ rnh_ptr = &ch->xtables[tbl]; sa_ptr = (struct sockaddr *)&ifname; break; default: return (EINVAL); } IPFW_WLOCK(ch); if ((rnh = *rnh_ptr) == NULL) { IPFW_WUNLOCK(ch); return (ESRCH); } if (ch->tabletype[tbl] != type) { IPFW_WUNLOCK(ch); return (EINVAL); } ent = (struct table_entry *)rnh->rnh_deladdr(sa_ptr, mask_ptr, rnh); IPFW_WUNLOCK(ch); if (ent == NULL) return (ESRCH); free(ent, M_IPFW_TBL); return (0); } static int flush_table_entry(struct 
radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct table_entry *ent; ent = (struct table_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } int ipfw_flush_table(struct ip_fw_chain *ch, uint16_t tbl) { struct radix_node_head *rnh, *xrnh; if (tbl >= V_fw_tables_max) return (EINVAL); /* * We free both (IPv4 and extended) radix trees and * clear table type here to permit table to be reused * for different type without module reload */ IPFW_WLOCK(ch); /* Set IPv4 table pointer to zero */ if ((rnh = ch->tables[tbl]) != NULL) ch->tables[tbl] = NULL; /* Set extended table pointer to zero */ if ((xrnh = ch->xtables[tbl]) != NULL) ch->xtables[tbl] = NULL; /* Zero table type */ ch->tabletype[tbl] = 0; IPFW_WUNLOCK(ch); if (rnh != NULL) { rnh->rnh_walktree(rnh, flush_table_entry, rnh); rn_detachhead((void **)&rnh); } if (xrnh != NULL) { xrnh->rnh_walktree(xrnh, flush_table_entry, xrnh); rn_detachhead((void **)&xrnh); } return (0); } void ipfw_destroy_tables(struct ip_fw_chain *ch) { uint16_t tbl; /* Flush all tables */ for (tbl = 0; tbl < V_fw_tables_max; tbl++) ipfw_flush_table(ch, tbl); /* Free pointers itself */ free(ch->tables, M_IPFW); free(ch->xtables, M_IPFW); free(ch->tabletype, M_IPFW); } int ipfw_init_tables(struct ip_fw_chain *ch) { /* Allocate pointers */ ch->tables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); ch->xtables = malloc(V_fw_tables_max * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); ch->tabletype = malloc(V_fw_tables_max * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO); return (0); } int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables) { struct radix_node_head **tables, **xtables, *rnh; struct radix_node_head **tables_old, **xtables_old; uint8_t *tabletype, *tabletype_old; unsigned int ntables_old, tbl; /* Check new value for validity */ if (ntables > IPFW_TABLES_MAX) ntables = IPFW_TABLES_MAX; /* Allocate new pointers */ tables = 
malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); xtables = malloc(ntables * sizeof(void *), M_IPFW, M_WAITOK | M_ZERO); tabletype = malloc(ntables * sizeof(uint8_t), M_IPFW, M_WAITOK | M_ZERO); IPFW_WLOCK(ch); tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables; /* Copy old table pointers */ memcpy(tables, ch->tables, sizeof(void *) * tbl); memcpy(xtables, ch->xtables, sizeof(void *) * tbl); memcpy(tabletype, ch->tabletype, sizeof(uint8_t) * tbl); /* Change pointers and number of tables */ tables_old = ch->tables; xtables_old = ch->xtables; tabletype_old = ch->tabletype; ch->tables = tables; ch->xtables = xtables; ch->tabletype = tabletype; ntables_old = V_fw_tables_max; V_fw_tables_max = ntables; IPFW_WUNLOCK(ch); /* Check if we need to destroy radix trees */ if (ntables < ntables_old) { for (tbl = ntables; tbl < ntables_old; tbl++) { if ((rnh = tables_old[tbl]) != NULL) { rnh->rnh_walktree(rnh, flush_table_entry, rnh); rn_detachhead((void **)&rnh); } if ((rnh = xtables_old[tbl]) != NULL) { rnh->rnh_walktree(rnh, flush_table_entry, rnh); rn_detachhead((void **)&rnh); } } } /* Free old pointers */ free(tables_old, M_IPFW); free(xtables_old, M_IPFW); free(tabletype_old, M_IPFW); return (0); } int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val) { struct radix_node_head *rnh; struct table_entry *ent; struct sockaddr_in sa; if (tbl >= V_fw_tables_max) return (0); if ((rnh = ch->tables[tbl]) == NULL) return (0); KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = addr; ent = (struct table_entry *)(rnh->rnh_lookup(&sa, NULL, rnh)); if (ent != NULL) { *val = ent->value; return (1); } return (0); } int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, void *paddr, uint32_t *val, int type) { struct radix_node_head *rnh; struct table_xentry *xent; struct sockaddr_in6 sa6; struct xaddr_iface iface; if (tbl >= V_fw_tables_max) return (0); if ((rnh = ch->xtables[tbl]) == NULL) return (0); switch (type) { 
case IPFW_TABLE_CIDR: KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, paddr, sizeof(struct in6_addr)); xent = (struct table_xentry *)(rnh->rnh_lookup(&sa6, NULL, rnh)); break; case IPFW_TABLE_INTERFACE: KEY_LEN(iface) = KEY_LEN_IFACE + strlcpy(iface.ifname, (char *)paddr, IF_NAMESIZE) + 1; /* Assume direct match */ /* FIXME: Add interface pattern matching */ xent = (struct table_xentry *)(rnh->rnh_lookup(&iface, NULL, rnh)); break; default: return (0); } if (xent != NULL) { *val = xent->value; return (1); } return (0); } static int count_table_entry(struct radix_node *rn, void *arg) { u_int32_t * const cnt = arg; (*cnt)++; return (0); } int ipfw_count_table(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) { struct radix_node_head *rnh; if (tbl >= V_fw_tables_max) return (EINVAL); *cnt = 0; if ((rnh = ch->tables[tbl]) == NULL) return (0); rnh->rnh_walktree(rnh, count_table_entry, cnt); return (0); } static int dump_table_entry(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_table * const tbl = arg; ipfw_table_entry *ent; if (tbl->cnt == tbl->size) return (1); ent = &tbl->ent[tbl->cnt]; ent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) ent->masklen = 0; else ent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); ent->addr = n->addr.sin_addr.s_addr; ent->value = n->value; tbl->cnt++; return (0); } int ipfw_dump_table(struct ip_fw_chain *ch, ipfw_table *tbl) { struct radix_node_head *rnh; if (tbl->tbl >= V_fw_tables_max) return (EINVAL); tbl->cnt = 0; if ((rnh = ch->tables[tbl->tbl]) == NULL) return (0); rnh->rnh_walktree(rnh, dump_table_entry, tbl); return (0); } static int count_table_xentry(struct radix_node *rn, void *arg) { uint32_t * const cnt = arg; (*cnt) += sizeof(ipfw_table_xentry); return (0); } int ipfw_count_xtable(struct ip_fw_chain *ch, uint32_t tbl, uint32_t *cnt) { struct radix_node_head *rnh; if (tbl >= V_fw_tables_max) return (EINVAL); *cnt = 0; if ((rnh = ch->tables[tbl]) != NULL) 
rnh->rnh_walktree(rnh, count_table_xentry, cnt); if ((rnh = ch->xtables[tbl]) != NULL) rnh->rnh_walktree(rnh, count_table_xentry, cnt); /* Return zero if table is empty */ if (*cnt > 0) (*cnt) += sizeof(ipfw_xtable); return (0); } static int dump_table_xentry_base(struct radix_node *rn, void *arg) { struct table_entry * const n = (struct table_entry *)rn; ipfw_xtable * const tbl = arg; ipfw_table_xentry *xent; /* Out of memory, returning */ if (tbl->cnt == tbl->size) return (1); xent = &tbl->xent[tbl->cnt]; xent->len = sizeof(ipfw_table_xentry); xent->tbl = tbl->tbl; if (in_nullhost(n->mask.sin_addr)) xent->masklen = 0; else xent->masklen = 33 - ffs(ntohl(n->mask.sin_addr.s_addr)); /* Save IPv4 address as deprecated IPv6 compatible */ xent->k.addr6.s6_addr32[3] = n->addr.sin_addr.s_addr; xent->value = n->value; tbl->cnt++; return (0); } static int dump_table_xentry_extended(struct radix_node *rn, void *arg) { struct table_xentry * const n = (struct table_xentry *)rn; ipfw_xtable * const tbl = arg; ipfw_table_xentry *xent; #ifdef INET6 int i; uint32_t *v; #endif /* Out of memory, returning */ if (tbl->cnt == tbl->size) return (1); xent = &tbl->xent[tbl->cnt]; xent->len = sizeof(ipfw_table_xentry); xent->tbl = tbl->tbl; switch (tbl->type) { #ifdef INET6 case IPFW_TABLE_CIDR: /* Count IPv6 mask */ v = (uint32_t *)&n->m.mask6.sin6_addr; for (i = 0; i < sizeof(struct in6_addr) / 4; i++, v++) xent->masklen += bitcount32(*v); memcpy(&xent->k, &n->a.addr6.sin6_addr, sizeof(struct in6_addr)); break; #endif case IPFW_TABLE_INTERFACE: /* Assume exact mask */ xent->masklen = 8 * IF_NAMESIZE; memcpy(&xent->k, &n->a.iface.ifname, IF_NAMESIZE); break; default: /* unknown, skip entry */ return (0); } xent->value = n->value; tbl->cnt++; return (0); } int ipfw_dump_xtable(struct ip_fw_chain *ch, ipfw_xtable *tbl) { struct radix_node_head *rnh; if (tbl->tbl >= V_fw_tables_max) return (EINVAL); tbl->cnt = 0; tbl->type = ch->tabletype[tbl->tbl]; if ((rnh = ch->tables[tbl->tbl]) != 
NULL) rnh->rnh_walktree(rnh, dump_table_xentry_base, tbl); if ((rnh = ch->xtables[tbl->tbl]) != NULL) rnh->rnh_walktree(rnh, dump_table_xentry_extended, tbl); return (0); } /* end of file */ ipfw-user/sys/netinet/ipfw/dn_sched.h000644 000423 000000 00000015411 12007700547 020412 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Riccardo Panicucci, Luigi Rizzo, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The API to write a packet scheduling algorithm for dummynet. * * $FreeBSD: head/sys/netinet/ipfw/dn_sched.h 213267 2010-09-29 09:40:20Z luigi $ */ #ifndef _DN_SCHED_H #define _DN_SCHED_H #define DN_MULTIQUEUE 0x01 /* * Descriptor for a scheduling algorithm. 
* Contains all function pointers for a given scheduler * This is typically created when a module is loaded, and stored * in a global list of schedulers. */ struct dn_alg { uint32_t type; /* the scheduler type */ const char *name; /* scheduler name */ uint32_t flags; /* DN_MULTIQUEUE if supports multiple queues */ /* * The following define the size of 3 optional data structures * that may need to be allocated at runtime, and are appended * to each of the base data structures: scheduler, sched.inst, * and queue. We don't have a per-flowset structure. */ /* + parameters attached to the template, e.g. * default queue sizes, weights, quantum size, and so on; */ size_t schk_datalen; /* + per-instance parameters, such as timestamps, * containers for queues, etc; */ size_t si_datalen; size_t q_datalen; /* per-queue parameters (e.g. S,F) */ /* * Methods implemented by the scheduler: * enqueue enqueue packet 'm' on scheduler 's', queue 'q'. * q is NULL for !MULTIQUEUE. * Return 0 on success, 1 on drop (packet consumed anyways). * Note that q should be interpreted only as a hint * on the flow that the mbuf belongs to: while a * scheduler will normally enqueue m into q, it is ok * to leave q alone and put the mbuf elsewhere. * This function is called in two cases: * - when a new packet arrives to the scheduler; * - when a scheduler is reconfigured. In this case the * call is issued by the new_queue callback, with a * non empty queue (q) and m pointing to the first * mbuf in the queue. For this reason, the function * should internally check for (m != q->mq.head) * before calling dn_enqueue(). * * dequeue Called when scheduler instance 's' can * dequeue a packet. Return NULL if none are available. * XXX what about non work-conserving ? 
* * config called on 'sched X config ...', normally writes * in the area of size sch_arg * * destroy called on 'sched delete', frees everything * in sch_arg (other parts are handled by more specific * functions) * * new_sched called when a new instance is created, e.g. * to create the local queue for !MULTIQUEUE, set V or * copy parameters for WFQ, and so on. * * free_sched called when deleting an instance, cleans * extra data in the per-instance area. * * new_fsk called when a flowset is linked to a scheduler, * e.g. to validate parameters such as weights etc. * free_fsk when a flowset is unlinked from a scheduler. * (probably unnecessary) * * new_queue called to set the per-queue parameters, * e.g. S and F, adjust sum of weights in the parent, etc. * * The new_queue callback is normally called when * creating a new queue. In some cases (such as a * scheduler change or reconfiguration) it can be called * with a non-empty queue. In that case the new_queue * callback may need to call the enqueue function; if so, * the callback should eventually call enqueue() passing * as m the first element in the queue. * * free_queue actions related to a queue removal, e.g. undo * all the above. If the queue has data in it, also remove * from the scheduler. This can e.g. happen during a reconfigure. 
*/ int (*enqueue)(struct dn_sch_inst *, struct dn_queue *, struct mbuf *); struct mbuf * (*dequeue)(struct dn_sch_inst *); int (*config)(struct dn_schk *); int (*destroy)(struct dn_schk*); int (*new_sched)(struct dn_sch_inst *); int (*free_sched)(struct dn_sch_inst *); int (*new_fsk)(struct dn_fsk *f); int (*free_fsk)(struct dn_fsk *f); int (*new_queue)(struct dn_queue *q); int (*free_queue)(struct dn_queue *q); /* run-time fields */ int ref_count; /* XXX number of instances in the system */ SLIST_ENTRY(dn_alg) next; /* Next scheduler in the list */ }; /* MSVC does not support initializers so we need this ugly macro */ #ifdef _WIN32 #define _SI(fld) #else #define _SI(fld) fld #endif /* * Additionally, dummynet exports some functions and macros * to be used by schedulers: */ void dn_free_pkts(struct mbuf *mnext); int dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop); /* bound a variable between min and max */ int ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg); /* * Extract the head of a queue, update stats. Must be the very last * thing done on a dequeue as the queue itself may go away. 
*/ static __inline struct mbuf* dn_dequeue(struct dn_queue *q) { struct mbuf *m = q->mq.head; if (m == NULL) return NULL; q->mq.head = m->m_nextpkt; q->mq.count--; /* Update stats for the queue */ q->ni.length--; q->ni.len_bytes -= m->m_pkthdr.len; if (q->_si) { q->_si->ni.length--; q->_si->ni.len_bytes -= m->m_pkthdr.len; } if (q->ni.length == 0) /* queue is now idle */ q->q_time = dn_cfg.curr_time; return m; } int dn_sched_modevent(module_t mod, int cmd, void *arg); #define DECLARE_DNSCHED_MODULE(name, dnsched) \ static moduledata_t name##_mod = { \ #name, dn_sched_modevent, dnsched \ }; \ DECLARE_MODULE(name, name##_mod, \ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); \ MODULE_DEPEND(name, dummynet, 3, 3, 3); #endif /* _DN_SCHED_H */ ipfw-user/sys/netinet/ipfw/dn_sched_qfq.c000644 000423 000000 00000055542 12007435564 021271 0ustar00luigiwheel000000 000000 /* * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * $FreeBSD: head/sys/netinet/ipfw/dn_sched_qfq.c 230614 2012-01-27 13:26:25Z luigi $ */ #ifdef _KERNEL #include #include #include #include #include #include #include /* IFNAMSIZ */ #include #include /* ipfw_rule_ref */ #include /* flow_id */ #include #include #include #include #else #include #endif #ifdef QFQ_DEBUG struct qfq_sched; static void dump_sched(struct qfq_sched *q, const char *msg); #define NO(x) x #else #define NO(x) #endif #define DN_SCHED_QFQ 4 // XXX Where? typedef unsigned long bitmap; /* * bitmaps ops are critical. Some linux versions have __fls * and the bitmap ops. Some machines have ffs */ #if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) int fls(unsigned int n) { int i = 0; for (i = 0; n > 0; n >>= 1, i++) ; return i; } #endif #if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24)) static inline unsigned long __fls(unsigned long word) { return fls(word) - 1; } #endif #if !defined(_KERNEL) || !defined(__linux__) #ifdef QFQ_DEBUG int test_bit(int ix, bitmap *p) { if (ix < 0 || ix > 31) D("bad index %d", ix); return *p & (1< 31) D("bad index %d", ix); *p |= (1< 31) D("bad index %d", ix); *p &= ~(1<index = 0 *.__grp->slot_shift where MIN_SLOT_SHIFT is derived by difference from the others. The max group index corresponds to Lmax/w_min, where Lmax=1<group mapping. 
Class weights are * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the * group with the smallest index that can support the L_i / r_i * configured for the class. * * grp->index is the index of the group; and grp->slot_shift * is the shift for the corresponding (scaled) sigma_i. * * When computing the group index, we do (len<i_wsum) #define IWSUM ((1< 0; } /* Round a precise timestamp to its slotted value. */ static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift) { return ts & ~((1ULL << shift) - 1); } /* return the pointer to the group with lowest index in the bitmap */ static inline struct qfq_group *qfq_ffs(struct qfq_sched *q, unsigned long bitmap) { int index = ffs(bitmap) - 1; // zero-based return &q->groups[index]; } /* * Calculate a flow index, given its weight and maximum packet length. * index = log_2(maxlen/weight) but we need to apply the scaling. * This is used only once at flow creation. */ static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen) { uint64_t slot_size = (uint64_t)maxlen *inv_w; unsigned long size_map; int index = 0; size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT); if (!size_map) goto out; index = __fls(size_map) + 1; // basically a log_2() index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1))); if (index < 0) index = 0; out: ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index); return index; } /*---- end support functions ----*/ /*-------- API calls --------------------------------*/ /* * Validate and copy parameters from flowset. */ static int qfq_new_queue(struct dn_queue *_q) { struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1); struct qfq_class *cl = (struct qfq_class *)_q; int i; uint32_t w; /* approximated weight */ /* import parameters from the flowset. They should be correct * already. 
*/
	w = _q->fs->fs.par[0];
	cl->lmax = _q->fs->fs.par[1];
	if (!w || w > QFQ_MAX_WEIGHT) {
		w = 1;
		D("rounding weight to 1");
	}
	/*
	 * Store the inverse weight, then recompute w from it so that the
	 * value added to wsum here is the same ONE_FP/inv_w that
	 * qfq_free_queue() subtracts later.
	 */
	cl->inv_w = ONE_FP/w;
	w = ONE_FP/cl->inv_w;
	if (q->wsum + w > QFQ_MAX_WSUM)
		return EINVAL;

	i = qfq_calc_index(cl->inv_w, cl->lmax);
	cl->grp = &q->groups[i];
	q->wsum += w;
	// XXX cl->S = q->V; ?
	// XXX compute q->i_wsum
	return 0;
}

/*
 * remove an empty queue: give its weight back to the scheduler's wsum.
 * A zero inv_w marks a class whose weight has already been released,
 * so calling this twice on the same queue subtracts only once.
 */
static int
qfq_free_queue(struct dn_queue *_q)
{
	/* the scheduler private data follows the dn_sch_inst in memory */
	struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
	struct qfq_class *cl = (struct qfq_class *)_q;
	if (cl->inv_w) {
		q->wsum -= ONE_FP/cl->inv_w;
		cl->inv_w = 0; /* reset weight to avoid run twice */
	}
	return 0;
}

/*
 * Calculate a mask to mimic what would be ffs_from():
 * returns 'bitmap' with all bits below position 'from' cleared.
 */
static inline unsigned long
mask_from(unsigned long bitmap, int from)
{
	return bitmap & ~((1UL << from) - 1);
}

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 * First compute eligibility comparing grp->S, q->V,
 * then check if someone is blocking us and possibly add EB
 */
static inline unsigned int
qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
{
	/* if S > V we are not eligible */
	unsigned int state = qfq_gt(grp->S, q->V);
	/* groups in ER whose index is >= ours */
	unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
	struct qfq_group *next;

	if (mask) {
		/* lowest-index such group; we are blocked if our F is larger */
		next = qfq_ffs(q, mask);
		if (qfq_gt(grp->F, next->F))
			state |= EB;
	}

	return state;
}

/*
 * In principle
 *	q->bitmaps[dst] |= q->bitmaps[src] & mask;
 *	q->bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void
qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
{
	q->bitmaps[dst] |= q->bitmaps[src] & mask;
	q->bitmaps[src] &= ~mask;
}

/*
 * Called after the group at 'index' changed or left ER; 'old_finish'
 * is the finish time it had before.  If the lowest-index group above
 * 'index' still in ER has a finish time not greater than old_finish,
 * nothing changes.  Otherwise every group with index below 'index' is
 * moved from EB to ER and from IB to IR.
 */
static inline void
qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
{
	unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(q, mask);
		if (!qfq_gt(next->F, old_finish))
			return;
	}

	/* all groups with index strictly below 'index' */
	mask = (1UL << index) - 1;
	qfq_move_groups(q, mask, EB, ER);
	qfq_move_groups(q, mask, IB, IR);
}

/* 
* perhaps * old_V ^= q->V; old_V >>= QFQ_MIN_SLOT_SHIFT; if (old_V) { ... } * */ static inline void qfq_make_eligible(struct qfq_sched *q, uint64_t old_V) { unsigned long mask, vslot, old_vslot; vslot = q->V >> QFQ_MIN_SLOT_SHIFT; old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT; if (vslot != old_vslot) { mask = (2UL << (__fls(vslot ^ old_vslot))) - 1; qfq_move_groups(q, mask, IR, ER); qfq_move_groups(q, mask, IB, EB); } } /* * XXX we should make sure that slot becomes less than 32. * This is guaranteed by the input values. * roundedS is always cl->S rounded on grp->slot_shift bits. */ static inline void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS) { uint64_t slot = (roundedS - grp->S) >> grp->slot_shift; unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS; cl->next = grp->slots[i]; grp->slots[i] = cl; __set_bit(slot, &grp->full_slots); } /* * remove the entry from the slot */ static inline void qfq_front_slot_remove(struct qfq_group *grp) { struct qfq_class **h = &grp->slots[grp->front]; *h = (*h)->next; if (!*h) __clear_bit(0, &grp->full_slots); } /* * Returns the first full queue in a group. As a side effect, * adjust the bucket list so the first non-empty bucket is at * position 0 in full_slots. */ static inline struct qfq_class * qfq_slot_scan(struct qfq_group *grp) { int i; ND("grp %d full %x", grp->index, grp->full_slots); if (!grp->full_slots) return NULL; i = ffs(grp->full_slots) - 1; // zero-based if (i > 0) { grp->front = (grp->front + i) % QFQ_MAX_SLOTS; grp->full_slots >>= i; } return grp->slots[grp->front]; } /* * adjust the bucket list. When the start time of a group decreases, * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to * move the objects. The mask of occupied slots must be shifted * because we use ffs() to find the first non-empty slot. * This covers decreases in the group's start time, but what about * increases of the start time ? 
* Here too we should make sure that i is less than 32
 */
static inline void
qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
{
	/* number of slots between the new start (roundedS) and the old one */
	unsigned int i = (grp->S - roundedS) >> grp->slot_shift;

	grp->full_slots <<= i;
	grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
}

/*
 * Re-evaluate eligibility after V changed from old_V.
 * Nothing is done unless some group is in IR or IB.  If in addition
 * ER is empty, V is first raised to the start time of the lowest-index
 * ineligible group when that start time is greater than V.
 */
static inline void
qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
{
	bitmap ineligible;

	ineligible = q->bitmaps[IR] | q->bitmaps[IB];
	if (ineligible) {
		if (!q->bitmaps[ER]) {
			struct qfq_group *grp;
			grp = qfq_ffs(q, ineligible);
			if (qfq_gt(grp->S, q->V))
				q->V = grp->S;
		}
		qfq_make_eligible(q, old_V);
	}
}

/*
 * Updates the class, returns true if also the group needs to be updated.
 *
 * The class start time becomes its previous finish time.  An empty
 * class is removed from the front slot.  Otherwise the finish time is
 * recomputed from the new head packet, and the class is moved to
 * another slot unless its rounded start still equals grp->S (in which
 * case 0 is returned and the group is left untouched).
 */
static inline int
qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
	    struct qfq_class *cl)
{

	cl->S = cl->F;
	if (cl->_q.mq.head == NULL)  {
		qfq_front_slot_remove(grp);
	} else {
		unsigned int len;
		uint64_t roundedS;

		len = cl->_q.mq.head->m_pkthdr.len;
		cl->F = cl->S + (uint64_t)len * cl->inv_w;
		roundedS = qfq_round_down(cl->S, grp->slot_shift);
		if (roundedS == grp->S)
			return 0;

		qfq_front_slot_remove(grp);
		qfq_slot_insert(grp, cl, roundedS);
	}
	return 1;
}

/*
 * dequeue callback of the scheduler.
 * Takes the lowest-index group in ER and dequeues one packet from the
 * class at the head of its front slot, advances V by len * IWSUM,
 * then updates the class, the group state and the eligible sets.
 * Returns the packet, or NULL when ER is empty or the selected class
 * has no packet.
 */
static struct mbuf *
qfq_dequeue(struct dn_sch_inst *si)
{
	/* the scheduler private data follows the dn_sch_inst in memory */
	struct qfq_sched *q = (struct qfq_sched *)(si + 1);
	struct qfq_group *grp;
	struct qfq_class *cl;
	struct mbuf *m;
	uint64_t old_V;

	NO(q->loops++;)
	if (!q->bitmaps[ER]) {
		NO(if (q->queued)
			dump_sched(q, "start dequeue");)
		return NULL;
	}

	grp = qfq_ffs(q, q->bitmaps[ER]);

	cl = grp->slots[grp->front];
	/* extract from the first bucket in the bucket list */
	m = dn_dequeue(&cl->_q);

	if (!m) {
		D("BUG/* non-workconserving leaf */");
		return NULL;
	}
	NO(q->queued--;)
	old_V = q->V;
	q->V += (uint64_t)m->m_pkthdr.len * IWSUM;
	ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);

	if (qfq_update_class(q, grp, cl)) {
		/* remember the old finish time for qfq_unblock_groups() */
		uint64_t old_F = grp->F;
		cl = qfq_slot_scan(grp);
		if (!cl) { /* group gone, remove from ER */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			// grp->S = grp->F + 1; // XXX debugging only
		} else {
			uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
			unsigned int s;

			/* group timestamps unchanged: no group can be unblocked */
			if (grp->S == roundedS)
				goto skip_unblock;
			grp->S = roundedS;
			grp->F = roundedS + (2ULL << grp->slot_shift);
			/* remove from ER and put in the new set */
			__clear_bit(grp->index, &q->bitmaps[ER]);
			s = qfq_calc_state(q, grp);
			__set_bit(grp->index, &q->bitmaps[s]);
		}
		/* we need to unblock even if the group has gone away */
		qfq_unblock_groups(q, grp->index, old_F);
	}

skip_unblock:
	qfq_update_eligible(q, old_V);
	NO(if (!q->bitmaps[ER] && q->queued)
		dump_sched(q, "end dequeue");)

	return m;
}

/*
 * Assign a reasonable start time for a new flow k in group i.
 * Admissible values for \hat(F) are multiples of \sigma_i
 * no greater than V+\sigma_i . Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in ER. So, if we have groups in ER, set S to
 * the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
*/ static inline void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) { unsigned long mask; uint64_t limit, roundedF; int slot_shift = cl->grp->slot_shift; roundedF = qfq_round_down(cl->F, slot_shift); limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift); if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) { /* timestamp was stale */ mask = mask_from(q->bitmaps[ER], cl->grp->index); if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { cl->S = next->F; return; } } cl->S = q->V; } else { /* timestamp is not stale */ cl->S = cl->F; } } static int qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; struct qfq_class *cl = (struct qfq_class *)_q; uint64_t roundedS; int s; NO(q->loops++;) DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len, _q, cl->inv_w, cl->grp->index); /* XXX verify that the packet obeys the parameters */ if (m != _q->mq.head) { if (dn_enqueue(_q, m, 0)) /* packet was dropped */ return 1; NO(q->queued++;) if (m != _q->mq.head) return 0; } /* If reach this point, queue q was idle */ grp = cl->grp; qfq_update_start(q, cl); /* adjust start time */ /* compute new finish time and rounded start. */ cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w; roundedS = qfq_round_down(cl->S, grp->slot_shift); /* * insert cl in the correct bucket. * If cl->S >= grp->S we don't need to adjust the * bucket list and simply go to the insertion phase. * Otherwise grp->S is decreasing, we must make room * in the bucket list, and also recompute the group state. * Finally, if there were no flows in this group and nobody * was in ER make sure to adjust V. 
*/ if (grp->full_slots) { if (!qfq_gt(grp->S, cl->S)) goto skip_update; /* create a slot for this cl->S */ qfq_slot_rotate(q, grp, roundedS); /* group was surely ineligible, remove */ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[IB]); } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V)) q->V = roundedS; grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); ND("new state %d 0x%x", s, q->bitmaps[s]); ND("S %llx F %llx V %llx", cl->S, cl->F, q->V); skip_update: qfq_slot_insert(grp, cl, roundedS); return 0; } #if 0 static inline void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp, struct qfq_class *cl, struct qfq_class **pprev) { unsigned int i, offset; uint64_t roundedS; roundedS = qfq_round_down(cl->S, grp->slot_shift); offset = (roundedS - grp->S) >> grp->slot_shift; i = (grp->front + offset) % QFQ_MAX_SLOTS; #ifdef notyet if (!pprev) { pprev = &grp->slots[i]; while (*pprev && *pprev != cl) pprev = &(*pprev)->next; } #endif *pprev = cl->next; if (!grp->slots[i]) __clear_bit(offset, &grp->full_slots); } /* * called to forcibly destroy a queue. * If the queue is not in the front bucket, or if it has * other queues in the front bucket, we can simply remove * the queue with no other side effects. * Otherwise we must propagate the event up. * XXX description to be completed. */ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl, struct qfq_class **pprev) { struct qfq_group *grp = &q->groups[cl->index]; unsigned long mask; uint64_t roundedS; int s; cl->F = cl->S; // not needed if the class goes away. qfq_slot_remove(q, grp, cl, pprev); if (!grp->full_slots) { /* nothing left in the group, remove from all sets. * Do ER last because if we were blocking other groups * we must unblock them. 
*/ __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); if (test_bit(grp->index, &q->bitmaps[ER]) && !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) { mask = q->bitmaps[ER] & ((1UL << grp->index) - 1); if (mask) mask = ~((1UL << __fls(mask)) - 1); else mask = ~0UL; qfq_move_groups(q, mask, EB, ER); qfq_move_groups(q, mask, IB, IR); } __clear_bit(grp->index, &q->bitmaps[ER]); } else if (!grp->slots[grp->front]) { cl = qfq_slot_scan(grp); roundedS = qfq_round_down(cl->S, grp->slot_shift); if (grp->S != roundedS) { __clear_bit(grp->index, &q->bitmaps[ER]); __clear_bit(grp->index, &q->bitmaps[IR]); __clear_bit(grp->index, &q->bitmaps[EB]); __clear_bit(grp->index, &q->bitmaps[IB]); grp->S = roundedS; grp->F = roundedS + (2ULL << grp->slot_shift); s = qfq_calc_state(q, grp); __set_bit(grp->index, &q->bitmaps[s]); } } qfq_update_eligible(q, q->V); } #endif static int qfq_new_fsk(struct dn_fsk *f) { ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight"); ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen"); ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]); return 0; } /* * initialize a new scheduler instance */ static int qfq_new_sched(struct dn_sch_inst *si) { struct qfq_sched *q = (struct qfq_sched *)(si + 1); struct qfq_group *grp; int i; for (i = 0; i <= QFQ_MAX_INDEX; i++) { grp = &q->groups[i]; grp->index = i; grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS - (QFQ_MAX_INDEX - i); } return 0; } /* * QFQ scheduler descriptor */ static struct dn_alg qfq_desc = { _SI( .type = ) DN_SCHED_QFQ, _SI( .name = ) "QFQ", _SI( .flags = ) DN_MULTIQUEUE, _SI( .schk_datalen = ) 0, _SI( .si_datalen = ) sizeof(struct qfq_sched), _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue), _SI( .enqueue = ) qfq_enqueue, _SI( .dequeue = ) qfq_dequeue, _SI( .config = ) NULL, _SI( .destroy = ) NULL, _SI( .new_sched = ) qfq_new_sched, _SI( .free_sched = ) NULL, _SI( .new_fsk = ) 
qfq_new_fsk, _SI( .free_fsk = ) NULL, _SI( .new_queue = ) qfq_new_queue, _SI( .free_queue = ) qfq_free_queue, }; DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc); #ifdef QFQ_DEBUG static void dump_groups(struct qfq_sched *q, uint32_t mask) { int i, j; for (i = 0; i < QFQ_MAX_INDEX + 1; i++) { struct qfq_group *g = &q->groups[i]; if (0 == (mask & (1<slots[j]) D(" bucket %d %p", j, g->slots[j]); } D("full_slots 0x%x", g->full_slots); D(" %2d S 0x%20llx F 0x%llx %c", i, g->S, g->F, mask & (1<loops, q->queued, q->V); D(" ER 0x%08x", q->bitmaps[ER]); D(" EB 0x%08x", q->bitmaps[EB]); D(" IR 0x%08x", q->bitmaps[IR]); D(" IB 0x%08x", q->bitmaps[IB]); dump_groups(q, 0xffffffff); }; #endif /* QFQ_DEBUG */ ipfw-user/sys/netinet/ipfw/ip_fw_pfil.c000644 000423 000000 00000026355 12007435564 020771 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2004 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw_pfil.c 227085 2011-11-04 16:24:19Z bz $"); #include "opt_ipfw.h" #include "opt_inet.h" #include "opt_inet6.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #include #include #include static VNET_DEFINE(int, fw_enable) = 1; #define V_fw_enable VNET(fw_enable) #ifdef INET6 static VNET_DEFINE(int, fw6_enable) = 1; #define V_fw6_enable VNET(fw6_enable) #endif int ipfw_chg_hook(SYSCTL_HANDLER_ARGS); /* Forward declarations. */ static int ipfw_divert(struct mbuf **, int, struct ipfw_rule_ref *, int); #ifdef SYSCTL_NODE SYSBEGIN(f1) SYSCTL_DECL(_net_inet_ip_fw); SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_enable), 0, ipfw_chg_hook, "I", "Enable ipfw"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6_fw); SYSCTL_VNET_PROC(_net_inet6_ip6_fw, OID_AUTO, enable, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw6_enable), 0, ipfw_chg_hook, "I", "Enable ipfw+6"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * The pfilter hook to pass packets to ipfw_chk and then to * dummynet, divert, netgraph or other modules. * The packet may be consumed. 
*/ int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir, struct inpcb *inp) { struct ip_fw_args args; struct m_tag *tag; int ipfw; int ret; /* all the processing now uses ip_len in net format */ if (mtod(*m0, struct ip *)->ip_v == 4) SET_NET_IPLEN(mtod(*m0, struct ip *)); /* convert dir to IPFW values */ dir = (dir == PFIL_IN) ? DIR_IN : DIR_OUT; bzero(&args, sizeof(args)); again: /* * extract and remove the tag if present. If we are left * with onepass, optimize the outgoing path. */ tag = m_tag_locate(*m0, MTAG_IPFW_RULE, 0, NULL); if (tag != NULL) { args.rule = *((struct ipfw_rule_ref *)(tag+1)); m_tag_delete(*m0, tag); if (args.rule.info & IPFW_ONEPASS) { if (mtod(*m0, struct ip *)->ip_v == 4) SET_HOST_IPLEN(mtod(*m0, struct ip *)); return (0); } } args.m = *m0; args.oif = dir == DIR_OUT ? ifp : NULL; args.inp = inp; ipfw = ipfw_chk(&args); *m0 = args.m; KASSERT(*m0 != NULL || ipfw == IP_FW_DENY, ("%s: m0 is NULL", __func__)); /* breaking out of the switch means drop */ ret = 0; /* default return value for pass */ switch (ipfw) { case IP_FW_PASS: /* next_hop may be set by ipfw_chk */ if (args.next_hop == NULL && args.next_hop6 == NULL) break; /* pass */ #if !defined(IPFIREWALL_FORWARD) || (!defined(INET6) && !defined(INET)) ret = EACCES; #else { struct m_tag *fwd_tag; size_t len; KASSERT(args.next_hop == NULL || args.next_hop6 == NULL, ("%s: both next_hop=%p and next_hop6=%p not NULL", __func__, args.next_hop, args.next_hop6)); #ifdef INET6 if (args.next_hop6 != NULL) len = sizeof(struct sockaddr_in6); #endif #ifdef INET if (args.next_hop != NULL) len = sizeof(struct sockaddr_in); #endif /* Incoming packets should not be tagged so we do not * m_tag_find. Outgoing packets may be tagged, so we * reuse the tag if present. */ fwd_tag = (dir == DIR_IN) ? 
NULL : m_tag_find(*m0, PACKET_TAG_IPFORWARD, NULL); if (fwd_tag != NULL) { m_tag_unlink(*m0, fwd_tag); } else { fwd_tag = m_tag_get(PACKET_TAG_IPFORWARD, len, M_NOWAIT); if (fwd_tag == NULL) { ret = EACCES; break; /* i.e. drop */ } } #ifdef INET6 if (args.next_hop6 != NULL) { bcopy(args.next_hop6, (fwd_tag+1), len); if (in6_localip(&args.next_hop6->sin6_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; } #endif #ifdef INET if (args.next_hop != NULL) { bcopy(args.next_hop, (fwd_tag+1), len); if (in_localip(args.next_hop->sin_addr)) (*m0)->m_flags |= M_FASTFWD_OURS; } #endif m_tag_prepend(*m0, fwd_tag); } #endif /* IPFIREWALL_FORWARD */ break; case IP_FW_DENY: ret = EACCES; break; /* i.e. drop */ case IP_FW_DUMMYNET: ret = EACCES; if (ip_dn_io_ptr == NULL) break; /* i.e. drop */ if (mtod(*m0, struct ip *)->ip_v == 4) ret = ip_dn_io_ptr(m0, dir, &args); else if (mtod(*m0, struct ip *)->ip_v == 6) ret = ip_dn_io_ptr(m0, dir | PROTO_IPV6, &args); else break; /* drop it */ /* * XXX should read the return value. * dummynet normally eats the packet and sets *m0=NULL * unless the packet can be sent immediately. In this * case args is updated and we should re-run the * check without clearing args. */ if (*m0 != NULL) goto again; break; case IP_FW_TEE: case IP_FW_DIVERT: if (ip_divert_ptr == NULL) { ret = EACCES; break; /* i.e. drop */ } ret = ipfw_divert(m0, dir, &args.rule, (ipfw == IP_FW_TEE) ? 1 : 0); /* continue processing for the original packet (tee). */ if (*m0) goto again; break; case IP_FW_NGTEE: case IP_FW_NETGRAPH: if (ng_ipfw_input_p == NULL) { ret = EACCES; break; /* i.e. drop */ } ret = ng_ipfw_input_p(m0, dir, &args, (ipfw == IP_FW_NGTEE) ? 
1 : 0); if (ipfw == IP_FW_NGTEE) /* ignore errors for NGTEE */ goto again; /* continue with packet */ break; case IP_FW_NAT: /* honor one-pass in case of successful nat */ if (V_fw_one_pass) break; /* ret is already 0 */ goto again; case IP_FW_REASS: goto again; /* continue with packet */ default: KASSERT(0, ("%s: unknown retval", __func__)); } if (ret != 0) { if (*m0) FREE_PKT(*m0); *m0 = NULL; } if (*m0 && mtod(*m0, struct ip *)->ip_v == 4) SET_HOST_IPLEN(mtod(*m0, struct ip *)); return ret; } /* do the divert, return 1 on error 0 on success */ static int ipfw_divert(struct mbuf **m0, int incoming, struct ipfw_rule_ref *rule, int tee) { /* * ipfw_chk() has already tagged the packet with the divert tag. * If tee is set, copy packet and return original. * If not tee, consume packet and send it to divert socket. */ struct mbuf *clone; struct ip *ip = mtod(*m0, struct ip *); struct m_tag *tag; /* Cloning needed for tee? */ if (tee == 0) { clone = *m0; /* use the original mbuf */ *m0 = NULL; } else { clone = m_dup(*m0, M_DONTWAIT); /* If we cannot duplicate the mbuf, we sacrifice the divert * chain and continue with the tee-ed packet. */ if (clone == NULL) return 1; } /* * Divert listeners can normally handle non-fragmented packets, * but we can only reass in the non-tee case. * This means that listeners on a tee rule may get fragments, * and have to live with that. * Note that we now have the 'reass' ipfw option so if we care * we can do it before a 'tee'. */ if (!tee) switch (ip->ip_v) { case IPVERSION: if (ntohs(ip->ip_off) & (IP_MF | IP_OFFMASK)) { int hlen; struct mbuf *reass; SET_HOST_IPLEN(ip); /* ip_reass wants host order */ reass = ip_reass(clone); /* Reassemble packet. */ if (reass == NULL) return 0; /* not an error */ /* if reass = NULL then it was consumed by ip_reass */ /* * IP header checksum fixup after reassembly and leave header * in network byte order. 
*/ ip = mtod(reass, struct ip *); hlen = ip->ip_hl << 2; SET_NET_IPLEN(ip); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(reass, hlen); clone = reass; } break; #ifdef INET6 case IPV6_VERSION >> 4: { struct ip6_hdr *const ip6 = mtod(clone, struct ip6_hdr *); if (ip6->ip6_nxt == IPPROTO_FRAGMENT) { int nxt, off; off = sizeof(struct ip6_hdr); nxt = frag6_input(&clone, &off, 0); if (nxt == IPPROTO_DONE) return (0); } break; } #endif } /* attach a tag to the packet with the reinject info */ tag = m_tag_alloc(MTAG_IPFW_RULE, 0, sizeof(struct ipfw_rule_ref), M_NOWAIT); if (tag == NULL) { FREE_PKT(clone); return 1; } *((struct ipfw_rule_ref *)(tag+1)) = *rule; m_tag_prepend(clone, tag); /* Do the dirty job... */ ip_divert_ptr(clone, incoming); return 0; } /* * attach or detach hooks for a given protocol family */ static int ipfw_hook(int onoff, int pf) { struct pfil_head *pfh; pfh = pfil_head_get(PFIL_TYPE_AF, pf); if (pfh == NULL) return ENOENT; (void) (onoff ? pfil_add_hook : pfil_remove_hook) (ipfw_check_hook, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh); return 0; } int ipfw_attach_hooks(int arg) { int error = 0; if (arg == 0) /* detach */ ipfw_hook(0, AF_INET); else if (V_fw_enable && ipfw_hook(1, AF_INET) != 0) { error = ENOENT; /* see ip_fw_pfil.c::ipfw_hook() */ printf("ipfw_hook() error\n"); } #ifdef INET6 if (arg == 0) /* detach */ ipfw_hook(0, AF_INET6); else if (V_fw6_enable && ipfw_hook(1, AF_INET6) != 0) { error = ENOENT; printf("ipfw6_hook() error\n"); } #endif return error; } int ipfw_chg_hook(SYSCTL_HANDLER_ARGS) { int enable; int oldenable; int error; int af; if (arg1 == &VNET_NAME(fw_enable)) { enable = V_fw_enable; af = AF_INET; } #ifdef INET6 else if (arg1 == &VNET_NAME(fw6_enable)) { enable = V_fw6_enable; af = AF_INET6; } #endif else return (EINVAL); oldenable = enable; error = sysctl_handle_int(oidp, &enable, 0, req); if (error) return (error); enable = (enable) ? 
1 : 0; if (enable == oldenable) return (0); error = ipfw_hook(enable, af); if (error) return (error); if (af == AF_INET) V_fw_enable = enable; #ifdef INET6 else if (af == AF_INET6) V_fw6_enable = enable; #endif return (0); } /* end of file */ ipfw-user/sys/netinet/ipfw/ip_dn_io.c000644 000423 000000 00000060660 12012141422 020407 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Dummynet portions related to packet handling. 
*/ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dn_io.c 227309 2011-11-07 15:43:11Z ed $"); #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include #include #include /* ip_len, ip_off */ #include /* ip_output(), IP_FORWARDING */ #include #include #include #include #include #include #include /* various ether_* routines */ #include /* for ip6_input, ip6_output prototypes */ #include /* * We keep a private variable for the simulation time, but we could * probably use an existing one ("softticks" in sys/kern/kern_timeout.c) * instead of dn_cfg.curr_time */ struct dn_parms dn_cfg; //VNET_DEFINE(struct dn_parms, _base_dn_cfg); static long tick_last; /* Last tick duration (usec). */ static long tick_delta; /* Last vs standard tick diff (usec). */ static long tick_delta_sum; /* Accumulated tick difference (usec).*/ static long tick_adjustment; /* Tick adjustments done. */ static long tick_lost; /* Lost(coalesced) ticks number. */ /* Adjusted vs non-adjusted curr_time difference (ticks). */ static long tick_diff; static unsigned long io_pkt; static unsigned long io_pkt_fast; static unsigned long io_pkt_drop; /* * We use a heap to store entities for which we have pending timer events. * The heap is checked at every tick and all entities with expired events * are extracted. */ MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap"); extern void (*bridge_dn_p)(struct mbuf *, struct ifnet *); #ifdef SYSCTL_NODE /* * Because of the way the SYSBEGIN/SYSEND macros work on other * platforms, there should not be functions between them. * So keep the handlers outside the block. 
*/

/*
 * Sysctl handler for the dummynet hash table size.
 * Exports dn_cfg.hash_size through sysctl_handle_int(); when the request
 * carries a new value (req->newptr != NULL) it is accepted only if it
 * lies in the range [16, 65536], otherwise EINVAL is returned and
 * dn_cfg.hash_size is left untouched.
 */
static int
sysctl_hash_size(SYSCTL_HANDLER_ARGS)
{
	int error, value;

	value = dn_cfg.hash_size;
	error = sysctl_handle_int(oidp, &value, 0, req);
	/* Stop on error, or when no new value was supplied. */
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (value < 16 || value > 65536)
		return (EINVAL);
	dn_cfg.hash_size = value;
	return (0);
}

/*
 * Sysctl handler shared by the two pipe queue limits.
 * arg2 selects the field: non-zero means dn_cfg.slot_limit (new values
 * must be >= 1), zero means dn_cfg.byte_limit (new values must be
 * >= 1500). The SYSCTL_PROC entries below pass 1 for pipe_slot_limit
 * and 0 for pipe_byte_limit. Returns EINVAL for an out-of-range value,
 * otherwise the result of sysctl_handle_long().
 */
static int
sysctl_limits(SYSCTL_HANDLER_ARGS)
{
	int error;
	long value;

	if (arg2 != 0)
		value = dn_cfg.slot_limit;
	else
		value = dn_cfg.byte_limit;
	error = sysctl_handle_long(oidp, &value, 0, req);

	/* Stop on error, or when no new value was supplied. */
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (arg2 != 0) {
		if (value < 1)
			return (EINVAL);
		dn_cfg.slot_limit = value;
	} else {
		if (value < 1500)
			return (EINVAL);
		dn_cfg.byte_limit = value;
	}
	return (0);
}

/* Only declarations between SYSBEGIN and SYSEND (see the note above). */
SYSBEGIN(f4)

SYSCTL_DECL(_net_inet);
SYSCTL_DECL(_net_inet_ip);
static SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");

/* wrapper to pass dn_cfg fields to SYSCTL_* */
//#define DC(x)	(&(VNET_NAME(_base_dn_cfg).x))
#define DC(x)	(&(dn_cfg.x))
/* parameters */

SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, hash_size,
    CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_hash_size,
    "I", "Default hash table size");

SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
    CTLTYPE_LONG | CTLFLAG_RW, 0, 1, sysctl_limits,
    "L", "Upper limit in slots for pipe queue.");
SYSCTL_PROC(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
    CTLTYPE_LONG | CTLFLAG_RW, 0, 0, sysctl_limits,
    "L", "Upper limit in bytes for pipe queue.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
    CTLFLAG_RW, DC(io_fast), 0, "Enable fast dummynet io.");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug,
    CTLFLAG_RW, DC(debug), 0, "Dummynet debug level");

/* RED parameters */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
    CTLFLAG_RD, DC(red_lookup_depth), 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
    CTLFLAG_RD, DC(red_avg_pkt_size), 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
    CTLFLAG_RD, DC(red_max_pkt_size), 0, "RED Max packet size");

/* time adjustment */
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
    CTLFLAG_RD, &tick_diff, 0,
    "Adjusted vs non-adjusted curr_time difference (ticks).");
SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
    CTLFLAG_RD, &tick_lost, 0,
    "Number of ticks coalesced by dummynet taskqueue.");

/* Drain parameters */
SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire,
    CTLFLAG_RW, DC(expire), 0, "Expire empty queues/pipes");
SYSCTL_UINT(_net_inet_ip_dummynet, OID_AUTO, expire_cycle,
    CTLFLAG_RD, DC(expire_cycle), 0, "Expire cycle for queues/pipes");

/* statistics */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, schk_count,
    CTLFLAG_RD, DC(schk_count), 0, "Number of schedulers");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, si_count,
    CTLFLAG_RD, DC(si_count), 0, "Number of scheduler instances");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, fsk_count,
    CTLFLAG_RD, DC(fsk_count), 0, "Number of flowsets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, queue_count,
    CTLFLAG_RD, DC(queue_count), 0, "Number of queues");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
    CTLFLAG_RD, &io_pkt, 0,
    "Number of packets passed to dummynet.");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
    CTLFLAG_RD, &io_pkt_fast, 0,
    "Number of packets bypassed dummynet scheduler.");
SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
    CTLFLAG_RD, &io_pkt_drop, 0,
    "Number of packets dropped by dummynet.");
#undef DC
SYSEND

#endif

/* Forward declaration: delivers a chain of packets leaving dummynet. */
static void	dummynet_send(struct mbuf *);

/*
 * Packets processed by dummynet have an mbuf tag associated with
 * them
that carries their dummynet state. * Outside dummynet, only the 'rule' field is relevant, and it must * be at the beginning of the structure. */ struct dn_pkt_tag { struct ipfw_rule_ref rule; /* matching rule */ /* second part, dummynet specific */ int dn_dir; /* action when packet comes out.*/ /* see ip_fw_private.h */ uint64_t output_time; /* when the pkt is due for delivery*/ struct ifnet *ifp; /* interface, for ip_output */ // struct _ip6dn_args ip6opt; /* XXX ipv6 options, 192 bytes */ }; /* * Return the mbuf tag holding the dummynet state (it should * be the first one on the list). */ static struct dn_pkt_tag * dn_tag_get(struct mbuf *m) { struct m_tag *mtag = m_tag_first(m); KASSERT(mtag != NULL && mtag->m_tag_cookie == MTAG_ABI_COMPAT && mtag->m_tag_id == PACKET_TAG_DUMMYNET, ("packet on dummynet queue w/o dummynet tag!")); return (struct dn_pkt_tag *)(mtag+1); } static inline void mq_append(struct mq *q, struct mbuf *m) { #ifdef USERSPACE // buffers from netmap need to be copied // XXX note that the routine is not expected to fail ND("append %p to %p", m, q); if (m->m_flags & M_STACK) { struct mbuf *m_new; void *p; int l, ofs; ofs = m->m_data - m->__m_extbuf; // XXX allocate MGETHDR(m_new, M_DONTWAIT, MT_DATA); ND("*** WARNING, volatile buf %p ext %p %d dofs %d m_new %p", m, m->__m_extbuf, m->__m_extlen, ofs, m_new); p = m_new->__m_extbuf; /* new pointer */ l = m_new->__m_extlen; /* new len */ if (l <= m->__m_extlen) { panic("extlen too large"); } *m_new = *m; // copy m_new->m_flags &= ~M_STACK; m_new->__m_extbuf = p; // point to new buffer pkt_copy(m->__m_extbuf, p, m->__m_extlen); m_new->m_data = p + ofs; m = m_new; } #endif /* USERSPACE */ if (q->head == NULL) q->head = m; else q->tail->m_nextpkt = m; q->count++; if (0 && q->count % 10000 == 0) D("count %p %d", q, q->count); q->tail = m; m->m_nextpkt = NULL; } /* * Dispose a list of packet. Use a functions so if we need to do * more work, this is a central point to do it. 
*/
void
dn_free_pkts(struct mbuf *mnext)
{
	struct mbuf *m;

	/* Walk the m_nextpkt chain, saving the successor before freeing. */
	while ((m = mnext) != NULL) {
		mnext = m->m_nextpkt;
		FREE_PKT(m);
	}
}

/*
 * RED / Gentle-RED drop decision for queue q and a packet of 'len' bytes.
 * Updates q->avg, q->count and q->random as a side effect.
 * Returns 1 if the packet should be dropped, 0 if it should be accepted.
 */
static int
red_drops (struct dn_queue *q, int len)
{
	/*
	 * RED algorithm
	 *
	 * RED calculates the average queue size (avg) using a low-pass filter
	 * with an exponential weighted (w_q) moving average:
	 * 	avg  <-  (1-w_q) * avg + w_q * q_size
	 * where q_size is the queue length (measured in bytes or * packets).
	 *
	 * If q_size == 0, we compute the idle time for the link, and set
	 *	avg = (1 - w_q)^(idle/s)
	 * where s is the time needed for transmitting a medium-sized packet.
	 *
	 * Now, if avg < min_th the packet is enqueued.
	 * If avg > max_th the packet is dropped. Otherwise, the packet is
	 * dropped with probability P function of avg.
	 */

	struct dn_fsk *fs = q->fs;
	int64_t p_b = 0;

	/* Queue in bytes or packets? */
	uint32_t q_size = (fs->fs.flags & DN_QSIZE_BYTES) ?
	    q->ni.len_bytes : q->ni.length;

	/* Average queue size estimation. */
	if (q_size != 0) {
		/* Queue is not empty, avg <- avg + (q_size - avg) * w_q */
		int diff = SCALE(q_size) - q->avg;
		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);

		q->avg += (int)v;
	} else {
		/*
		 * Queue is empty, find for how long the queue has been
		 * empty and use a lookup table for computing
		 * (1 - w_q)^(idle_time/s) where s is the time to send a
		 * (small) packet.
		 * XXX check wraps...
		 */
		if (q->avg) {
			u_int t = div64((dn_cfg.curr_time - q->q_time), fs->lookup_step);

			/* Idle longer than the table covers: avg decays to 0. */
			q->avg = (t < fs->lookup_depth) ?
			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
		}
	}

	/* Should I drop? */
	if (q->avg < fs->min_th) {
		q->count = -1;
		return (0);	/* accept packet */
	}
	if (q->avg >= fs->max_th) {	/* average queue >=  max threshold */
		if (fs->fs.flags & DN_IS_GENTLE_RED) {
			/*
			 * According to Gentle-RED, if avg is greater than
			 * max_th the packet is dropped with a probability
			 *	 p_b = c_3 * avg - c_4
			 * where c_3 = (1 - max_p) / max_th
			 *       c_4 = 1 - 2 * max_p
			 */
			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) -
			    fs->c_4;
		} else {
			/* Plain RED: always drop above max_th. */
			q->count = -1;
			return (1);
		}
	} else if (q->avg > fs->min_th) {
		/*
		 * We compute p_b using the linear dropping function
		 *	 p_b = c_1 * avg - c_2
		 * where c_1 = max_p / (max_th - min_th)
		 * 	 c_2 = max_p * min_th / (max_th - min_th)
		 */
		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
	}

	/* In byte mode, weight the probability by the packet size. */
	if (fs->fs.flags & DN_QSIZE_BYTES)
		p_b = div64((p_b * len) , fs->max_pkt_size);
	/* count was -1: first packet above min_th, just pick a threshold. */
	if (++q->count == 0)
		q->random = random() & 0xffff;
	else {
		/*
		 * q->count counts packets arrived since last drop, so a greater
		 * value of q->count means a greater packet drop probability.
		 */
		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
			q->count = 0;
			/* After a drop we calculate a new random value. */
			q->random = random() & 0xffff;
			return (1);	/* drop */
		}
	}
	/* End of RED algorithm. */

	return (0);	/* accept */

}

/*
 * Enqueue a packet in q, subject to space and queue management policy
 * (whose parameters are in q->fs).
 * Update stats for the queue and the scheduler.
 * Return 0 on success, 1 on drop. The packet is consumed anyways.
 *
 * A non-zero 'drop' argument forces the packet to be dropped after the
 * totals have been updated. Every drop taken through the 'drop' label
 * also increments the global io_pkt_drop counter.
 */
int
dn_enqueue(struct dn_queue *q, struct mbuf* m, int drop)
{
	struct dn_fs *f;
	struct dn_flow *ni;	/* stats for scheduler instance */
	uint64_t len;

	/* Queue not attached to a flowset/scheduler instance: free, no stats. */
	if (q->fs == NULL || q->_si == NULL) {
		printf("%s fs %p si %p, dropping\n",
			__FUNCTION__, q->fs, q->_si);
		FREE_PKT(m);
		return 1;
	}
	f = &(q->fs->fs);
	ni = &q->_si->ni;
	len = m->m_pkthdr.len;
	/* Update statistics, then check reasons to drop pkt. */
	q->ni.tot_bytes += len;
	q->ni.tot_pkts++;
	ni->tot_bytes += len;
	ni->tot_pkts++;
	if (drop)
		goto drop;
	/* random loss: plr is compared directly against random() */
	if (f->plr && random() < f->plr)
		goto drop;
	if (f->flags & DN_IS_RED && red_drops(q, m->m_pkthdr.len))
		goto drop;
	/* queue size limit, in bytes or in packets depending on the flowset */
	if (f->flags & DN_QSIZE_BYTES) {
		if (q->ni.len_bytes > f->qsize)
			goto drop;
	} else if (q->ni.length >= f->qsize) {
		goto drop;
	}
	mq_append(&q->mq, m);
	q->ni.length++;
	q->ni.len_bytes += len;
	ni->length++;
	ni->len_bytes += len;
	return 0;

drop:
	io_pkt_drop++;
	q->ni.drops++;
	ni->drops++;
	FREE_PKT(m);
	return 1;
}

/*
 * Fetch packets from the delay line which are due now. If there are
 * leftover packets, reinsert the delay line in the heap.
 * Runs under scheduler lock.
 *
 * Due packets (output_time <= now) are moved from dline->mq to q.
 * dline->oid.subtype records whether the delay line is in the event heap.
 */
static void
transmit_event(struct mq *q, struct delay_line *dline, uint64_t now)
{
	struct mbuf *m;
	struct dn_pkt_tag *pkt = NULL;

	dline->oid.subtype = 0; /* not in heap */
	while ((m = dline->mq.head) != NULL) {
		pkt = dn_tag_get(m);
		if (!DN_KEY_LEQ(pkt->output_time, now))
			break;
		dline->mq.head = m->m_nextpkt;
		dline->mq.count--;
		mq_append(q, m);
	}
	/* m != NULL: the loop stopped at a packet not yet due (pkt is its tag) */
	if (m != NULL) {
		dline->oid.subtype = 1; /* in heap */
		heap_insert(&dn_cfg.evheap, pkt->output_time, dline);
	}
}

/*
 * Convert the additional MAC overheads/delays into an equivalent
 * number of bits for the given data rate. The samples are
 * in milliseconds so we need to divide by 1000.
 *
 * Returns 0 when the scheduler has no profile or the profile has no
 * samples. A randomly chosen sample index at or above pf->loss_level
 * additionally marks the packet for dropping (dn_dir = DIR_DROP).
 */
static uint64_t
extra_bits(struct mbuf *m, struct dn_schk *s)
{
	int index;
	uint64_t bits;
	struct dn_profile *pf = s->profile;

	if (!pf || pf->samples_no == 0)
		return 0;
	index  = random() % pf->samples_no;
	bits = div64((uint64_t)pf->samples[index] * s->link.bandwidth, 1000);
	if (index >= pf->loss_level) {
		struct dn_pkt_tag *dt = dn_tag_get(m);
		if (dt)
			dt->dn_dir = DIR_DROP;
	}
	return bits;
}

/*
 * Send traffic from a scheduler instance due by 'now'.
 * Return a pointer to the head of the queue.
*/ static struct mbuf * serve_sched(struct mq *q, struct dn_sch_inst *si, uint64_t now) { struct mq def_q; struct dn_schk *s = si->sched; struct mbuf *m = NULL; int delay_line_idle = (si->dline.mq.head == NULL); int done, bw; if (q == NULL) { q = &def_q; q->head = NULL; } bw = s->link.bandwidth; si->kflags &= ~DN_ACTIVE; if (bw > 0) si->credit += (now - si->sched_time) * bw; else si->credit = 0; si->sched_time = now; done = 0; while (si->credit >= 0 && (m = s->fp->dequeue(si)) != NULL) { uint64_t len_scaled; done++; len_scaled = (bw == 0) ? 0 : hz * (m->m_pkthdr.len * 8 + extra_bits(m, s)); si->credit -= len_scaled; /* Move packet in the delay line */ dn_tag_get(m)->output_time = dn_cfg.curr_time + s->link.delay ; mq_append(&si->dline.mq, m); } /* * If credit >= 0 the instance is idle, mark time. * Otherwise put back in the heap, and adjust the output * time of the last inserted packet, m, which was too early. */ if (si->credit >= 0) { si->idle_time = now; } else { uint64_t t; KASSERT (bw > 0, ("bw=0 and credit<0 ?")); t = div64(bw - 1 - si->credit, bw); if (m) dn_tag_get(m)->output_time += t; si->kflags |= DN_ACTIVE; heap_insert(&dn_cfg.evheap, now + t, si); } if (delay_line_idle && done) transmit_event(q, &si->dline, now); return q->head; } /* * The timer handler for dummynet. Time is computed in ticks, but * but the code is tolerant to the actual rate at which this is called. * Once complete, the function reschedules itself for the next tick. */ void dummynet_task(void *context, int pending) { struct timeval t; struct mq q = { NULL, NULL }; /* queue to accumulate results */ CURVNET_SET((struct vnet *)context); DN_BH_WLOCK(); /* Update number of lost(coalesced) ticks. */ tick_lost += pending - 1; getmicrouptime(&t); /* Last tick duration (usec). */ tick_last = (t.tv_sec - dn_cfg.prev_t.tv_sec) * 1000000 + (t.tv_usec - dn_cfg.prev_t.tv_usec); /* Last tick vs standard tick difference (usec). 
*/ tick_delta = (tick_last * hz - 1000000) / hz; /* Accumulated tick difference (usec). */ tick_delta_sum += tick_delta; dn_cfg.prev_t = t; /* * Adjust curr_time if the accumulated tick difference is * greater than the 'standard' tick. Since curr_time should * be monotonically increasing, we do positive adjustments * as required, and throttle curr_time in case of negative * adjustment. */ dn_cfg.curr_time++; if (tick_delta_sum - tick >= 0) { int diff = tick_delta_sum / tick; dn_cfg.curr_time += diff; tick_diff += diff; tick_delta_sum %= tick; tick_adjustment++; } else if (tick_delta_sum + tick <= 0) { dn_cfg.curr_time--; tick_diff--; tick_delta_sum += tick; tick_adjustment++; } /* serve pending events, accumulate in q */ for (;;) { struct dn_id *p; /* generic parameter to handler */ if (dn_cfg.evheap.elements == 0 || DN_KEY_LT(dn_cfg.curr_time, HEAP_TOP(&dn_cfg.evheap)->key)) break; p = HEAP_TOP(&dn_cfg.evheap)->object; heap_extract(&dn_cfg.evheap, NULL); if (p->type == DN_SCH_I) { serve_sched(&q, (struct dn_sch_inst *)p, dn_cfg.curr_time); } else { /* extracted a delay line */ transmit_event(&q, (struct delay_line *)p, dn_cfg.curr_time); } } if (dn_cfg.expire && ++dn_cfg.expire_cycle >= dn_cfg.expire) { dn_cfg.expire_cycle = 0; dn_drain_scheduler(); dn_drain_queue(); } DN_BH_WUNLOCK(); dn_reschedule(); if (q.head != NULL) dummynet_send(q.head); CURVNET_RESTORE(); } /* * forward a chain of packets to the proper destination. * This runs outside the dummynet lock. */ static void dummynet_send(struct mbuf *m) { struct mbuf *n; for (; m != NULL; m = n) { struct ifnet *ifp = NULL; /* gcc 3.4.6 complains */ struct m_tag *tag; int dst; n = m->m_nextpkt; m->m_nextpkt = NULL; tag = m_tag_first(m); if (tag == NULL) { /* should not happen */ dst = DIR_DROP; } else { struct dn_pkt_tag *pkt = dn_tag_get(m); /* extract the dummynet info, rename the tag * to carry reinject info. 
*/ dst = pkt->dn_dir; ifp = pkt->ifp; tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; } switch (dst) { case DIR_OUT: SET_HOST_IPLEN(mtod(m, struct ip *)); ip_output(m, NULL, NULL, IP_FORWARDING, NULL, NULL); break ; case DIR_IN : /* put header in network format for ip_input() */ //SET_NET_IPLEN(mtod(m, struct ip *)); netisr_dispatch(NETISR_IP, m); break; #ifdef INET6 case DIR_IN | PROTO_IPV6: netisr_dispatch(NETISR_IPV6, m); break; case DIR_OUT | PROTO_IPV6: ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL); break; #endif case DIR_FWD | PROTO_IFB: /* DN_TO_IFB_FWD: */ if (bridge_dn_p != NULL) ((*bridge_dn_p)(m, ifp)); else printf("dummynet: if_bridge not loaded\n"); break; case DIR_IN | PROTO_LAYER2: /* DN_TO_ETH_DEMUX: */ /* * The Ethernet code assumes the Ethernet header is * contiguous in the first mbuf header. * Insure this is true. */ if (m->m_len < ETHER_HDR_LEN && (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) { printf("dummynet/ether: pullup failed, " "dropping packet\n"); break; } ether_demux(m->m_pkthdr.rcvif, m); break; case DIR_OUT | PROTO_LAYER2: /* N_TO_ETH_OUT: */ ether_output_frame(ifp, m); break; case DIR_DROP: /* drop the packet after some time */ FREE_PKT(m); break; default: printf("dummynet: bad switch %d!\n", dst); FREE_PKT(m); break; } } } static inline int tag_mbuf(struct mbuf *m, int dir, struct ip_fw_args *fwa) { struct dn_pkt_tag *dt; struct m_tag *mtag; mtag = m_tag_get(PACKET_TAG_DUMMYNET, sizeof(*dt), M_NOWAIT | M_ZERO); if (mtag == NULL) return 1; /* Cannot allocate packet header. */ m_tag_prepend(m, mtag); /* Attach to mbuf chain. */ dt = (struct dn_pkt_tag *)(mtag + 1); dt->rule = fwa->rule; dt->rule.info &= IPFW_ONEPASS; /* only keep this info */ dt->dn_dir = dir; dt->ifp = fwa->oif; /* dt->output tame is updated as we move through */ dt->output_time = dn_cfg.curr_time; return 0; } /* * dummynet hook for packets. * We use the argument to locate the flowset fs and the sched_set sch * associated to it. 
The we apply flow_mask and sched_mask to * determine the queue and scheduler instances. * * dir where shall we send the packet after dummynet. * *m0 the mbuf with the packet * ifp the 'ifp' parameter from the caller. * NULL in ip_input, destination interface in ip_output, */ int dummynet_io(struct mbuf **m0, int dir, struct ip_fw_args *fwa) { struct mbuf *m = *m0; struct dn_fsk *fs = NULL; struct dn_sch_inst *si; struct dn_queue *q = NULL; /* default */ int fs_id = (fwa->rule.info & IPFW_INFO_MASK) + ((fwa->rule.info & IPFW_IS_PIPE) ? 2*DN_MAX_ID : 0); DN_BH_WLOCK(); io_pkt++; /* we could actually tag outside the lock, but who cares... */ if (tag_mbuf(m, dir, fwa)) goto dropit; if (dn_cfg.busy) { /* if the upper half is busy doing something expensive, * lets queue the packet and move forward */ mq_append(&dn_cfg.pending, m); m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* XXX locate_flowset could be optimised with a direct ref. */ fs = dn_ht_find(dn_cfg.fshash, fs_id, 0, NULL); if (fs == NULL) goto dropit; /* This queue/pipe does not exist! */ if (fs->sched == NULL) /* should not happen */ goto dropit; /* find scheduler instance, possibly applying sched_mask */ si = ipdn_si_find(fs->sched, &(fwa->f_id)); if (si == NULL) goto dropit; /* * If the scheduler supports multiple queues, find the right one * (otherwise it will be ignored by enqueue). */ if (fs->sched->fp->flags & DN_MULTIQUEUE) { q = ipdn_q_find(fs, si, &(fwa->f_id)); if (q == NULL) goto dropit; } if (fs->sched->fp->enqueue(si, q, m)) { /* packet was dropped by enqueue() */ m = *m0 = NULL; goto dropit; } if (si->kflags & DN_ACTIVE) { m = *m0 = NULL; /* consumed */ goto done; /* already active, nothing to do */ } /* compute the initial allowance */ if (si->idle_time < dn_cfg.curr_time) { /* Do this only on the first packet on an idle pipe */ struct dn_link *p = &fs->sched->link; si->sched_time = dn_cfg.curr_time; si->credit = dn_cfg.io_fast ? 
p->bandwidth : 0; if (p->burst) { uint64_t burst = (dn_cfg.curr_time - si->idle_time) * p->bandwidth; if (burst > p->burst) burst = p->burst; si->credit += burst; } } /* pass through scheduler and delay line */ m = serve_sched(NULL, si, dn_cfg.curr_time); /* optimization -- pass it back to ipfw for immediate send */ /* XXX Don't call dummynet_send() if scheduler return the packet * just enqueued. This avoid a lock order reversal. * */ if (/*dn_cfg.io_fast &&*/ m == *m0 && (dir & PROTO_LAYER2) == 0 ) { /* fast io, rename the tag * to carry reinject info. */ struct m_tag *tag = m_tag_first(m); tag->m_tag_cookie = MTAG_IPFW_RULE; tag->m_tag_id = 0; io_pkt_fast++; if (m->m_nextpkt != NULL) { printf("dummynet: fast io: pkt chain detected!\n"); m->m_nextpkt = NULL; } m = NULL; } else { *m0 = NULL; } done: DN_BH_WUNLOCK(); if (m) dummynet_send(m); return 0; dropit: io_pkt_drop++; DN_BH_WUNLOCK(); if (m) FREE_PKT(m); *m0 = NULL; return (fs && (fs->fs.flags & DN_NOERROR)) ? 0 : ENOBUFS; } ipfw-user/sys/netinet/ipfw/ip_fw2.c000644 000423 000000 00000221375 12007720524 020031 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_fw2.c 238977 2012-08-01 18:49:00Z luigi $"); /* * The FreeBSD IP packet firewall, main file */ #include "opt_ipfw.h" #include "opt_ipdivert.h" #include "opt_inet.h" #ifndef INET #error "IPFIREWALL requires INET" #endif /* INET */ #include "opt_inet6.h" #include "opt_ipsec.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for ETHERTYPE_IP */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #endif #include /* XXX for in_cksum */ #ifdef MAC #include #endif /* * static variables followed by global ones. * All ipfw global variables are here. 
*/ /* ipfw_vnet_ready controls when we are open for business */ static VNET_DEFINE(int, ipfw_vnet_ready) = 0; #define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) static VNET_DEFINE(int, fw_deny_unknown_exthdrs); #define V_fw_deny_unknown_exthdrs VNET(fw_deny_unknown_exthdrs) static VNET_DEFINE(int, fw_permit_single_frag6) = 1; #define V_fw_permit_single_frag6 VNET(fw_permit_single_frag6) #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT static int default_to_accept = 1; #else static int default_to_accept; #endif VNET_DEFINE(int, autoinc_step); VNET_DEFINE(int, fw_one_pass) = 1; VNET_DEFINE(unsigned int, fw_tables_max); /* Use 128 tables by default */ static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT; /* * Each rule belongs to one of 32 different sets (0..31). * The variable set_disable contains one bit per set. * If the bit is set, all rules in the corresponding set * are disabled. Set RESVD_SET(31) is reserved for the default rule * and rules that are not deleted by the flush command, * and CANNOT be disabled. * Rules in set RESVD_SET can only be deleted individually. */ VNET_DEFINE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DEFINE(int, fw_verbose); /* counter for ipfw_log(NULL...) 
*/ VNET_DEFINE(u_int64_t, norule_counter); VNET_DEFINE(int, verbose_limit); /* layer3_chain contains the list of rules for layer 3 */ VNET_DEFINE(struct ip_fw_chain, layer3_chain); ipfw_nat_t *ipfw_nat_ptr = NULL; struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_del_ptr; ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #ifdef SYSCTL_NODE uint32_t dummy_def = IPFW_DEFAULT_RULE; static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS); SYSBEGIN(f3) SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, one_pass, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0, "Only do a single pass through ipfw when using dummynet(4)"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step, CTLFLAG_RW, &VNET_NAME(autoinc_step), 0, "Rule number auto-increment step"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0, "Log matches to ipfw rules"); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit, CTLFLAG_RW, &VNET_NAME(verbose_limit), 0, "Set upper limit of matches of ipfw rules logged"); SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD, &dummy_def, 0, "The default/max possible rule number."); SYSCTL_VNET_PROC(_net_inet_ip_fw, OID_AUTO, tables_max, CTLTYPE_UINT|CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU", "Maximum number of tables"); SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN, &default_to_accept, 0, "Make the default rule accept all packets."); TUNABLE_INT("net.inet.ip.fw.default_to_accept", &default_to_accept); TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables); SYSCTL_VNET_INT(_net_inet_ip_fw, OID_AUTO, static_count, CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0, "Number of static rules"); #ifdef INET6 SYSCTL_DECL(_net_inet6_ip6); SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, 
"Firewall"); SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs, CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_deny_unknown_exthdrs), 0, "Deny packets with unknown IPv6 Extension Headers"); SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6, CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(fw_permit_single_frag6), 0, "Permit single packet IPv6 fragments"); #endif /* INET6 */ SYSEND #endif /* SYSCTL_NODE */ /* * Some macros used in the various matching options. * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T * Other macros just cast void * into the appropriate type */ #define L3HDR(T, ip) ((T *)((u_int32_t *)(ip) + (ip)->ip_hl)) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) #define ICMP(p) ((struct icmphdr *)(p)) #define ICMP6(p) ((struct icmp6_hdr *)(p)) static __inline int icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd) { int type = icmp->icmp_type; return (type <= ICMP_MAXTYPE && (cmd->d[0] & (1<icmp_type; return (type <= ICMP_MAXTYPE && (TT & (1<arg1 or cmd->d[0]. * * We scan options and store the bits we find set. We succeed if * * (want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear * * The code is sometimes optimized not to store additional variables. 
*/ static int flags_match(ipfw_insn *cmd, u_int8_t bits) { u_char want_clear; bits = ~bits; if ( ((cmd->arg1 & 0xff) & bits) != 0) return 0; /* some bits we want set were clear */ want_clear = (cmd->arg1 >> 8) & 0xff; if ( (want_clear & bits) != want_clear) return 0; /* some bits we want clear were set */ return 1; } static int ipopts_match(struct ip *ip, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(ip + 1); int x = (ip->ip_hl << 2) - sizeof (struct ip); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[IPOPT_OPTVAL]; if (opt == IPOPT_EOL) break; if (opt == IPOPT_NOP) optlen = 1; else { optlen = cp[IPOPT_OLEN]; if (optlen <= 0 || optlen > x) return 0; /* invalid or truncated */ } switch (opt) { default: break; case IPOPT_LSRR: bits |= IP_FW_IPOPT_LSRR; break; case IPOPT_SSRR: bits |= IP_FW_IPOPT_SSRR; break; case IPOPT_RR: bits |= IP_FW_IPOPT_RR; break; case IPOPT_TS: bits |= IP_FW_IPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd) { int optlen, bits = 0; u_char *cp = (u_char *)(tcp + 1); int x = (tcp->th_off << 2) - sizeof(struct tcphdr); for (; x > 0; x -= optlen, cp += optlen) { int opt = cp[0]; if (opt == TCPOPT_EOL) break; if (opt == TCPOPT_NOP) optlen = 1; else { optlen = cp[1]; if (optlen <= 0) break; } switch (opt) { default: break; case TCPOPT_MAXSEG: bits |= IP_FW_TCPOPT_MSS; break; case TCPOPT_WINDOW: bits |= IP_FW_TCPOPT_WINDOW; break; case TCPOPT_SACK_PERMITTED: case TCPOPT_SACK: bits |= IP_FW_TCPOPT_SACK; break; case TCPOPT_TIMESTAMP: bits |= IP_FW_TCPOPT_TS; break; } } return (flags_match(cmd, bits)); } static int iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain, uint32_t *tablearg) { if (ifp == NULL) /* no iface with this packet, match fails */ return 0; /* Check by name or by IP address */ if (cmd->name[0] != '\0') { /* match by name */ if (cmd->name[0] == '\1') /* use tablearg to match */ return ipfw_lookup_table_extended(chain, 
cmd->p.glob, ifp->if_xname, tablearg, IPFW_TABLE_INTERFACE); /* Check name */ if (cmd->p.glob) { if (fnmatch(cmd->name, ifp->if_xname, 0) == 0) return(1); } else { if (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0) return(1); } } else { #if !defined(USERSPACE) && defined(__FreeBSD__) /* and OSX too ? */ struct ifaddr *ia; if_addr_rlock(ifp); TAILQ_FOREACH(ia, &ifp->if_addrhead, ifa_link) { if (ia->ifa_addr->sa_family != AF_INET) continue; if (cmd->p.ip.s_addr == ((struct sockaddr_in *) (ia->ifa_addr))->sin_addr.s_addr) { if_addr_runlock(ifp); return(1); /* match */ } } if_addr_runlock(ifp); #endif /* __FreeBSD__ */ } return(0); /* no match, fail ... */ } /* * The verify_path function checks if a route to the src exists and * if it is reachable via ifp (when provided). * * The 'verrevpath' option checks that the interface that an IP packet * arrives on is the same interface that traffic destined for the * packet's source address would be routed out of. * The 'versrcreach' option just checks that the source address is * reachable via any route (except default) in the routing table. * These two are a measure to block forged packets. This is also * commonly known as "anti-spoofing" or Unicast Reverse Path * Forwarding (Unicast RFP) in Cisco-ese. The name of the knobs * is purposely reminiscent of the Cisco IOS command, * * ip verify unicast reverse-path * ip verify unicast source reachable-via any * * which implements the same functionality. But note that the syntax * is misleading, and the check may be performed on all IP packets * whether unicast, multicast, or broadcast. 
*/ static int verify_path(struct in_addr src, struct ifnet *ifp, u_int fib) { #if defined(USERSPACE) || !defined(__FreeBSD__) return 0; #else struct route ro; struct sockaddr_in *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in *)&(ro.ro_dst); dst->sin_family = AF_INET; dst->sin_len = sizeof(*dst); dst->sin_addr = src; in_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * If ifp is provided, check for equality with rtentry. * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * in order to pass packets injected back by if_simloop(): * if useloopback == 1 routing entry (via lo0) for our own address * may exist, so we need to handle routing assymetry. */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && satosin(rt_key(ro.ro_rt))->sin_addr.s_addr == INADDR_ANY) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; #endif /* __FreeBSD__ */ } #ifdef INET6 /* * ipv6 specific rules here... 
*/ static __inline int icmp6type_match (int type, ipfw_insn_u32 *cmd) { return (type <= ICMP6_MAXTYPE && (cmd->d[type/32] & (1<<(type%32)) ) ); } static int flow6id_match( int curr_flow, ipfw_insn_u32 *cmd ) { int i; for (i=0; i <= cmd->o.arg1; ++i ) if (curr_flow == cmd->d[i] ) return 1; return 0; } /* support for IP6_*_ME opcodes */ static int search_ip6_addr_net (struct in6_addr * ip6_addr) { struct ifnet *mdc; struct ifaddr *mdc2; struct in6_ifaddr *fdm; struct in6_addr copia; TAILQ_FOREACH(mdc, &V_ifnet, if_link) { if_addr_rlock(mdc); TAILQ_FOREACH(mdc2, &mdc->if_addrhead, ifa_link) { if (mdc2->ifa_addr->sa_family == AF_INET6) { fdm = (struct in6_ifaddr *)mdc2; copia = fdm->ia_addr.sin6_addr; /* need for leaving scope_id in the sock_addr */ in6_clearscope(&copia); if (IN6_ARE_ADDR_EQUAL(ip6_addr, &copia)) { if_addr_runlock(mdc); return 1; } } } if_addr_runlock(mdc); } return 0; } static int verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib) { struct route_in6 ro; struct sockaddr_in6 *dst; bzero(&ro, sizeof(ro)); dst = (struct sockaddr_in6 * )&(ro.ro_dst); dst->sin6_family = AF_INET6; dst->sin6_len = sizeof(*dst); dst->sin6_addr = *src; in6_rtalloc_ign(&ro, 0, fib); if (ro.ro_rt == NULL) return 0; /* * if ifp is provided, check for equality with rtentry * We should use rt->rt_ifa->ifa_ifp, instead of rt->rt_ifp, * to support the case of sending packets to an address of our own. 
* (where the former interface is the first argument of if_simloop() * (=ifp), the latter is lo0) */ if (ifp != NULL && ro.ro_rt->rt_ifa->ifa_ifp != ifp) { RTFREE(ro.ro_rt); return 0; } /* if no ifp provided, check if rtentry is not default route */ if (ifp == NULL && IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(ro.ro_rt))->sin6_addr)) { RTFREE(ro.ro_rt); return 0; } /* or if this is a blackhole/reject route */ if (ifp == NULL && ro.ro_rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { RTFREE(ro.ro_rt); return 0; } /* found valid route */ RTFREE(ro.ro_rt); return 1; } static int is_icmp6_query(int icmp6_type) { if ((icmp6_type <= ICMP6_MAXTYPE) && (icmp6_type == ICMP6_ECHO_REQUEST || icmp6_type == ICMP6_MEMBERSHIP_QUERY || icmp6_type == ICMP6_WRUREQUEST || icmp6_type == ICMP6_FQDN_QUERY || icmp6_type == ICMP6_NI_QUERY)) return (1); return (0); } static void send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6) { struct mbuf *m; m = args->m; if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) { struct tcphdr *tcp; tcp = (struct tcphdr *)((char *)ip6 + hlen); if ((tcp->th_flags & TH_RST) == 0) { struct mbuf *m0; m0 = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m0 != NULL) ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL); } FREE_PKT(m); } else if (code != ICMP6_UNREACH_RST) { /* Send an ICMPv6 unreach. */ #if 0 /* * Unlike above, the mbufs need to line up with the ip6 hdr, * as the contents are read. We need to m_adj() the * needed amount. * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif icmp6_error(m, ICMP6_DST_UNREACH, code, 0); } else FREE_PKT(m); args->m = NULL; } #endif /* INET6 */ /* * sends a reject message, consuming the mbuf passed as an argument. 
*/ static void send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip) { #if 0 /* XXX When ip is not guaranteed to be at mtod() we will * need to account for this */ * The mbuf will however be thrown away so we can adjust it. * Remember we did an m_pullup on it already so we * can make some assumptions about contiguousness. */ if (args->L3offset) m_adj(m, args->L3offset); #endif if (code != ICMP_REJECT_RST) { /* Send an ICMP unreach */ /* We need the IP header in host order for icmp_error(). */ SET_HOST_IPLEN(ip); icmp_error(args->m, ICMP_UNREACH, code, 0L, 0); } else if (args->f_id.proto == IPPROTO_TCP) { struct tcphdr *const tcp = L3HDR(struct tcphdr, mtod(args->m, struct ip *)); if ( (tcp->th_flags & TH_RST) == 0) { struct mbuf *m; m = ipfw_send_pkt(args->m, &(args->f_id), ntohl(tcp->th_seq), ntohl(tcp->th_ack), tcp->th_flags | TH_RST); if (m != NULL) ip_output(m, NULL, NULL, 0, NULL, NULL); } FREE_PKT(args->m); } else FREE_PKT(args->m); args->m = NULL; } /* * Support for uid/gid/jail lookup. These tests are expensive * (because we may need to look into the list of active sockets) * so we cache the results. ugid_lookupp is 0 if we have not * yet done a lookup, 1 if we succeeded, and -1 if we tried * and failed. The function always returns the match value. * We could actually spare the variable and use *uc, setting * it to '(void *)check_uidgid if we have no info, NULL if * we tried and failed, or any other value if successful. 
 */
static int
check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
    struct ucred **uc)
{
#if defined(USERSPACE)
	return 0;	// not supported in userspace
#else
#ifndef __FreeBSD__
	/*
	 * XXX
	 * NOTE(review): proto, oif, dst_ip, dst_port, src_ip, src_port and
	 * inp are not declared in this branch -- it cannot compile as
	 * written; confirm whether a non-FreeBSD kernel build is still
	 * expected to work.
	 */
	return cred_check(insn, proto, oif,
	    dst_ip, dst_port, src_ip, src_port,
	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
#else  /* FreeBSD */
	struct in_addr src_ip, dst_ip;
	struct inpcbinfo *pi;
	struct ipfw_flow_id *id;
	struct inpcb *pcb, *inp;
	struct ifnet *oif;
	int lookupflags;
	int match;

	id = &args->f_id;
	inp = args->inp;
	oif = args->oif;

	/*
	 * Check to see if the UDP or TCP stack supplied us with
	 * the PCB. If so, rather than holding a lock and looking
	 * up the PCB, we can use the one that was supplied.
	 *
	 * The credential reference taken with crhold() is stored in *uc
	 * and is not released in this function -- presumably the caller
	 * drops it; verify.
	 */
	if (inp && *ugid_lookupp == 0) {
		INP_LOCK_ASSERT(inp);
		if (inp->inp_socket != NULL) {
			*uc = crhold(inp->inp_cred);
			*ugid_lookupp = 1;
		} else
			*ugid_lookupp = -1;
	}
	/*
	 * If we have already been here and the packet has no
	 * PCB entry associated with it, then we can safely
	 * assume that this is a no match.
	 */
	if (*ugid_lookupp == -1)
		return (0);
	/* Only TCP and UDP have a PCB table to search. */
	if (id->proto == IPPROTO_TCP) {
		lookupflags = 0;
		pi = &V_tcbinfo;
	} else if (id->proto == IPPROTO_UDP) {
		lookupflags = INPLOOKUP_WILDCARD;
		pi = &V_udbinfo;
	} else
		return 0;
	/* ask for the PCB to be returned read-locked (unlocked below) */
	lookupflags |= INPLOOKUP_RLOCKPCB;
	match = 0;
	if (*ugid_lookupp == 0) {
		/*
		 * No cached result yet: search the PCB table.  The
		 * address/port pairs are passed in opposite order for
		 * incoming (oif == NULL) and outgoing packets.
		 */
		if (id->addr_type == 6) {
#ifdef INET6
			if (oif == NULL)
				pcb = in6_pcblookup_mbuf(pi,
				    &id->src_ip6, htons(id->src_port),
				    &id->dst_ip6, htons(id->dst_port),
				    lookupflags, oif, args->m);
			else
				pcb = in6_pcblookup_mbuf(pi,
				    &id->dst_ip6, htons(id->dst_port),
				    &id->src_ip6, htons(id->src_port),
				    lookupflags, oif, args->m);
#else
			/* IPv6 flow but no IPv6 support: remember the failure */
			*ugid_lookupp = -1;
			return (0);
#endif
		} else {
			/* f_id holds addresses in host order */
			src_ip.s_addr = htonl(id->src_ip);
			dst_ip.s_addr = htonl(id->dst_ip);
			if (oif == NULL)
				pcb = in_pcblookup_mbuf(pi,
				    src_ip, htons(id->src_port),
				    dst_ip, htons(id->dst_port),
				    lookupflags, oif, args->m);
			else
				pcb = in_pcblookup_mbuf(pi,
				    dst_ip, htons(id->dst_port),
				    src_ip, htons(id->src_port),
				    lookupflags, oif, args->m);
		}
		if (pcb != NULL) {
			INP_RLOCK_ASSERT(pcb);
			/* cache the credentials, then drop the PCB lock */
			*uc = crhold(pcb->inp_cred);
			*ugid_lookupp = 1;
			INP_RUNLOCK(pcb);
		}
		if (*ugid_lookupp == 0) {
			/*
			 * We tried and failed, set the variable to -1
			 * so we will not try again on this packet.
			 */
			*ugid_lookupp = -1;
			return (0);
		}
	}
	/* Credentials are available in *uc: compare per opcode. */
	if (insn->o.opcode == O_UID)
		match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
	else if (insn->o.opcode == O_GID)
		match = groupmember((gid_t)insn->d[0], *uc);
	else if (insn->o.opcode == O_JAIL)
		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
	return (match);
#endif /* __FreeBSD__ */
#endif /* not supported in userspace */
}

/*
 * Helper function to set args with info on the rule after the matching
 * one. slot is precise, whereas we guess rule_id as they are
 * assigned sequentially.
*/ static inline void set_match(struct ip_fw_args *args, int slot, struct ip_fw_chain *chain) { args->rule.chain_id = chain->id; args->rule.slot = slot + 1; /* we use 0 as a marker */ args->rule.rule_id = 1 + chain->map[slot]->id; args->rule.rulenum = chain->map[slot]->rulenum; } /* * The main check routine for the firewall. * * All arguments are in args so we can modify them and return them * back to the caller. * * Parameters: * * args->m (in/out) The packet; we set to NULL when/if we nuke it. * Starts with the IP header. * args->eh (in) Mac header if present, NULL for layer3 packet. * args->L3offset Number of bytes bypassed if we came from L2. * e.g. often sizeof(eh) ** NOTYET ** * args->oif Outgoing interface, NULL if packet is incoming. * The incoming interface is in the mbuf. (in) * args->divert_rule (in/out) * Skip up to the first rule past this rule number; * upon return, non-zero port number for divert or tee. * * args->rule Pointer to the last matching rule (in/out) * args->next_hop Socket we are forwarding to (out). * args->next_hop6 IPv6 next hop we are forwarding to (out). * args->f_id Addresses grabbed from the packet (out) * args->rule.info a cookie depending on rule action * * Return value: * * IP_FW_PASS the packet must be accepted * IP_FW_DENY the packet must be dropped * IP_FW_DIVERT divert packet, port in m_tag * IP_FW_TEE tee packet, port in m_tag * IP_FW_DUMMYNET to dummynet, pipe in args->cookie * IP_FW_NETGRAPH into netgraph, cookie args->cookie * args->rule contains the matching rule, * args->rule.info has additional information. * */ int ipfw_chk(struct ip_fw_args *args) { /* * Local variables holding state while processing a packet: * * IMPORTANT NOTE: to speed up the processing of rules, there * are some assumption on the values of the variables, which * are documented here. Should you change them, please check * the implementation of the various instructions to make sure * that they still work. * * args->eh The MAC header. 
It is non-null for a layer2 * packet, it is NULL for a layer-3 packet. * **notyet** * args->L3offset Offset in the packet to the L3 (IP or equiv.) header. * * m | args->m Pointer to the mbuf, as received from the caller. * It may change if ipfw_chk() does an m_pullup, or if it * consumes the packet because it calls send_reject(). * XXX This has to change, so that ipfw_chk() never modifies * or consumes the buffer. * ip is the beginning of the ip(4 or 6) header. * Calculated by adding the L3offset to the start of data. * (Until we start using L3offset, the packet is * supposed to start with the ip header). */ struct mbuf *m = args->m; struct ip *ip = mtod(m, struct ip *); /* * For rules which contain uid/gid or jail constraints, cache * a copy of the users credentials after the pcb lookup has been * executed. This will speed up the processing of rules with * these types of constraints, as well as decrease contention * on pcb related locks. */ #ifndef __FreeBSD__ struct bsd_ucred ucred_cache; #else struct ucred *ucred_cache = NULL; #endif int ucred_lookup = 0; /* * oif | args->oif If NULL, ipfw_chk has been called on the * inbound path (ether_input, ip_input). * If non-NULL, ipfw_chk has been called on the outbound path * (ether_output, ip_output). */ struct ifnet *oif = args->oif; int f_pos = 0; /* index of current rule in the array */ int retval = 0; /* * hlen The length of the IP header. */ u_int hlen = 0; /* hlen >0 means we have an IP pkt */ /* * offset The offset of a fragment. offset != 0 means that * we have a fragment at this offset of an IPv4 packet. * offset == 0 means that (if this is an IPv4 packet) * this is the first or only fragment. * For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header * or there is a single packet fragement (fragement header added * without needed). We will treat a single packet fragment as if * there was no fragment header (or log/block depending on the * V_fw_permit_single_frag6 sysctl setting). 
*/ u_short offset = 0; u_short ip6f_mf = 0; /* * Local copies of addresses. They are only valid if we have * an IP packet. * * proto The protocol. Set to 0 for non-ip packets, * or to the protocol read from the packet otherwise. * proto != 0 means that we have an IPv4 packet. * * src_port, dst_port port numbers, in HOST format. Only * valid for TCP and UDP packets. * * src_ip, dst_ip ip addresses, in NETWORK format. * Only valid for IPv4 packets. */ uint8_t proto; uint16_t src_port = 0, dst_port = 0; /* NOTE: host format */ struct in_addr src_ip, dst_ip; /* NOTE: network format */ uint16_t iplen=0; int pktlen; uint16_t etype = 0; /* Host order stored ether type */ /* * dyn_dir = MATCH_UNKNOWN when rules unchecked, * MATCH_NONE when checked and not matched (q = NULL), * MATCH_FORWARD or MATCH_REVERSE otherwise (q != NULL) */ int dyn_dir = MATCH_UNKNOWN; ipfw_dyn_rule *q = NULL; struct ip_fw_chain *chain = &V_layer3_chain; /* * We store in ulp a pointer to the upper layer protocol header. * In the ipv4 case this is easy to determine from the header, * but for ipv6 we might have some additional headers in the middle. * ulp is NULL if not found. */ void *ulp = NULL; /* upper layer protocol pointer. */ /* XXX ipv6 variables */ int is_ipv6 = 0; uint8_t icmp6_type = 0; uint16_t ext_hd = 0; /* bits vector for extension header filtering */ /* end of ipv6 variables */ int is_ipv4 = 0; int done = 0; /* flag to exit the outer loop */ if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready)) return (IP_FW_PASS); /* accept */ dst_ip.s_addr = 0; /* make sure it is initialized */ src_ip.s_addr = 0; /* make sure it is initialized */ pktlen = m->m_pkthdr.len; args->f_id.fib = M_GETFIB(m); /* note mbuf not altered) */ proto = args->f_id.proto = 0; /* mark f_id invalid */ /* XXX 0 is a valid proto: IP/IPv6 Hop-by-Hop Option */ /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. 
WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) PULLUP_LEN(_len, p, sizeof(T)) #define PULLUP_LEN(_len, p, T) \ do { \ int x = (_len) + T; \ if ((m)->m_len < x) { \ args->m = m = m_pullup(m, x); \ if (m == NULL) \ goto pullup_failed; \ } \ p = (mtod(m, char *) + (_len)); \ } while (0) /* * if we have an ether header, */ if (args->eh) etype = ntohs(args->eh->ether_type); /* Identify IP packets and fill up variables. */ if (pktlen >= sizeof(struct ip6_hdr) && (args->eh == NULL || etype == ETHERTYPE_IPV6) && ip->ip_v == 6) { struct ip6_hdr *ip6 = (struct ip6_hdr *)ip; is_ipv6 = 1; args->f_id.addr_type = 6; hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; /* Search extension headers to find upper layer protocols */ while (ulp == NULL && offset == 0) { switch (proto) { case IPPROTO_ICMPV6: PULLUP_TO(hlen, ulp, struct icmp6_hdr); icmp6_type = ICMP6(ulp)->icmp6_type; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_HOPOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); switch (((struct ip6_rthdr *)ulp)->ip6r_type) { case 0: ext_hd |= EXT_RTHDR0; break; case 2: ext_hd |= EXT_RTHDR2; break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Routing Header type(%d)\n", ((struct ip6_rthdr *) ulp)->ip6r_type); if (V_fw_deny_unknown_exthdrs) return 
(IP_FW_DENY); break; } ext_hd |= EXT_ROUTING; hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); ext_hd |= EXT_FRAGMENT; hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_MORE_FRAG; if (V_fw_permit_single_frag6 == 0 && offset == 0 && ip6f_mf == 0) { if (V_fw_verbose) printf("IPFW2: IPV6 - Invalid " "Fragment Header\n"); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); break; } args->f_id.extra = ntohl(((struct ip6_frag *)ulp)->ip6f_ident); ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); ext_hd |= EXT_DSTOPTS; hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); ext_hd |= EXT_AH; hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; case IPPROTO_ESP: /* RFC 2406 */ PULLUP_TO(hlen, ulp, uint32_t); /* SPI, Seq# */ /* Anything past Seq# is variable length and * data past this ext. header is encrypted. */ ext_hd |= EXT_ESP; break; case IPPROTO_NONE: /* RFC 2460 */ /* * Packet ends here, and IPv6 header has * already been pulled up. If ip6e_len!=0 * then octets must be ignored. */ ulp = ip; /* non-NULL to get out of loop. */ break; case IPPROTO_OSPFIGP: /* XXX OSPF header check? */ PULLUP_TO(hlen, ulp, struct ip6_ext); break; case IPPROTO_PIM: /* XXX PIM header check? 
*/ PULLUP_TO(hlen, ulp, struct pim); break; case IPPROTO_CARP: PULLUP_TO(hlen, ulp, struct carp_header); if (((struct carp_header *)ulp)->carp_version != CARP_VERSION) return (IP_FW_DENY); if (((struct carp_header *)ulp)->carp_type != CARP_ADVERTISEMENT) return (IP_FW_DENY); break; case IPPROTO_IPV6: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip6_hdr); break; case IPPROTO_IPV4: /* RFC 2893 */ PULLUP_TO(hlen, ulp, struct ip); break; default: if (V_fw_verbose) printf("IPFW2: IPV6 - Unknown " "Extension Header(%d), ext_hd=%x\n", proto, ext_hd); if (V_fw_deny_unknown_exthdrs) return (IP_FW_DENY); PULLUP_TO(hlen, ulp, struct ip6_ext); break; } /*switch */ } ip = mtod(m, struct ip *); ip6 = (struct ip6_hdr *)ip; args->f_id.src_ip6 = ip6->ip6_src; args->f_id.dst_ip6 = ip6->ip6_dst; args->f_id.src_ip = 0; args->f_id.dst_ip = 0; args->f_id.flow_id6 = ntohl(ip6->ip6_flow); } else if (pktlen >= sizeof(struct ip) && (args->eh == NULL || etype == ETHERTYPE_IP) && ip->ip_v == 4) { is_ipv4 = 1; hlen = ip->ip_hl << 2; args->f_id.addr_type = 4; /* * Collect parameters into local variables for faster matching. */ proto = ip->ip_p; src_ip = ip->ip_src; dst_ip = ip->ip_dst; offset = ntohs(ip->ip_off) & IP_OFFMASK; iplen = ntohs(ip->ip_len); pktlen = iplen < pktlen ? 
iplen : pktlen; if (offset == 0) { switch (proto) { case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dst_port = TCP(ulp)->th_dport; src_port = TCP(ulp)->th_sport; /* save flags for dynamic rules */ args->f_id._flags = TCP(ulp)->th_flags; break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); src_port = SCTP(ulp)->src_port; dst_port = SCTP(ulp)->dest_port; break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dst_port = UDP(ulp)->uh_dport; src_port = UDP(ulp)->uh_sport; break; case IPPROTO_ICMP: PULLUP_TO(hlen, ulp, struct icmphdr); //args->f_id.flags = ICMP(ulp)->icmp_type; break; default: break; } } ip = mtod(m, struct ip *); args->f_id.src_ip = ntohl(src_ip.s_addr); args->f_id.dst_ip = ntohl(dst_ip.s_addr); } #undef PULLUP_TO if (proto) { /* we may have port numbers, store them */ args->f_id.proto = proto; args->f_id.src_port = src_port = ntohs(src_port); args->f_id.dst_port = dst_port = ntohs(dst_port); } IPFW_RLOCK(chain); if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */ IPFW_RUNLOCK(chain); return (IP_FW_PASS); /* accept */ } if (args->rule.slot) { /* * Packet has already been tagged as a result of a previous * match on rule args->rule aka args->rule_id (PIPE, QUEUE, * REASS, NETGRAPH, DIVERT/TEE...) * Validate the slot and continue from the next one * if still present, otherwise do a lookup. */ f_pos = (args->rule.chain_id == chain->id) ? args->rule.slot : ipfw_find_rule(chain, args->rule.rulenum, args->rule.rule_id); } else { f_pos = 0; } /* * Now scan the rules, and parse microinstructions for each rule. * We have two nested loops and an inner switch. Sometimes we * need to break out of one or both loops, or re-enter one of * the loops with updated variables. Loop variables are: * * f_pos (outer loop) points to the current rule. * On output it points to the matching rule. * done (outer loop) is used as a flag to break the loop. * l (inner loop) residual length of current rule. * cmd points to the current microinstruction. 
* * We break the inner loop by setting l=0 and possibly * cmdlen=0 if we don't want to advance cmd. * We break the outer loop by setting done=1 * We can restart the inner loop by setting l>0 and f_pos, f, cmd * as needed. */ for (; f_pos < chain->n_rules; f_pos++) { ipfw_insn *cmd; uint32_t tablearg = 0; int l, cmdlen, skip_or; /* skip rest of OR block */ struct ip_fw *f; f = chain->map[f_pos]; if (V_set_disable & (1 << f->set) ) continue; skip_or = 0; for (l = f->cmd_len, cmd = f->cmd ; l > 0 ; l -= cmdlen, cmd += cmdlen) { int match; /* * check_body is a jump target used when we find a * CHECK_STATE, and need to jump to the body of * the target rule. */ /* check_body: */ cmdlen = F_LEN(cmd); /* * An OR block (insn_1 || .. || insn_n) has the * F_OR bit set in all but the last instruction. * The first match will set "skip_or", and cause * the following instructions to be skipped until * past the one with the F_OR bit clear. */ if (skip_or) { /* skip this instruction */ if ((cmd->len & F_OR) == 0) skip_or = 0; /* next one is good */ continue; } match = 0; /* set to 1 if we succeed */ switch (cmd->opcode) { /* * The first set of opcodes compares the packet's * fields with some pattern, setting 'match' if a * match is found. At the end of the loop there is * logic to deal with F_NOT and F_OR flags associated * with the opcode. */ case O_NOP: match = 1; break; case O_FORWARD_MAC: printf("ipfw: opcode %d unimplemented\n", cmd->opcode); break; case O_GID: case O_UID: case O_JAIL: /* * We only check offset == 0 && proto != 0, * as this ensures that we have a * packet with the ports info. 
*/ if (offset != 0) break; if (proto == IPPROTO_TCP || proto == IPPROTO_UDP) match = check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); #else (void *)&ucred_cache); #endif break; case O_RECV: match = iface_match(m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_XMIT: match = iface_match(oif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_VIA: match = iface_match(oif ? oif : m->m_pkthdr.rcvif, (ipfw_insn_if *)cmd, chain, &tablearg); break; case O_MACADDR2: if (args->eh != NULL) { /* have MAC header */ u_int32_t *want = (u_int32_t *) ((ipfw_insn_mac *)cmd)->addr; u_int32_t *mask = (u_int32_t *) ((ipfw_insn_mac *)cmd)->mask; u_int32_t *hdr = (u_int32_t *)args->eh; match = ( want[0] == (hdr[0] & mask[0]) && want[1] == (hdr[1] & mask[1]) && want[2] == (hdr[2] & mask[2]) ); } break; case O_MAC_TYPE: if (args->eh != NULL) { u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (etype >= p[0] && etype <= p[1]); } break; case O_FRAG: match = (offset != 0); break; case O_IN: /* "out" is "not in" */ match = (oif == NULL); break; case O_LAYER2: match = (args->eh != NULL); break; case O_DIVERTED: { /* For diverted packets, args->rule.info * contains the divert port (in host format) * reason and direction. */ uint32_t i = args->rule.info; match = (i&IPFW_IS_MASK) == IPFW_IS_DIVERT && cmd->arg1 & ((i & IPFW_INFO_IN) ? 1 : 2); } break; case O_PROTO: /* * We do not allow an arg of 0 so the * check of "proto" only suffices. */ match = (proto == cmd->arg1); break; case O_IP_SRC: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == src_ip.s_addr); break; case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: if (is_ipv4) { uint32_t key = (cmd->opcode == O_IP_DST_LOOKUP) ? dst_ip.s_addr : src_ip.s_addr; uint32_t v = 0; if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) { /* generic lookup. The key must be * in 32bit big-endian format. 
*/ v = ((ipfw_insn_u32 *)cmd)->d[1]; if (v == 0) key = dst_ip.s_addr; else if (v == 1) key = src_ip.s_addr; else if (v == 6) /* dscp */ key = (ip->ip_tos >> 2) & 0x3f; else if (offset != 0) break; else if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) break; else if (v == 2) key = htonl(dst_port); else if (v == 3) key = htonl(src_port); #ifndef USERSPACE else if (v == 4 || v == 5) { check_uidgid( (ipfw_insn_u32 *)cmd, args, &ucred_lookup, #ifdef __FreeBSD__ &ucred_cache); if (v == 4 /* O_UID */) key = ucred_cache->cr_uid; else if (v == 5 /* O_JAIL */) key = ucred_cache->cr_prison->pr_id; #else /* !__FreeBSD__ */ (void *)&ucred_cache); if (v ==4 /* O_UID */) key = ucred_cache.uid; else if (v == 5 /* O_JAIL */) key = ucred_cache.xid; #endif /* !__FreeBSD__ */ key = htonl(key); } else #endif /* !USERSPACE */ break; } match = ipfw_lookup_table(chain, cmd->arg1, key, &v); if (!match) break; if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; else tablearg = v; } else if (is_ipv6) { uint32_t v = 0; void *pkey = (cmd->opcode == O_IP_DST_LOOKUP) ? &args->f_id.dst_ip6: &args->f_id.src_ip6; match = ipfw_lookup_table_extended(chain, cmd->arg1, pkey, &v, IPFW_TABLE_CIDR); if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) match = ((ipfw_insn_u32 *)cmd)->d[0] == v; if (match) tablearg = v; } break; case O_IP_SRC_MASK: case O_IP_DST_MASK: if (is_ipv4) { uint32_t a = (cmd->opcode == O_IP_DST_MASK) ? dst_ip.s_addr : src_ip.s_addr; uint32_t *p = ((ipfw_insn_u32 *)cmd)->d; int i = cmdlen-1; for (; !match && i>0; i-= 2, p+= 2) match = (p[0] == (a & p[1])); } break; case O_IP_SRC_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(src_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_SRC_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.src_ip6); #endif break; case O_IP_DST_SET: case O_IP_SRC_SET: if (is_ipv4) { u_int32_t *d = (u_int32_t *)(cmd+1); u_int32_t addr = cmd->opcode == O_IP_DST_SET ? 
args->f_id.dst_ip : args->f_id.src_ip; if (addr < d[0]) break; addr -= d[0]; /* subtract base */ match = (addr < cmd->arg1) && ( d[ 1 + (addr>>5)] & (1<<(addr & 0x1f)) ); } break; case O_IP_DST: match = is_ipv4 && (((ipfw_insn_ip *)cmd)->addr.s_addr == dst_ip.s_addr); break; case O_IP_DST_ME: if (is_ipv4) { struct ifnet *tif; INADDR_TO_IFP(dst_ip, tif); match = (tif != NULL); break; } #ifdef INET6 /* FALLTHROUGH */ case O_IP6_DST_ME: match= is_ipv6 && search_ip6_addr_net(&args->f_id.dst_ip6); #endif break; case O_IP_SRCPORT: case O_IP_DSTPORT: /* * offset == 0 && proto != 0 is enough * to guarantee that we have a * packet with port info. */ if ((proto==IPPROTO_UDP || proto==IPPROTO_TCP) && offset == 0) { u_int16_t x = (cmd->opcode == O_IP_SRCPORT) ? src_port : dst_port ; u_int16_t *p = ((ipfw_insn_u16 *)cmd)->ports; int i; for (i = cmdlen - 1; !match && i>0; i--, p += 2) match = (x>=p[0] && x<=p[1]); } break; case O_ICMPTYPE: match = (offset == 0 && proto==IPPROTO_ICMP && icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) ); break; #ifdef INET6 case O_ICMP6TYPE: match = is_ipv6 && offset == 0 && proto==IPPROTO_ICMPV6 && icmp6type_match( ICMP6(ulp)->icmp6_type, (ipfw_insn_u32 *)cmd); break; #endif /* INET6 */ case O_IPOPT: match = (is_ipv4 && ipopts_match(ip, cmd) ); break; case O_IPVER: match = (is_ipv4 && cmd->arg1 == ip->ip_v); break; case O_IPID: case O_IPLEN: case O_IPTTL: if (is_ipv4) { /* only for IP packets */ uint16_t x; uint16_t *p; int i; if (cmd->opcode == O_IPLEN) x = iplen; else if (cmd->opcode == O_IPTTL) x = ip->ip_ttl; else /* must be IPID */ x = ntohs(ip->ip_id); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_IPPRECEDENCE: match = (is_ipv4 && (cmd->arg1 == (ip->ip_tos & 0xe0)) ); break; case O_IPTOS: match = (is_ipv4 && flags_match(cmd, ip->ip_tos)); break; case O_TCPDATALEN: if 
(proto == IPPROTO_TCP && offset == 0) { struct tcphdr *tcp; uint16_t x; uint16_t *p; int i; tcp = TCP(ulp); x = iplen - ((ip->ip_hl + tcp->th_off) << 2); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* otherwise we have ranges */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i>0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_TCPFLAGS: match = (proto == IPPROTO_TCP && offset == 0 && flags_match(cmd, TCP(ulp)->th_flags)); break; case O_TCPOPTS: PULLUP_LEN(hlen, ulp, (TCP(ulp)->th_off << 2)); match = (proto == IPPROTO_TCP && offset == 0 && tcpopts_match(TCP(ulp), cmd)); break; case O_TCPSEQ: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_seq); break; case O_TCPACK: match = (proto == IPPROTO_TCP && offset == 0 && ((ipfw_insn_u32 *)cmd)->d[0] == TCP(ulp)->th_ack); break; case O_TCPWIN: if (proto == IPPROTO_TCP && offset == 0) { uint16_t x; uint16_t *p; int i; x = ntohs(TCP(ulp)->th_win); if (cmdlen == 1) { match = (cmd->arg1 == x); break; } /* Otherwise we have ranges. */ p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for (; !match && i > 0; i--, p += 2) match = (x >= p[0] && x <= p[1]); } break; case O_ESTAB: /* reject packets which have SYN only */ /* XXX should i also check for TH_ACK ? */ match = (proto == IPPROTO_TCP && offset == 0 && (TCP(ulp)->th_flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN); break; case O_ALTQ: { struct pf_mtag *at; ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd; match = 1; at = pf_find_mtag(m); if (at != NULL && at->qid != 0) break; at = pf_get_mtag(m); if (at == NULL) { /* * Let the packet fall back to the * default ALTQ. 
*/ break; } at->qid = altq->qid; at->hdr = ip; break; } case O_LOG: ipfw_log(f, hlen, args, m, oif, offset | ip6f_mf, tablearg, ip); match = 1; break; case O_PROB: match = (random()<((ipfw_insn_u32 *)cmd)->d[0]); break; case O_VERREVPATH: /* Outgoing packets automatically pass/match */ match = ((oif != NULL) || (m->m_pkthdr.rcvif == NULL) || ( #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib))); break; case O_VERSRCREACH: /* Outgoing packets automatically pass/match */ match = (hlen > 0 && ((oif != NULL) || #ifdef INET6 is_ipv6 ? verify_path6(&(args->f_id.src_ip6), NULL, args->f_id.fib) : #endif verify_path(src_ip, NULL, args->f_id.fib))); break; case O_ANTISPOOF: /* Outgoing packets automatically pass/match */ if (oif == NULL && hlen > 0 && ( (is_ipv4 && in_localaddr(src_ip)) #ifdef INET6 || (is_ipv6 && in6_localaddr(&(args->f_id.src_ip6))) #endif )) match = #ifdef INET6 is_ipv6 ? verify_path6( &(args->f_id.src_ip6), m->m_pkthdr.rcvif, args->f_id.fib) : #endif verify_path(src_ip, m->m_pkthdr.rcvif, args->f_id.fib); else match = 1; break; case O_IPSEC: #ifdef IPSEC match = (m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL); #endif /* otherwise no match */ break; #ifdef INET6 case O_IP6_SRC: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_DST: match = is_ipv6 && IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6, &((ipfw_insn_ip6 *)cmd)->addr6); break; case O_IP6_SRC_MASK: case O_IP6_DST_MASK: if (is_ipv6) { int i = cmdlen - 1; struct in6_addr p; struct in6_addr *d = &((ipfw_insn_ip6 *)cmd)->addr6; for (; !match && i > 0; d += 2, i -= F_INSN_SIZE(struct in6_addr) * 2) { p = (cmd->opcode == O_IP6_SRC_MASK) ? 
args->f_id.src_ip6: args->f_id.dst_ip6; APPLY_MASK(&p, &d[1]); match = IN6_ARE_ADDR_EQUAL(&d[0], &p); } } break; case O_FLOW6ID: match = is_ipv6 && flow6id_match(args->f_id.flow_id6, (ipfw_insn_u32 *) cmd); break; case O_EXT_HDR: match = is_ipv6 && (ext_hd & ((ipfw_insn *) cmd)->arg1); break; case O_IP6: match = is_ipv6; break; #endif case O_IP4: match = is_ipv4; break; case O_TAG: { struct m_tag *mtag; uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; /* Packet is already tagged with this tag? */ mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL); /* We have `untag' action when F_NOT flag is * present. And we must remove this mtag from * mbuf and reset `match' to zero (`match' will * be inversed later). * Otherwise we should allocate new mtag and * push it into mbuf. */ if (cmd->len & F_NOT) { /* `untag' action */ if (mtag != NULL) m_tag_delete(m, mtag); match = 0; } else { if (mtag == NULL) { mtag = m_tag_alloc( MTAG_IPFW, tag, 0, M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } match = 1; } break; } case O_FIB: /* try match the specified fib */ if (args->f_id.fib == cmd->arg1) match = 1; break; case O_SOCKARG: { #ifndef USERSPACE /* not supported in userspace */ struct inpcb *inp = args->inp; struct inpcbinfo *pi; if (is_ipv6) /* XXX can we remove this ? */ break; if (proto == IPPROTO_TCP) pi = &V_tcbinfo; else if (proto == IPPROTO_UDP) pi = &V_udbinfo; else break; /* * XXXRW: so_user_cookie should almost * certainly be inp_user_cookie? 
*/ /* For incomming packet, lookup up the inpcb using the src/dest ip/port tuple */ if (inp == NULL) { inp = in_pcblookup(pi, src_ip, htons(src_port), dst_ip, htons(dst_port), INPLOOKUP_RLOCKPCB, NULL); if (inp != NULL) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; INP_RUNLOCK(inp); } } else { if (inp->inp_socket) { tablearg = inp->inp_socket->so_user_cookie; if (tablearg) match = 1; } } #endif /* !USERSPACE */ break; } case O_TAGGED: { struct m_tag *mtag; uint32_t tag = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (cmdlen == 1) { match = m_tag_locate(m, MTAG_IPFW, tag, NULL) != NULL; break; } /* we have ranges */ for (mtag = m_tag_first(m); mtag != NULL && !match; mtag = m_tag_next(m, mtag)) { uint16_t *p; int i; if (mtag->m_tag_cookie != MTAG_IPFW) continue; p = ((ipfw_insn_u16 *)cmd)->ports; i = cmdlen - 1; for(; !match && i > 0; i--, p += 2) match = mtag->m_tag_id >= p[0] && mtag->m_tag_id <= p[1]; } break; } /* * The second set of opcodes represents 'actions', * i.e. the terminal part of a rule once the packet * matches all previous patterns. * Typically there is only one action for each rule, * and the opcode is stored at the end of the rule * (but there are exceptions -- see below). * * In general, here we set retval and terminate the * outer loop (would be a 'break 3' in some language, * but we need to set l=0, done=1) * * Exceptions: * O_COUNT and O_SKIPTO actions: * instead of terminating, we jump to the next rule * (setting l=0), or to the SKIPTO target (setting * f/f_len, cmd and l as needed), respectively. * * O_TAG, O_LOG and O_ALTQ action parameters: * perform some action and set match = 1; * * O_LIMIT and O_KEEP_STATE: these opcodes are * not real 'actions', and are stored right * before the 'action' part of the rule. 
* These opcodes try to install an entry in the * state tables; if successful, we continue with * the next opcode (match=1; break;), otherwise * the packet must be dropped (set retval, * break loops with l=0, done=1) * * O_PROBE_STATE and O_CHECK_STATE: these opcodes * cause a lookup of the state table, and a jump * to the 'action' part of the parent rule * if an entry is found, or * (CHECK_STATE only) a jump to the next rule if * the entry is not found. * The result of the lookup is cached so that * further instances of these opcodes become NOPs. * The jump to the next rule is done by setting * l=0, cmdlen=0. */ case O_LIMIT: case O_KEEP_STATE: if (ipfw_install_state(f, (ipfw_insn_limit *)cmd, args, tablearg)) { /* error or limit violation */ retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ } match = 1; break; case O_PROBE_STATE: case O_CHECK_STATE: /* * dynamic rules are checked at the first * keep-state or check-state occurrence, * with the result being stored in dyn_dir. * The compiler introduces a PROBE_STATE * instruction for us when we have a * KEEP_STATE (because PROBE_STATE needs * to be run first). */ if (dyn_dir == MATCH_UNKNOWN && (q = ipfw_lookup_dyn_rule(&args->f_id, &dyn_dir, proto == IPPROTO_TCP ? TCP(ulp) : NULL)) != NULL) { /* * Found dynamic entry, update stats * and jump to the 'action' part of * the parent rule by setting * f, cmd, l and clearing cmdlen. */ q->pcnt++; q->bcnt += pktlen; /* XXX we would like to have f_pos * readily accessible in the dynamic * rule, instead of having to * lookup q->rule. */ f = q->rule; f_pos = ipfw_find_rule(chain, f->rulenum, f->id); cmd = ACTION_PTR(f); l = f->cmd_len - f->act_ofs; ipfw_dyn_unlock(); cmdlen = 0; match = 1; break; } /* * Dynamic entry not found. If CHECK_STATE, * skip to next rule, if PROBE_STATE just * ignore and continue with next opcode. 
*/ if (cmd->opcode == O_CHECK_STATE) l = 0; /* exit inner loop */ match = 1; break; case O_ACCEPT: retval = 0; /* accept */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_PIPE: case O_QUEUE: set_match(args, f_pos, chain); args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (cmd->opcode == O_PIPE) args->rule.info |= IPFW_IS_PIPE; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = IP_FW_DUMMYNET; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_DIVERT: case O_TEE: if (args->eh) /* not on layer 2 */ break; /* otherwise this is terminal */ l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ retval = (cmd->opcode == O_DIVERT) ? IP_FW_DIVERT : IP_FW_TEE; set_match(args, f_pos, chain); args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; break; case O_COUNT: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; l = 0; /* exit inner loop */ break; case O_SKIPTO: f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; /* If possible use cached f_pos (in f->next_rule), * whose version is written in f->next_rule * (horrible hacks to avoid changing the ABI). */ if (cmd->arg1 != IP_FW_TABLEARG && (uintptr_t)f->x_next == chain->id) { f_pos = (uintptr_t)f->next_rule; } else { int i = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; /* make sure we do not jump backward */ if (i <= f->rulenum) i = f->rulenum + 1; f_pos = ipfw_find_rule(chain, i, 0); /* update the cache */ if (cmd->arg1 != IP_FW_TABLEARG) { f->next_rule = (void *)(uintptr_t)f_pos; f->x_next = (void *)(uintptr_t)chain->id; } } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the skipto rule. 
*/ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; match = 1; cmdlen = 0; skip_or = 0; continue; break; /* not reached */ case O_CALLRETURN: { /* * Implementation of `subroutine' call/return, * in the stack carried in an mbuf tag. This * is different from `skipto' in that any call * address is possible (`skipto' must prevent * backward jumps to avoid endless loops). * We have `return' action when F_NOT flag is * present. The `m_tag_id' field is used as * stack pointer. */ struct m_tag *mtag; uint16_t jmpto, *stack; #define IS_CALL ((cmd->len & F_NOT) == 0) #define IS_RETURN ((cmd->len & F_NOT) != 0) /* * Hand-rolled version of m_tag_locate() with * wildcard `type'. * If not already tagged, allocate new tag. */ mtag = m_tag_first(m); while (mtag != NULL) { if (mtag->m_tag_cookie == MTAG_IPFW_CALL) break; mtag = m_tag_next(m, mtag); } if (mtag == NULL && IS_CALL) { mtag = m_tag_alloc(MTAG_IPFW_CALL, 0, IPFW_CALLSTACK_SIZE * sizeof(uint16_t), M_NOWAIT); if (mtag != NULL) m_tag_prepend(m, mtag); } /* * On error both `call' and `return' just * continue with next rule. */ if (IS_RETURN && (mtag == NULL || mtag->m_tag_id == 0)) { l = 0; /* exit inner loop */ break; } if (IS_CALL && (mtag == NULL || mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) { printf("ipfw: call stack error, " "go to next rule\n"); l = 0; /* exit inner loop */ break; } f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; stack = (uint16_t *)(mtag + 1); /* * The `call' action may use cached f_pos * (in f->next_rule), whose version is written * in f->next_rule. * The `return' action, however, doesn't have * fixed jump address in cmd->arg1 and can't use * cache. */ if (IS_CALL) { stack[mtag->m_tag_id] = f->rulenum; mtag->m_tag_id++; if (cmd->arg1 != IP_FW_TABLEARG && (uintptr_t)f->x_next == chain->id) { f_pos = (uintptr_t)f->next_rule; } else { jmpto = (cmd->arg1 == IP_FW_TABLEARG) ? 
tablearg: cmd->arg1; f_pos = ipfw_find_rule(chain, jmpto, 0); /* update the cache */ if (cmd->arg1 != IP_FW_TABLEARG) { f->next_rule = (void *)(uintptr_t) f_pos; f->x_next = (void *)(uintptr_t) chain->id; } } } else { /* `return' action */ mtag->m_tag_id--; jmpto = stack[mtag->m_tag_id] + 1; f_pos = ipfw_find_rule(chain, jmpto, 0); } /* * Skip disabled rules, and re-enter * the inner loop with the correct * f_pos, f, l and cmd. * Also clear cmdlen and skip_or */ for (; f_pos < chain->n_rules - 1 && (V_set_disable & (1 << chain->map[f_pos]->set)); f_pos++) ; /* Re-enter the inner loop at the dest rule. */ f = chain->map[f_pos]; l = f->cmd_len; cmd = f->cmd; cmdlen = 0; skip_or = 0; continue; break; /* NOTREACHED */ } #undef IS_CALL #undef IS_RETURN case O_REJECT: /* * Drop the packet and send a reject notice * if the packet is not ICMP (or is an ICMP * query), and it is not multicast/broadcast. */ if (hlen > 0 && is_ipv4 && offset == 0 && (proto != IPPROTO_ICMP || is_icmp_query(ICMP(ulp))) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN_MULTICAST(ntohl(dst_ip.s_addr))) { send_reject(args, cmd->arg1, iplen, ip); m = args->m; } /* FALLTHROUGH */ #ifdef INET6 case O_UNREACH6: if (hlen > 0 && is_ipv6 && ((offset & IP6F_OFF_MASK) == 0) && (proto != IPPROTO_ICMPV6 || (is_icmp6_query(icmp6_type) == 1)) && !(m->m_flags & (M_BCAST|M_MCAST)) && !IN6_IS_ADDR_MULTICAST(&args->f_id.dst_ip6)) { send_reject6( args, cmd->arg1, hlen, (struct ip6_hdr *)ip); m = args->m; } /* FALLTHROUGH */ #endif case O_DENY: retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_FORWARD_IP: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in *sa; sa = &(((ipfw_insn_sa *)cmd)->sa); if (sa->sin_addr.s_addr == INADDR_ANY) { bcopy(sa, &args->hopstore, sizeof(*sa)); args->hopstore.sin_addr.s_addr = htonl(tablearg); args->next_hop = &args->hopstore; } else { args->next_hop = sa; } } retval 
= IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #ifdef INET6 case O_FORWARD_IP6: if (args->eh) /* not valid on layer2 pkts */ break; if (q == NULL || q->rule != f || dyn_dir == MATCH_FORWARD) { struct sockaddr_in6 *sin6; sin6 = &(((ipfw_insn_sa6 *)cmd)->sa); args->next_hop6 = sin6; } retval = IP_FW_PASS; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; #endif case O_NETGRAPH: case O_NGTEE: set_match(args, f_pos, chain); args->rule.info = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; if (V_fw_one_pass) args->rule.info |= IPFW_ONEPASS; retval = (cmd->opcode == O_NETGRAPH) ? IP_FW_NETGRAPH : IP_FW_NGTEE; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_SETFIB: { uint32_t fib; f->pcnt++; /* update stats */ f->bcnt += pktlen; f->timestamp = time_uptime; fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg: cmd->arg1; if (fib >= rt_numfibs) fib = 0; M_SETFIB(m, fib); args->f_id.fib = fib; l = 0; /* exit inner loop */ break; } case O_NAT: if (!IPFW_NAT_LOADED) { retval = IP_FW_DENY; } else { struct cfg_nat *t; int nat_id; set_match(args, f_pos, chain); /* Check if this is 'global' nat rule */ if (cmd->arg1 == 0) { retval = ipfw_nat_ptr(args, NULL, m); l = 0; done = 1; break; } t = ((ipfw_insn_nat *)cmd)->nat; if (t == NULL) { nat_id = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg : cmd->arg1; t = (*lookup_nat_ptr)(&chain->nat, nat_id); if (t == NULL) { retval = IP_FW_DENY; l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; } if (cmd->arg1 != IP_FW_TABLEARG) ((ipfw_insn_nat *)cmd)->nat = t; } retval = ipfw_nat_ptr(args, t, m); } l = 0; /* exit inner loop */ done = 1; /* exit outer loop */ break; case O_REASS: { int ip_off; f->pcnt++; f->bcnt += pktlen; l = 0; /* in any case exit inner loop */ ip_off = ntohs(ip->ip_off); /* if not fragmented, go to next rule */ if ((ip_off & (IP_MF | IP_OFFMASK)) == 0) break; /* * ip_reass() expects len & off in host * byte order. 
*/ SET_HOST_IPLEN(ip); args->m = m = ip_reass(m); /* * do IP header checksum fixup. */ if (m == NULL) { /* fragment got swallowed */ retval = IP_FW_DENY; } else { /* good, packet complete */ int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; SET_NET_IPLEN(ip); ip->ip_sum = 0; if (hlen == sizeof(struct ip)) ip->ip_sum = in_cksum_hdr(ip); else ip->ip_sum = in_cksum(m, hlen); retval = IP_FW_REASS; set_match(args, f_pos, chain); } done = 1; /* exit outer loop */ break; } default: panic("-- unknown opcode %d\n", cmd->opcode); } /* end of switch() on opcodes */ /* * if we get here with l=0, then match is irrelevant. */ if (cmd->len & F_NOT) match = !match; if (match) { if (cmd->len & F_OR) skip_or = 1; } else { if (!(cmd->len & F_OR)) /* not an OR block, */ break; /* try next rule */ } } /* end of inner loop, scan opcodes */ #undef PULLUP_LEN if (done) break; /* next_rule:; */ /* try next rule */ } /* end of outer for, scan rules */ if (done) { struct ip_fw *rule = chain->map[f_pos]; /* Update statistics */ rule->pcnt++; rule->bcnt += pktlen; rule->timestamp = time_uptime; } else { retval = IP_FW_DENY; printf("ipfw: ouch!, skip past end of rules, denying packet\n"); } IPFW_RUNLOCK(chain); #ifdef __FreeBSD__ if (ucred_cache != NULL) crfree(ucred_cache); #endif return (retval); pullup_failed: if (V_fw_verbose) printf("ipfw: pullup failed\n"); return (IP_FW_DENY); } /* * Set maximum number of tables that can be used in given VNET ipfw instance. 
*/ #ifdef SYSCTL_NODE static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS) { int error; unsigned int ntables; ntables = V_fw_tables_max; error = sysctl_handle_int(oidp, &ntables, 0, req); /* Read operation or some error */ if ((error != 0) || (req->newptr == NULL)) return (error); return (ipfw_resize_tables(&V_layer3_chain, ntables)); } #endif /* * Module and VNET glue */ /* * Stuff that must be initialised only on boot or module load */ static int ipfw_init(void) { int error = 0; ipfw_dyn_attach(); /* * Only print out this stuff the first time around, * when called from the sysinit code. */ printf("ipfw2 " #ifdef INET6 "(+ipv6) " #endif "initialized, divert %s, nat %s, " "rule-based forwarding " #ifdef IPFIREWALL_FORWARD "enabled, " #else "disabled, " #endif "default to %s, logging ", #ifdef IPDIVERT "enabled", #else "loadable", #endif #ifdef IPFIREWALL_NAT "enabled", #else "loadable", #endif default_to_accept ? "accept" : "deny"); /* * Note: V_xxx variables can be accessed here but the vnet specific * initializer may not have been called yet for the VIMAGE case. * Tuneables will have been processed. We will print out values for * the default vnet. * XXX This should all be rationalized AFTER 8.0 */ if (V_fw_verbose == 0) printf("disabled\n"); else if (V_verbose_limit == 0) printf("unlimited\n"); else printf("limited to %d packets/entry by default\n", V_verbose_limit); /* Check user-supplied table count for validness */ if (default_fw_tables > IPFW_TABLES_MAX) default_fw_tables = IPFW_TABLES_MAX; ipfw_log_bpf(1); /* init */ return (error); } /* * Called for the removal of the last instance only on module unload. */ static void ipfw_destroy(void) { ipfw_log_bpf(0); /* uninit */ ipfw_dyn_detach(); printf("IP firewall unloaded\n"); } /* * Stuff that must be initialized for every instance * (including the first of course). 
*/ static int vnet_ipfw_init(const void *unused) { int error; struct ip_fw *rule = NULL; struct ip_fw_chain *chain; chain = &V_layer3_chain; /* First set up some values that are compile time options */ V_autoinc_step = 100; /* bounded to 1..1000 in add_rule() */ V_fw_deny_unknown_exthdrs = 1; #ifdef IPFIREWALL_VERBOSE V_fw_verbose = 1; #endif #ifdef IPFIREWALL_VERBOSE_LIMIT V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT; #endif #ifdef IPFIREWALL_NAT LIST_INIT(&chain->nat); #endif /* insert the default rule and create the initial map */ chain->n_rules = 1; chain->static_len = sizeof(struct ip_fw); chain->map = malloc(sizeof(struct ip_fw *), M_IPFW, M_WAITOK | M_ZERO); if (chain->map) rule = malloc(chain->static_len, M_IPFW, M_WAITOK | M_ZERO); /* Set initial number of tables */ V_fw_tables_max = default_fw_tables; error = ipfw_init_tables(chain); if (error) { printf("ipfw2: setting up tables failed\n"); free(chain->map, M_IPFW); free(rule, M_IPFW); return (ENOSPC); } /* fill and insert the default rule */ rule->act_ofs = 0; rule->rulenum = IPFW_DEFAULT_RULE; rule->cmd_len = 1; rule->set = RESVD_SET; rule->cmd[0].len = 1; rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY; chain->rules = chain->default_rule = chain->map[0] = rule; chain->id = rule->id = 1; IPFW_LOCK_INIT(chain); ipfw_dyn_init(); /* First set up some values that are compile time options */ V_ipfw_vnet_ready = 1; /* Open for business */ /* * Hook the sockopt handler, and the layer2 (V_ip_fw_chk_ptr) * and pfil hooks for ipv4 and ipv6. Even if the latter two fail * we still keep the module alive because the sockopt and * layer2 paths are still useful. * ipfw[6]_hook return 0 on success, ENOENT on failure, * so we can ignore the exact return value and just set a flag. * * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so * changes in the underlying (per-vnet) variables trigger * immediate hook()/unhook() calls. 
* In layer2 we have the same behaviour, except that V_ether_ipfw * is checked on each packet because there are no pfil hooks. */ V_ip_fw_ctl_ptr = ipfw_ctl; V_ip_fw_chk_ptr = ipfw_chk; error = ipfw_attach_hooks(1); return (error); } /* * Called for the removal of each instance. */ static int vnet_ipfw_uninit(const void *unused) { struct ip_fw *reap, *rule; struct ip_fw_chain *chain = &V_layer3_chain; int i; V_ipfw_vnet_ready = 0; /* tell new callers to go away */ /* * disconnect from ipv4, ipv6, layer2 and sockopt. * Then grab, release and grab again the WLOCK so we make * sure the update is propagated and nobody will be in. */ (void)ipfw_attach_hooks(0 /* detach */); V_ip_fw_chk_ptr = NULL; V_ip_fw_ctl_ptr = NULL; IPFW_UH_WLOCK(chain); IPFW_UH_WUNLOCK(chain); IPFW_UH_WLOCK(chain); IPFW_WLOCK(chain); ipfw_dyn_uninit(0); /* run the callout_drain */ IPFW_WUNLOCK(chain); ipfw_destroy_tables(chain); reap = NULL; IPFW_WLOCK(chain); for (i = 0; i < chain->n_rules; i++) { rule = chain->map[i]; rule->x_next = reap; reap = rule; } if (chain->map) free(chain->map, M_IPFW); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); if (reap != NULL) ipfw_reap_rules(reap); IPFW_LOCK_DESTROY(chain); ipfw_dyn_uninit(1); /* free the remaining parts */ return 0; } /* * Module event handler. * In general we have the choice of handling most of these events by the * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to * use the SYSINIT handlers as they are more capable of expressing the * flow of control during module and vnet operations, so this is just * a skeleton. Note there is no SYSINIT equivalent of the module * SHUTDOWN handler, but we don't have anything to do in that case anyhow. */ static int ipfw_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: /* Called once at module load or * system boot if compiled in. */ break; case MOD_QUIESCE: /* Called before unload. May veto unloading. 
*/ break; case MOD_UNLOAD: /* Called during unload. */ break; case MOD_SHUTDOWN: /* Called during system shutdown. */ break; default: err = EOPNOTSUPP; break; } return err; } static moduledata_t ipfwmod = { "ipfw", ipfw_modevent, 0 }; /* Define startup order. */ #define IPFW_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN #define IPFW_MODEVENT_ORDER (SI_ORDER_ANY - 255) /* On boot slot in here. */ #define IPFW_MODULE_ORDER (IPFW_MODEVENT_ORDER + 1) /* A little later. */ #define IPFW_VNET_ORDER (IPFW_MODEVENT_ORDER + 2) /* Later still. */ DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER); MODULE_VERSION(ipfw, 2); /* should declare some dependencies here */ /* * Starting up. Done in order after ipfwmod() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_init, NULL); VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_init, NULL); /* * Closing up shop. These are done in REVERSE ORDER, but still * after ipfwmod() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER, ipfw_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER, vnet_ipfw_uninit, NULL); /* end of file */ ipfw-user/sys/netinet/ipfw/ip_dn_private.h000644 000423 000000 00000032041 12007700562 021461 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 2010 Luigi Rizzo, Riccardo Panicucci, Universita` di Pisa * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * internal dummynet APIs.
 *
 * $FreeBSD: head/sys/netinet/ipfw/ip_dn_private.h 213253 2010-09-28 22:46:13Z luigi $
 */

#ifndef _IP_DN_PRIVATE_H
#define _IP_DN_PRIVATE_H

/* debugging support
 * use ND() to remove debugging, D() to print a line,
 * DX(level, ...) to print above a certain level
 * If you redefine D() you are expected to redefine all.
 *
 * D() prefixes each message with the calling function name and appends
 * a newline; DX() prints only when dn_cfg.debug is greater than 'lev'.
 */
#ifndef D
#define ND(fmt, ...) do {} while (0)
#define D1(fmt, ...) do {} while (0)
#define D(fmt, ...) printf("%-10s " fmt "\n",	\
	__FUNCTION__, ## __VA_ARGS__)
#define DX(lev, fmt, ...) do {	\
	if (dn_cfg.debug > lev) D(fmt, ## __VA_ARGS__); } while (0)
#endif

MALLOC_DECLARE(M_DUMMYNET);

/* Signed 64-bit division; not defined here when building for Linux. */
#ifndef __linux__
#define div64(a, b) ((int64_t)(a) / (int64_t)(b))
#endif

/* Create / destroy both dummynet mutexes (upper half and bottom half). */
#define DN_LOCK_INIT() do {	\
	mtx_init(&dn_cfg.uh_mtx, "dn_uh", NULL, MTX_DEF);	\
	mtx_init(&dn_cfg.bh_mtx, "dn_bh", NULL, MTX_DEF);	\
	} while (0)
#define DN_LOCK_DESTROY() do {	\
	mtx_destroy(&dn_cfg.uh_mtx);	\
	mtx_destroy(&dn_cfg.bh_mtx);	\
	} while (0)
#if 0 /* not used yet */
#define DN_UH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
#define DN_UH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
#define DN_UH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
#define DN_UH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
#define DN_UH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)
#endif

/*
 * NOTE(review): the DN_BH_* macros below operate on uh_mtx, not bh_mtx,
 * and read and write variants take the same mutex. Presumably this is
 * deliberate while the DN_UH_* set above is disabled -- confirm before
 * changing.
 */
#define DN_BH_RLOCK() mtx_lock(&dn_cfg.uh_mtx)
#define DN_BH_RUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
#define DN_BH_WLOCK() mtx_lock(&dn_cfg.uh_mtx)
#define DN_BH_WUNLOCK() mtx_unlock(&dn_cfg.uh_mtx)
#define DN_BH_LOCK_ASSERT() mtx_assert(&dn_cfg.uh_mtx, MA_OWNED)

/* Singly-linked list heads for the object types used in this file. */
SLIST_HEAD(dn_schk_head, dn_schk);
SLIST_HEAD(dn_sch_inst_head, dn_sch_inst);
SLIST_HEAD(dn_fsk_head, dn_fsk);
SLIST_HEAD(dn_queue_head, dn_queue);
SLIST_HEAD(dn_alg_head, dn_alg);

struct mq {	/* a basic queue of packets*/
	struct mbuf *head, *tail;
	int count;
};

/*
 * Fill in the header of a dummynet object: store the given type and
 * length and clear the subtype.
 * NOTE(review): the ';' after the closing brace is an empty file-scope
 * declaration; harmless, but pedantic compilers may warn about it.
 */
static inline void
set_oid(struct dn_id *o, int type, int len)
{
	o->type = type;
	o->len = len;
	o->subtype = 0;
};

/*
 * configuration and global data for a dummynet instance
 *
 * When a configuration is modified from userland, 'id' is incremented
 * so we can use the value to check for stale pointers.
 */
struct dn_parms {
	uint32_t	id;		/* configuration version */

	/* defaults (sysctl-accessible) */
	int	red_lookup_depth;
	int	red_avg_pkt_size;
	int	red_max_pkt_size;
	int	hash_size;
	int	max_hash_size;
	long	byte_limit;		/* max queue sizes */
	long	slot_limit;

	int	io_fast;
	int	debug;

	/* timekeeping */
	struct timeval prev_t;		/* last time dummynet_tick ran */
	struct dn_heap	evheap;		/* scheduled events */

	/* counters of objects -- used for reporting space */
	int	schk_count;
	int	si_count;
	int	fsk_count;
	int	queue_count;

	/* ticks and other stuff */
	uint64_t	curr_time;
	/* flowsets and schedulers are in hash tables, with 'hash_size'
	 * buckets. fshash is looked up at every packet arrival
	 * so better be generous if we expect many entries.
	 */
	struct dn_ht	*fshash;
	struct dn_ht	*schedhash;
	/* list of flowsets without a scheduler -- use sch_chain */
	struct dn_fsk_head	fsu;	/* list of unlinked flowsets */
	struct dn_alg_head	schedlist;	/* list of algorithms */

	/* Store the fs/sch to scan when draining. The value is the
	 * bucket number of the hash table. Expire can be disabled
	 * with net.inet.ip.dummynet.expire=0, or it happens every
	 * expire ticks.
	 **/
	int drain_fs;
	int drain_sch;
	uint32_t expire;
	uint32_t expire_cycle;	/* tick count */

	int init_done;

	/* if the upper half is busy doing something long,
	 * can set the busy flag and we will enqueue packets in
	 * a queue for later processing.
	 */
	int	busy;
	struct	mq	pending;

#ifdef _KERNEL
	/*
	 * This file is normally used in the kernel, unless we do
	 * some userland tests, in which case we do not need a mtx.
	 * uh_mtx arbitrates between system calls and also
	 * protects fshash, schedhash and fsunlinked.
	 * These structures are readonly for the lower half.
	 * bh_mtx protects all other structures which may be
	 * modified upon packet arrivals
	 */
#if defined( __linux__ ) || defined( _WIN32 )
	spinlock_t uh_mtx;
	spinlock_t bh_mtx;
#else
	struct mtx uh_mtx;
	struct mtx bh_mtx;
#endif

#endif /* _KERNEL */
};

/*
 * Delay line, contains all packets on output from a link.
 * Every scheduler instance has one.
 */
struct delay_line {
	struct dn_id oid;
	struct dn_sch_inst *si;
	struct mq mq;
};

/*
 * The kernel side of a flowset. It is linked in a hash table
 * of flowsets, and in a list of children of their parent scheduler.
 * qht is either the queue or (if HAVE_MASK) a hash table of queues.
 * Note that the mask to use is the (flow_mask|sched_mask), which
 * changes as we attach/detach schedulers. So we store it here.
 *
 * XXX If we want to add scheduler-specific parameters, we need to
 * put them in external storage because the scheduler may not be
 * available when the fsk is created.
 */
struct dn_fsk { /* kernel side of a flowset */
	struct dn_fs fs;
	SLIST_ENTRY(dn_fsk) fsk_next;	/* hash chain for fshash */

	struct ipfw_flow_id fsk_mask;

	/* qht is a hash table of queues, or just a single queue
	 * a bit in fs.flags tells us which one
	 */
	struct dn_ht	*qht;
	struct dn_schk *sched;		/* Sched we are linked to */
	SLIST_ENTRY(dn_fsk) sch_chain;	/* list of fsk attached to sched */

	/* bucket index used by drain routine to drain queues for this
	 * flowset
	 */
	int drain_bucket;
	/* Parameters related to RED / GRED */
	/* original values are in dn_fs*/
	int w_q ;		/* queue weight (scaled) */
	int max_th ;		/* maximum threshold for queue (scaled) */
	int min_th ;		/* minimum threshold for queue (scaled) */
	int max_p ;		/* maximum value for p_b (scaled) */

	u_int c_1 ;		/* max_p/(max_th-min_th) (scaled) */
	u_int c_2 ;		/* max_p*min_th/(max_th-min_th) (scaled) */
	u_int c_3 ;		/* for GRED, (1-max_p)/max_th (scaled) */
	u_int c_4 ;		/* for GRED, 1 - 2*max_p (scaled) */
	u_int * w_q_lookup ;	/* lookup table for computing (1-w_q)^t */
	u_int lookup_depth ;	/* depth of lookup table */
	int lookup_step ;	/* granularity inside the lookup table */
	int lookup_weight ;	/* equal to (1-w_q)^t / (1-w_q)^(t+1) */
	int avg_pkt_size ;	/* medium packet size */
	int max_pkt_size ;	/* max packet size */
};

/*
 * A queue is created as a child of a flowset unless it belongs to
 * a !MULTIQUEUE scheduler. It is normally in a hash table in the
 * flowset. fs always points to the parent flowset.
 * si normally points to the sch_inst, unless the flowset has been
 * detached from the scheduler -- in this case si == NULL and we
 * should not enqueue.
 */
struct dn_queue {
	struct dn_flow ni;	/* oid, flow_id, stats */
	struct mq mq;	/* packets queue */
	struct dn_sch_inst *_si;	/* owner scheduler instance */
	SLIST_ENTRY(dn_queue) q_next; /* hash chain list for qht */
	struct dn_fsk *fs;		/* parent flowset. */

	/* RED parameters */
	int avg;		/* average queue length est. (scaled) */
	int count;		/* arrivals since last RED drop */
	int random;		/* random value (scaled) */
	uint64_t q_time;	/* start of queue idle time */

};

/*
 * The kernel side of a scheduler. Contains the userland config,
 * a link, pointer to extra config arguments from command line,
 * kernel flags, and a pointer to the scheduler methods.
 * It is stored in a hash table, and holds a list of all
 * flowsets and scheduler instances.
 * XXX sch must be at the beginning, see schk_hash().
 */
struct dn_schk {
	struct dn_sch sch;
	struct dn_alg *fp;	/* Pointer to scheduler functions */
	struct dn_link link;	/* The link, embedded */
	struct dn_profile *profile; /* delay profile, if any */
	struct dn_id *cfg;	/* extra config arguments */

	SLIST_ENTRY(dn_schk) schk_next;  /* hash chain for schedhash */

	struct dn_fsk_head fsk_list;  /* all fsk linked to me */
	struct dn_fsk *fs;	/* Flowset for !MULTIQUEUE */

	/* bucket index used by the drain routine to drain the scheduler
	 * instance for this flowset.
	 */
	int drain_bucket;

	/* Hash table of all instances (through sch.sched_mask)
	 * or single instance if no mask. Always valid.
	 */
	struct dn_ht	*siht;
};

/*
 * Scheduler instance.
 * Contains variables and all queues relative to this instance.
 * This struct is created at runtime.
 */
struct dn_sch_inst {
	struct dn_flow	ni;	/* oid, flowid and stats */
	SLIST_ENTRY(dn_sch_inst) si_next; /* hash chain for siht */
	struct delay_line dline;
	struct dn_schk *sched;	/* the template */
	int kflags;		/* DN_ACTIVE */

	int64_t	credit;		/* bits I can transmit (more or less). */
	uint64_t sched_time;	/* time link was scheduled in ready_heap */
	uint64_t idle_time;	/* start of scheduler instance idle time */

	/* q_count is the number of queues that this instance is using.
	 * The counter is incremented or decremented when
	 * a reference from the queue is created or deleted.
	 * It is used to make sure that a scheduler instance can be safely
	 * deleted by the drain routine. See notes below.
	 */
	int q_count;

};

/*
 * NOTE about object drain.
 * The system will automatically (XXX check when) drain queues and
 * scheduler instances when they are idle.
 * A queue is idle when it has no packets; an instance is idle when
 * it is not in the evheap heap, and the corresponding delay line is empty.
 * A queue can be safely deleted when it is idle because of the scheduler
 * function xxx_free_queue() will remove any references to it.
 * An instance can be only deleted when no queues reference it. To be sure
 * of that, a counter (q_count) stores the number of queues that are pointing
 * to the instance.
 *
 * XXX
 * Order of scan:
 * - take all flowset in a bucket for the flowset hash table
 * - take all queues in a bucket for the flowset
 * - increment the queue bucket
 * - scan next flowset bucket
 * Nothing is done if a bucket contains no entries.
 *
 * The same schema is used for scheduler instances
 */

/* kernel-side flags.
Linux has DN_DELETE in fcntl.h */ enum { /* 1 and 2 are reserved for the SCAN flags */ DN_DESTROY = 0x0004, /* destroy */ DN_DELETE_FS = 0x0008, /* destroy flowset */ DN_DETACH = 0x0010, DN_ACTIVE = 0x0020, /* object is in evheap */ DN_F_DLINE = 0x0040, /* object is a delay line */ DN_DEL_SAFE = 0x0080, /* delete a queue only if no longer needed * by scheduler */ DN_QHT_IS_Q = 0x0100, /* in flowset, qht is a single queue */ }; extern struct dn_parms dn_cfg; //VNET_DECLARE(struct dn_parms, _base_dn_cfg); //#define dn_cfg VNET(_base_dn_cfg) int dummynet_io(struct mbuf **, int , struct ip_fw_args *); void dummynet_task(void *context, int pending); void dn_reschedule(void); struct dn_queue *ipdn_q_find(struct dn_fsk *, struct dn_sch_inst *, struct ipfw_flow_id *); struct dn_sch_inst *ipdn_si_find(struct dn_schk *, struct ipfw_flow_id *); /* * copy_range is a template for requests for ranges of pipes/queues/scheds. * The number of ranges is variable and can be derived by o.len. * As a default, we use a small number of entries so that the struct * fits easily on the stack and is sufficient for most common requests. 
*/ #define DEFAULT_RANGES 5 struct copy_range { struct dn_id o; uint32_t r[ 2 * DEFAULT_RANGES ]; }; struct copy_args { char **start; char *end; int flags; int type; struct copy_range *extra; /* extra filtering */ }; struct sockopt; int ip_dummynet_compat(struct sockopt *sopt); int dummynet_get(struct sockopt *sopt, void **compat); int dn_c_copy_q (void *_ni, void *arg); int dn_c_copy_pipe(struct dn_schk *s, struct copy_args *a, int nq); int dn_c_copy_fs(struct dn_fsk *f, struct copy_args *a, int nq); int dn_compat_copy_queue(struct copy_args *a, void *_o); int dn_compat_copy_pipe(struct copy_args *a, void *_o); int copy_data_helper_compat(void *_o, void *_arg); int dn_compat_calc_size(void); int do_config(void *p, int l); /* function to drain idle object */ void dn_drain_scheduler(void); void dn_drain_queue(void); #endif /* _IP_DN_PRIVATE_H */ ipfw-user/sys/netinet/ipfw/ip_dummynet.c000644 000423 000000 00000165600 12007435564 021202 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1998-2002,2010 Luigi Rizzo, Universita` di Pisa * Portions Copyright (c) 2000 Akamba Corp. * All rights reserved * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD: head/sys/netinet/ipfw/ip_dummynet.c 238988 2012-08-02 12:45:13Z luigi $"); /* * Configuration and internal object management for dummynet. */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */ #include #include /* ip_output(), IP_FORWARDING */ #include #include #include #include #include #include /* which objects to copy */ #define DN_C_LINK 0x01 #define DN_C_SCH 0x02 #define DN_C_FLOW 0x04 #define DN_C_FS 0x08 #define DN_C_QUEUE 0x10 /* we use this argument in case of a schk_new */ struct schk_new_arg { struct dn_alg *fp; struct dn_sch *sch; }; /*---- callout hooks. ----*/ static struct callout dn_timeout; static struct task dn_task; static struct taskqueue *dn_tq = NULL; static void dummynet(void *arg) { (void)arg; /* UNUSED */ taskqueue_enqueue(dn_tq, &dn_task); } void dn_reschedule(void) { callout_reset(&dn_timeout, 1, dummynet, NULL); } /*----- end of callout hooks -----*/ /* Return a scheduler descriptor given the type or name. 
*/
static struct dn_alg *
find_sched_type(int type, char *name)
{
	struct dn_alg *d;

	/* an entry matches on type, or (when a name is given) on name */
	SLIST_FOREACH(d, &dn_cfg.schedlist, next) {
		if (d->type == type || (name && !strcasecmp(d->name, name)))
			return d;
	}
	return NULL; /* not found */
}

/*
 * Force *v into the range [lo, hi] and return the resulting value.
 * A value below lo is replaced by dflt (itself first clamped to
 * [lo, hi]); a value above hi is replaced by hi. When the value is
 * changed and msg is not NULL, a one-line notice is printed.
 */
int
ipdn_bound_var(int *v, int dflt, int lo, int hi, const char *msg)
{
	int oldv = *v;
	const char *op = NULL;

	if (dflt < lo)
		dflt = lo;
	if (dflt > hi)
		dflt = hi;
	if (oldv < lo) {
		*v = dflt;
		op = "Bump";
	} else if (oldv > hi) {
		*v = hi;
		op = "Clamp";
	} else
		return *v;	/* already in range, nothing to report */
	if (op && msg)
		printf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
	return *v;
}

/*---- flow_id mask, hash and compare functions ---*/
/*
 * The flow_id includes the 5-tuple, the queue/pipe number
 * which we store in the extra area in host order,
 * and for ipv6 also the flow_id6.
 * XXX see if we want the tos byte (can store in 'flags')
 */

/* AND the fields of 'id' with 'mask', in place; returns 'id'. */
static struct ipfw_flow_id *
flow_id_mask(struct ipfw_flow_id *mask, struct ipfw_flow_id *id)
{
	int is_v6 = IS_IP6_FLOW_ID(id);

	id->dst_port &= mask->dst_port;
	id->src_port &= mask->src_port;
	id->proto &= mask->proto;
	id->extra &= mask->extra;
	if (is_v6) {
		APPLY_MASK(&id->dst_ip6, &mask->dst_ip6);
		APPLY_MASK(&id->src_ip6, &mask->src_ip6);
		id->flow_id6 &= mask->flow_id6;
	} else {
		id->dst_ip &= mask->dst_ip;
		id->src_ip &= mask->src_ip;
	}
	return id;
}

/* computes an OR of two masks, result in dst and also returned */
static struct ipfw_flow_id *
flow_id_or(struct ipfw_flow_id *src, struct ipfw_flow_id *dst)
{
	int is_v6 = IS_IP6_FLOW_ID(dst);

	dst->dst_port |= src->dst_port;
	dst->src_port |= src->src_port;
	dst->proto |= src->proto;
	dst->extra |= src->extra;
	if (is_v6) {
		/* OR the four 32-bit words of an IPv6 address */
#define OR_MASK(_d, _s)                          \
    (_d)->__u6_addr.__u6_addr32[0] |= (_s)->__u6_addr.__u6_addr32[0]; \
    (_d)->__u6_addr.__u6_addr32[1] |= (_s)->__u6_addr.__u6_addr32[1]; \
    (_d)->__u6_addr.__u6_addr32[2] |= (_s)->__u6_addr.__u6_addr32[2]; \
    (_d)->__u6_addr.__u6_addr32[3] |= (_s)->__u6_addr.__u6_addr32[3];
		OR_MASK(&dst->dst_ip6, &src->dst_ip6);
		OR_MASK(&dst->src_ip6, &src->src_ip6);
#undef OR_MASK
		dst->flow_id6 |= src->flow_id6;
	} else {
		dst->dst_ip |= src->dst_ip;
		dst->src_ip |= src->src_ip;
	}
	return dst;
}

/* Returns non-zero if any field of the mask is set. */
static int
nonzero_mask(struct ipfw_flow_id *m)
{
	if (m->dst_port || m->src_port || m->proto || m->extra)
		return 1;
	if (IS_IP6_FLOW_ID(m)) {
		return
			m->dst_ip6.__u6_addr.__u6_addr32[0] ||
			m->dst_ip6.__u6_addr.__u6_addr32[1] ||
			m->dst_ip6.__u6_addr.__u6_addr32[2] ||
			m->dst_ip6.__u6_addr.__u6_addr32[3] ||
			m->src_ip6.__u6_addr.__u6_addr32[0] ||
			m->src_ip6.__u6_addr.__u6_addr32[1] ||
			m->src_ip6.__u6_addr.__u6_addr32[2] ||
			m->src_ip6.__u6_addr.__u6_addr32[3] ||
			m->flow_id6;
	} else {
		return m->dst_ip || m->src_ip;
	}
}

/* XXX we may want a better hash function */
static uint32_t
flow_id_hash(struct ipfw_flow_id *id)
{
	uint32_t i;

	if (IS_IP6_FLOW_ID(id)) {
		uint32_t *d = (uint32_t *)&id->dst_ip6;
		uint32_t *s = (uint32_t *)&id->src_ip6;
		i = (d[0]      ) ^ (d[1])       ^
		    (d[2]      ) ^ (d[3])       ^
		    (d[0] >> 15) ^ (d[1] >> 15) ^
		    (d[2] >> 15) ^ (d[3] >> 15) ^
		    (s[0] <<  1) ^ (s[1] <<  1) ^
		    (s[2] <<  1) ^ (s[3] <<  1) ^
		    (s[0] << 16) ^ (s[1] << 16) ^
		    (s[2] << 16) ^ (s[3] << 16) ^
		    (id->dst_port << 1) ^ (id->src_port) ^
		    (id->extra) ^
		    (id->proto ) ^ (id->flow_id6);
	} else {
		i = (id->dst_ip)        ^ (id->dst_ip >> 15) ^
		    (id->src_ip << 1)   ^ (id->src_ip >> 16) ^
		    (id->extra) ^
		    (id->dst_port << 1) ^ (id->src_port)     ^ (id->proto);
	}
	return i;
}

/* Like bcmp, returns 0 if ids match, 1 otherwise. */
static int
flow_id_cmp(struct ipfw_flow_id *id1, struct ipfw_flow_id *id2)
{
	int is_v6 = IS_IP6_FLOW_ID(id1);

	if (!is_v6) {
		if (IS_IP6_FLOW_ID(id2))
			return 1; /* different address families */

		return (id1->dst_ip == id2->dst_ip &&
		    id1->src_ip == id2->src_ip &&
		    id1->dst_port == id2->dst_port &&
		    id1->src_port == id2->src_port &&
		    id1->proto == id2->proto &&
		    id1->extra == id2->extra) ? 0 : 1;
	}
	/* the ipv6 case */
	return (
	    !bcmp(&id1->dst_ip6,&id2->dst_ip6, sizeof(id1->dst_ip6)) &&
	    !bcmp(&id1->src_ip6,&id2->src_ip6, sizeof(id1->src_ip6)) &&
	    id1->dst_port == id2->dst_port &&
	    id1->src_port == id2->src_port &&
	    id1->proto == id2->proto &&
	    id1->extra == id2->extra &&
	    id1->flow_id6 == id2->flow_id6) ? 0 : 1;
}
/*--------- end of flow-id mask, hash and compare ---------*/

/*--- support functions for the qht hashtable ----
 * Entries are hashed by flow-id
 */
static uint32_t
q_hash(uintptr_t key, int flags, void *arg)
{
	/* compute the hash slot from the flow id */
	struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ?
		&((struct dn_queue *)key)->ni.fid :
		(struct ipfw_flow_id *)key;

	return flow_id_hash(id);
}

/* Returns non-zero when the flow id of 'obj' equals the one in 'key'. */
static int
q_match(void *obj, uintptr_t key, int flags, void *arg)
{
	struct dn_queue *o = (struct dn_queue *)obj;
	struct ipfw_flow_id *id2;

	if (flags & DNHT_KEY_IS_OBJ) {
		/* key is a queue: compare against its flow id */
		id2 = &((struct dn_queue *)key)->ni.fid;
	} else {
		id2 = (struct ipfw_flow_id *)key;
	}
	return (0 == flow_id_cmp(&o->ni.fid,  id2));
}

/*
 * create a new queue instance for the given 'key'.
 * 'arg' is a template dn_queue from which only fs and _si are read.
 * Returns NULL if the allocation fails.
 */
static void *
q_new(uintptr_t key, int flags, void *arg)
{
	struct dn_queue *q, *template = arg;
	struct dn_fsk *fs = template->fs;
	/* the scheduler's private per-queue data follows the queue */
	int size = sizeof(*q) + fs->sched->fp->q_datalen;

	q = malloc(size, M_DUMMYNET, M_NOWAIT | M_ZERO);
	if (q == NULL) {
		D("no memory for new queue");
		return NULL;
	}

	set_oid(&q->ni.oid, DN_QUEUE, size);
	if (fs->fs.flags & DN_QHT_HASH)
		q->ni.fid = *(struct ipfw_flow_id *)key;
	q->fs = fs;
	q->_si = template->_si;
	q->_si->q_count++;	/* the instance now has one more user */

	if (fs->sched->fp->new_queue)
		fs->sched->fp->new_queue(q);
	dn_cfg.queue_count++;
	return q;
}

/*
 * Notify schedulers that a queue is going away.
 * If (flags & DN_DESTROY), also free the packets.
 * The version for callbacks is called q_delete_cb().
*/ static void dn_delete_queue(struct dn_queue *q, int flags) { struct dn_fsk *fs = q->fs; // D("fs %p si %p\n", fs, q->_si); /* notify the parent scheduler that the queue is going away */ if (fs && fs->sched->fp->free_queue) fs->sched->fp->free_queue(q); q->_si->q_count--; q->_si = NULL; if (flags & DN_DESTROY) { if (q->mq.head) dn_free_pkts(q->mq.head); bzero(q, sizeof(*q)); // safety free(q, M_DUMMYNET); dn_cfg.queue_count--; } } static int q_delete_cb(void *q, void *arg) { int flags = (int)(uintptr_t)arg; dn_delete_queue(q, flags); return (flags & DN_DESTROY) ? DNHT_SCAN_DEL : 0; } /* * calls dn_delete_queue/q_delete_cb on all queues, * which notifies the parent scheduler and possibly drains packets. * flags & DN_DESTROY: drains queues and destroy qht; */ static void qht_delete(struct dn_fsk *fs, int flags) { ND("fs %d start flags %d qht %p", fs->fs.fs_nr, flags, fs->qht); if (!fs->qht) return; if (fs->fs.flags & DN_QHT_HASH) { dn_ht_scan(fs->qht, q_delete_cb, (void *)(uintptr_t)flags); if (flags & DN_DESTROY) { dn_ht_free(fs->qht, 0); fs->qht = NULL; } } else { dn_delete_queue((struct dn_queue *)(fs->qht), flags); if (flags & DN_DESTROY) fs->qht = NULL; } } /* * Find and possibly create the queue for a MULTIQUEUE scheduler. * We never call it for !MULTIQUEUE (the queue is in the sch_inst). 
*/ struct dn_queue * ipdn_q_find(struct dn_fsk *fs, struct dn_sch_inst *si, struct ipfw_flow_id *id) { struct dn_queue template; template._si = si; template.fs = fs; if (fs->fs.flags & DN_QHT_HASH) { struct ipfw_flow_id masked_id; if (fs->qht == NULL) { fs->qht = dn_ht_init(NULL, fs->fs.buckets, offsetof(struct dn_queue, q_next), q_hash, q_match, q_new); if (fs->qht == NULL) return NULL; } masked_id = *id; flow_id_mask(&fs->fsk_mask, &masked_id); return dn_ht_find(fs->qht, (uintptr_t)&masked_id, DNHT_INSERT, &template); } else { if (fs->qht == NULL) fs->qht = q_new(0, 0, &template); return (struct dn_queue *)fs->qht; } } /*--- end of queue hash table ---*/ /*--- support functions for the sch_inst hashtable ---- * * These are hashed by flow-id */ static uint32_t si_hash(uintptr_t key, int flags, void *arg) { /* compute the hash slot from the flow id */ struct ipfw_flow_id *id = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_sch_inst *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_hash(id); } static int si_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_sch_inst *o = obj; struct ipfw_flow_id *id2; id2 = (flags & DNHT_KEY_IS_OBJ) ? &((struct dn_sch_inst *)key)->ni.fid : (struct ipfw_flow_id *)key; return flow_id_cmp(&o->ni.fid, id2) == 0; } /* * create a new instance for the given 'key' * Allocate memory for instance, delay line and scheduler private data. */ static void * si_new(uintptr_t key, int flags, void *arg) { struct dn_schk *s = arg; struct dn_sch_inst *si; int l = sizeof(*si) + s->fp->si_datalen; si = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); if (si == NULL) goto error; /* Set length only for the part passed up to userland. 
*/ set_oid(&si->ni.oid, DN_SCH_I, sizeof(struct dn_flow)); set_oid(&(si->dline.oid), DN_DELAY_LINE, sizeof(struct delay_line)); /* mark si and dline as outside the event queue */ si->ni.oid.id = si->dline.oid.id = -1; si->sched = s; si->dline.si = si; if (s->fp->new_sched && s->fp->new_sched(si)) { D("new_sched error"); goto error; } if (s->sch.flags & DN_HAVE_MASK) si->ni.fid = *(struct ipfw_flow_id *)key; dn_cfg.si_count++; return si; error: if (si) { bzero(si, sizeof(*si)); // safety free(si, M_DUMMYNET); } return NULL; } /* * Callback from siht to delete all scheduler instances. Remove * si and delay line from the system heap, destroy all queues. * We assume that all flowset have been notified and do not * point to us anymore. */ static int si_destroy(void *_si, void *arg) { struct dn_sch_inst *si = _si; struct dn_schk *s = si->sched; struct delay_line *dl = &si->dline; if (dl->oid.subtype) /* remove delay line from event heap */ heap_extract(&dn_cfg.evheap, dl); dn_free_pkts(dl->mq.head); /* drain delay line */ if (si->kflags & DN_ACTIVE) /* remove si from event heap */ heap_extract(&dn_cfg.evheap, si); if (s->fp->free_sched) s->fp->free_sched(si); bzero(si, sizeof(*si)); /* safety */ free(si, M_DUMMYNET); dn_cfg.si_count--; return DNHT_SCAN_DEL; } /* * Find the scheduler instance for this packet. If we need to apply * a mask, do on a local copy of the flow_id to preserve the original. * Assume siht is always initialized if we have a mask. 
*/ struct dn_sch_inst * ipdn_si_find(struct dn_schk *s, struct ipfw_flow_id *id) { if (s->sch.flags & DN_HAVE_MASK) { struct ipfw_flow_id id_t = *id; flow_id_mask(&s->sch.sched_mask, &id_t); return dn_ht_find(s->siht, (uintptr_t)&id_t, DNHT_INSERT, s); } if (!s->siht) s->siht = si_new(0, 0, s); return (struct dn_sch_inst *)s->siht; } /* callback to flush credit for the scheduler instance */ static int si_reset_credit(void *_si, void *arg) { struct dn_sch_inst *si = _si; struct dn_link *p = &si->sched->link; si->credit = p->burst + (dn_cfg.io_fast ? p->bandwidth : 0); return 0; } static void schk_reset_credit(struct dn_schk *s) { if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, si_reset_credit, NULL); else if (s->siht) si_reset_credit(s->siht, NULL); } /*---- end of sch_inst hashtable ---------------------*/ /*------------------------------------------------------- * flowset hash (fshash) support. Entries are hashed by fs_nr. * New allocations are put in the fsunlinked list, from which * they are removed when they point to a specific scheduler. */ static uint32_t fsk_hash(uintptr_t key, int flags, void *arg) { uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_fsk *)key)->fs.fs_nr; return ( (i>>8)^(i>>4)^i ); } static int fsk_match(void *obj, uintptr_t key, int flags, void *arg) { struct dn_fsk *fs = obj; int i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_fsk *)key)->fs.fs_nr; return (fs->fs.fs_nr == i); } static void * fsk_new(uintptr_t key, int flags, void *arg) { struct dn_fsk *fs; fs = malloc(sizeof(*fs), M_DUMMYNET, M_NOWAIT | M_ZERO); if (fs) { set_oid(&fs->fs.oid, DN_FS, sizeof(fs->fs)); dn_cfg.fsk_count++; fs->drain_bucket = 0; SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); } return fs; } /* * detach flowset from its current scheduler. Flags as follows: * DN_DETACH removes from the fsk_list * DN_DESTROY deletes individual queues * DN_DELETE_FS destroys the flowset (otherwise goes in unlinked). 
*/ static void fsk_detach(struct dn_fsk *fs, int flags) { if (flags & DN_DELETE_FS) flags |= DN_DESTROY; ND("fs %d from sched %d flags %s %s %s", fs->fs.fs_nr, fs->fs.sched_nr, (flags & DN_DELETE_FS) ? "DEL_FS":"", (flags & DN_DESTROY) ? "DEL":"", (flags & DN_DETACH) ? "DET":""); if (flags & DN_DETACH) { /* detach from the list */ struct dn_fsk_head *h; h = fs->sched ? &fs->sched->fsk_list : &dn_cfg.fsu; SLIST_REMOVE(h, fs, dn_fsk, sch_chain); } /* Free the RED parameters, they will be recomputed on * subsequent attach if needed. */ if (fs->w_q_lookup) free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; qht_delete(fs, flags); if (fs->sched && fs->sched->fp->free_fsk) fs->sched->fp->free_fsk(fs); fs->sched = NULL; if (flags & DN_DELETE_FS) { bzero(fs, sizeof(fs)); /* safety */ free(fs, M_DUMMYNET); dn_cfg.fsk_count--; } else { SLIST_INSERT_HEAD(&dn_cfg.fsu, fs, sch_chain); } } /* * Detach or destroy all flowsets in a list. * flags specifies what to do: * DN_DESTROY: flush all queues * DN_DELETE_FS: DN_DESTROY + destroy flowset * DN_DELETE_FS implies DN_DESTROY */ static void fsk_detach_list(struct dn_fsk_head *h, int flags) { struct dn_fsk *fs; int n = 0; /* only for stats */ ND("head %p flags %x", h, flags); while ((fs = SLIST_FIRST(h))) { SLIST_REMOVE_HEAD(h, sch_chain); n++; fsk_detach(fs, flags); } ND("done %d flowsets", n); } /* * called on 'queue X delete' -- removes the flowset from fshash, * deletes all queues for the flowset, and removes the flowset. */ static int delete_fs(int i, int locked) { struct dn_fsk *fs; int err = 0; if (!locked) DN_BH_WLOCK(); fs = dn_ht_find(dn_cfg.fshash, i, DNHT_REMOVE, NULL); ND("fs %d found %p", i, fs); if (fs) { fsk_detach(fs, DN_DETACH | DN_DELETE_FS); err = 0; } else err = EINVAL; if (!locked) DN_BH_WUNLOCK(); return err; } /*----- end of flowset hashtable support -------------*/ /*------------------------------------------------------------ * Scheduler hash. 
When searching by index we pass sched_nr, * otherwise we pass struct dn_sch * which is the first field in * struct dn_schk so we can cast between the two. We use this trick * because in the create phase (but it should be fixed). */ static uint32_t schk_hash(uintptr_t key, int flags, void *_arg) { uint32_t i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_schk *)key)->sch.sched_nr; return ( (i>>8)^(i>>4)^i ); } static int schk_match(void *obj, uintptr_t key, int flags, void *_arg) { struct dn_schk *s = (struct dn_schk *)obj; int i = !(flags & DNHT_KEY_IS_OBJ) ? key : ((struct dn_schk *)key)->sch.sched_nr; return (s->sch.sched_nr == i); } /* * Create the entry and intialize with the sched hash if needed. * Leave s->fp unset so we can tell whether a dn_ht_find() returns * a new object or a previously existing one. */ static void * schk_new(uintptr_t key, int flags, void *arg) { struct schk_new_arg *a = arg; struct dn_schk *s; int l = sizeof(*s) +a->fp->schk_datalen; s = malloc(l, M_DUMMYNET, M_NOWAIT | M_ZERO); if (s == NULL) return NULL; set_oid(&s->link.oid, DN_LINK, sizeof(s->link)); s->sch = *a->sch; // copy initial values s->link.link_nr = s->sch.sched_nr; SLIST_INIT(&s->fsk_list); /* initialize the hash table or create the single instance */ s->fp = a->fp; /* si_new needs this */ s->drain_bucket = 0; if (s->sch.flags & DN_HAVE_MASK) { s->siht = dn_ht_init(NULL, s->sch.buckets, offsetof(struct dn_sch_inst, si_next), si_hash, si_match, si_new); if (s->siht == NULL) { free(s, M_DUMMYNET); return NULL; } } s->fp = NULL; /* mark as a new scheduler */ dn_cfg.schk_count++; return s; } /* * Callback for sched delete. Notify all attached flowsets to * detach from the scheduler, destroy the internal flowset, and * all instances. The scheduler goes away too. 
* arg is 0 (only detach flowsets and destroy instances) * DN_DESTROY (detach & delete queues, delete schk) * or DN_DELETE_FS (delete queues and flowsets, delete schk) */ static int schk_delete_cb(void *obj, void *arg) { struct dn_schk *s = obj; #if 0 int a = (int)arg; ND("sched %d arg %s%s", s->sch.sched_nr, a&DN_DESTROY ? "DEL ":"", a&DN_DELETE_FS ? "DEL_FS":""); #endif fsk_detach_list(&s->fsk_list, arg ? DN_DESTROY : 0); /* no more flowset pointing to us now */ if (s->sch.flags & DN_HAVE_MASK) { dn_ht_scan(s->siht, si_destroy, NULL); dn_ht_free(s->siht, 0); } else if (s->siht) si_destroy(s->siht, NULL); if (s->profile) { free(s->profile, M_DUMMYNET); s->profile = NULL; } s->siht = NULL; if (s->fp->destroy) s->fp->destroy(s); bzero(s, sizeof(*s)); // safety free(obj, M_DUMMYNET); dn_cfg.schk_count--; return DNHT_SCAN_DEL; } /* * called on a 'sched X delete' command. Deletes a single scheduler. * This is done by removing from the schedhash, unlinking all * flowsets and deleting their traffic. 
*/ static int delete_schk(int i) { struct dn_schk *s; s = dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); ND("%d %p", i, s); if (!s) return EINVAL; delete_fs(i + DN_MAX_ID, 1); /* first delete internal fs */ /* then detach flowsets, delete traffic */ schk_delete_cb(s, (void*)(uintptr_t)DN_DESTROY); return 0; } /*--- end of schk hashtable support ---*/ static int copy_obj(char **start, char *end, void *_o, const char *msg, int i) { struct dn_id *o = _o; int have = end - *start; if (have < o->len || o->len == 0 || o->type == 0) { D("(WARN) type %d %s %d have %d need %d", o->type, msg, i, have, o->len); return 1; } ND("type %d %s %d len %d", o->type, msg, i, o->len); bcopy(_o, *start, o->len); if (o->type == DN_LINK) { /* Adjust burst parameter for link */ struct dn_link *l = (struct dn_link *)*start; l->burst = div64(l->burst, 8 * hz); l->delay = l->delay * 1000 / hz; } else if (o->type == DN_SCH) { /* Set id->id to the number of instances */ struct dn_schk *s = _o; struct dn_id *id = (struct dn_id *)(*start); id->id = (s->sch.flags & DN_HAVE_MASK) ? dn_ht_entries(s->siht) : (s->siht ? 1 : 0); } *start += o->len; return 0; } /* Specific function to copy a queue. * Copies only the user-visible part of a queue (which is in * a struct dn_flow), and sets len accordingly. 
*/ static int copy_obj_q(char **start, char *end, void *_o, const char *msg, int i) { struct dn_id *o = _o; int have = end - *start; int len = sizeof(struct dn_flow); /* see above comment */ if (have < len || o->len == 0 || o->type != DN_QUEUE) { D("ERROR type %d %s %d have %d need %d", o->type, msg, i, have, len); return 1; } ND("type %d %s %d len %d", o->type, msg, i, len); bcopy(_o, *start, len); ((struct dn_id*)(*start))->len = len; *start += len; return 0; } static int copy_q_cb(void *obj, void *arg) { struct dn_queue *q = obj; struct copy_args *a = arg; struct dn_flow *ni = (struct dn_flow *)(*a->start); if (copy_obj_q(a->start, a->end, &q->ni, "queue", -1)) return DNHT_SCAN_END; ni->oid.type = DN_FLOW; /* override the DN_QUEUE */ ni->oid.id = si_hash((uintptr_t)&ni->fid, 0, NULL); return 0; } static int copy_q(struct copy_args *a, struct dn_fsk *fs, int flags) { if (!fs->qht) return 0; if (fs->fs.flags & DN_QHT_HASH) dn_ht_scan(fs->qht, copy_q_cb, a); else copy_q_cb(fs->qht, a); return 0; } /* * This routine only copies the initial part of a profile ? XXX */ static int copy_profile(struct copy_args *a, struct dn_profile *p) { int have = a->end - *a->start; /* XXX here we check for max length */ int profile_len = sizeof(struct dn_profile) - ED_MAX_SAMPLES_NO*sizeof(int); if (p == NULL) return 0; if (have < profile_len) { D("error have %d need %d", have, profile_len); return 1; } bcopy(p, *a->start, profile_len); ((struct dn_id *)(*a->start))->len = profile_len; *a->start += profile_len; return 0; } static int copy_flowset(struct copy_args *a, struct dn_fsk *fs, int flags) { struct dn_fs *ufs = (struct dn_fs *)(*a->start); if (!fs) return 0; ND("flowset %d", fs->fs.fs_nr); if (copy_obj(a->start, a->end, &fs->fs, "flowset", fs->fs.fs_nr)) return DNHT_SCAN_END; ufs->oid.id = (fs->fs.flags & DN_QHT_HASH) ? dn_ht_entries(fs->qht) : (fs->qht ? 
1 : 0); if (flags) { /* copy queues */ copy_q(a, fs, 0); } return 0; } static int copy_si_cb(void *obj, void *arg) { struct dn_sch_inst *si = obj; struct copy_args *a = arg; struct dn_flow *ni = (struct dn_flow *)(*a->start); if (copy_obj(a->start, a->end, &si->ni, "inst", si->sched->sch.sched_nr)) return DNHT_SCAN_END; ni->oid.type = DN_FLOW; /* override the DN_SCH_I */ ni->oid.id = si_hash((uintptr_t)si, DNHT_KEY_IS_OBJ, NULL); return 0; } static int copy_si(struct copy_args *a, struct dn_schk *s, int flags) { if (s->sch.flags & DN_HAVE_MASK) dn_ht_scan(s->siht, copy_si_cb, a); else if (s->siht) copy_si_cb(s->siht, a); return 0; } /* * compute a list of children of a scheduler and copy up */ static int copy_fsk_list(struct copy_args *a, struct dn_schk *s, int flags) { struct dn_fsk *fs; struct dn_id *o; uint32_t *p; int n = 0, space = sizeof(*o); SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { if (fs->fs.fs_nr < DN_MAX_ID) n++; } space += n * sizeof(uint32_t); DX(3, "sched %d has %d flowsets", s->sch.sched_nr, n); if (a->end - *(a->start) < space) return DNHT_SCAN_END; o = (struct dn_id *)(*(a->start)); o->len = space; *a->start += o->len; o->type = DN_TEXT; p = (uint32_t *)(o+1); SLIST_FOREACH(fs, &s->fsk_list, sch_chain) if (fs->fs.fs_nr < DN_MAX_ID) *p++ = fs->fs.fs_nr; return 0; } static int copy_data_helper(void *_o, void *_arg) { struct copy_args *a = _arg; uint32_t *r = a->extra->r; /* start of first range */ uint32_t *lim; /* first invalid pointer */ int n; lim = (uint32_t *)((char *)(a->extra) + a->extra->o.len); if (a->type == DN_LINK || a->type == DN_SCH) { /* pipe|sched show, we receive a dn_schk */ struct dn_schk *s = _o; n = s->sch.sched_nr; if (a->type == DN_SCH && n >= DN_MAX_ID) return 0; /* not a scheduler */ if (a->type == DN_LINK && n <= DN_MAX_ID) return 0; /* not a pipe */ /* see if the object is within one of our ranges */ for (;r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; /* Found a valid entry, copy and we are done */ if (a->flags 
& DN_C_LINK) { if (copy_obj(a->start, a->end, &s->link, "link", n)) return DNHT_SCAN_END; if (copy_profile(a, s->profile)) return DNHT_SCAN_END; if (copy_flowset(a, s->fs, 0)) return DNHT_SCAN_END; } if (a->flags & DN_C_SCH) { if (copy_obj(a->start, a->end, &s->sch, "sched", n)) return DNHT_SCAN_END; /* list all attached flowsets */ if (copy_fsk_list(a, s, 0)) return DNHT_SCAN_END; } if (a->flags & DN_C_FLOW) copy_si(a, s, 0); break; } } else if (a->type == DN_FS) { /* queue show, skip internal flowsets */ struct dn_fsk *fs = _o; n = fs->fs.fs_nr; if (n >= DN_MAX_ID) return 0; /* see if the object is within one of our ranges */ for (;r < lim; r += 2) { if (n < r[0] || n > r[1]) continue; if (copy_flowset(a, fs, 0)) return DNHT_SCAN_END; copy_q(a, fs, 0); break; /* we are done */ } } return 0; } static inline struct dn_schk * locate_scheduler(int i) { return dn_ht_find(dn_cfg.schedhash, i, 0, NULL); } /* * red parameters are in fixed point arithmetic. */ static int config_red(struct dn_fsk *fs) { int64_t s, idle, weight, w0; int t, i; fs->w_q = fs->fs.w_q; fs->max_p = fs->fs.max_p; ND("called"); /* Doing stuff that was in userland */ i = fs->sched->link.bandwidth; s = (i <= 0) ? 
0 : hz * dn_cfg.red_avg_pkt_size * 8 * SCALE(1) / i; idle = div64((s * 3) , fs->w_q); /* s, fs->w_q scaled; idle not scaled */ fs->lookup_step = div64(idle , dn_cfg.red_lookup_depth); /* fs->lookup_step not scaled, */ if (!fs->lookup_step) fs->lookup_step = 1; w0 = weight = SCALE(1) - fs->w_q; //fs->w_q scaled for (t = fs->lookup_step; t > 1; --t) weight = SCALE_MUL(weight, w0); fs->lookup_weight = (int)(weight); // scaled /* Now doing stuff that was in kerneland */ fs->min_th = SCALE(fs->fs.min_th); fs->max_th = SCALE(fs->fs.max_th); fs->c_1 = fs->max_p / (fs->fs.max_th - fs->fs.min_th); fs->c_2 = SCALE_MUL(fs->c_1, SCALE(fs->fs.min_th)); if (fs->fs.flags & DN_IS_GENTLE_RED) { fs->c_3 = (SCALE(1) - fs->max_p) / fs->fs.max_th; fs->c_4 = SCALE(1) - 2 * fs->max_p; } /* If the lookup table already exist, free and create it again. */ if (fs->w_q_lookup) { free(fs->w_q_lookup, M_DUMMYNET); fs->w_q_lookup = NULL; } if (dn_cfg.red_lookup_depth == 0) { printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth" "must be > 0\n"); fs->fs.flags &= ~DN_IS_RED; fs->fs.flags &= ~DN_IS_GENTLE_RED; return (EINVAL); } fs->lookup_depth = dn_cfg.red_lookup_depth; fs->w_q_lookup = (u_int *)malloc(fs->lookup_depth * sizeof(int), M_DUMMYNET, M_NOWAIT); if (fs->w_q_lookup == NULL) { printf("dummynet: sorry, cannot allocate red lookup table\n"); fs->fs.flags &= ~DN_IS_RED; fs->fs.flags &= ~DN_IS_GENTLE_RED; return(ENOSPC); } /* Fill the lookup table with (1 - w_q)^x */ fs->w_q_lookup[0] = SCALE(1) - fs->w_q; for (i = 1; i < fs->lookup_depth; i++) fs->w_q_lookup[i] = SCALE_MUL(fs->w_q_lookup[i - 1], fs->lookup_weight); if (dn_cfg.red_avg_pkt_size < 1) dn_cfg.red_avg_pkt_size = 512; fs->avg_pkt_size = dn_cfg.red_avg_pkt_size; if (dn_cfg.red_max_pkt_size < 1) dn_cfg.red_max_pkt_size = 1500; fs->max_pkt_size = dn_cfg.red_max_pkt_size; ND("exit"); return 0; } /* Scan all flowset attached to this scheduler and update red */ static void update_red(struct dn_schk *s) { struct dn_fsk *fs; 
SLIST_FOREACH(fs, &s->fsk_list, sch_chain) { if (fs && (fs->fs.flags & DN_IS_RED)) config_red(fs); } } /* attach flowset to scheduler s, possibly requeue */ static void fsk_attach(struct dn_fsk *fs, struct dn_schk *s) { ND("remove fs %d from fsunlinked, link to sched %d", fs->fs.fs_nr, s->sch.sched_nr); SLIST_REMOVE(&dn_cfg.fsu, fs, dn_fsk, sch_chain); fs->sched = s; SLIST_INSERT_HEAD(&s->fsk_list, fs, sch_chain); if (s->fp->new_fsk) s->fp->new_fsk(fs); /* XXX compute fsk_mask */ fs->fsk_mask = fs->fs.flow_mask; if (fs->sched->sch.flags & DN_HAVE_MASK) flow_id_or(&fs->sched->sch.sched_mask, &fs->fsk_mask); if (fs->qht) { /* * we must drain qht according to the old * type, and reinsert according to the new one. * The requeue is complex -- in general we need to * reclassify every single packet. * For the time being, let's hope qht is never set * when we reach this point. */ D("XXX TODO requeue from fs %d to sch %d", fs->fs.fs_nr, s->sch.sched_nr); fs->qht = NULL; } /* set the new type for qht */ if (nonzero_mask(&fs->fsk_mask)) fs->fs.flags |= DN_QHT_HASH; else fs->fs.flags &= ~DN_QHT_HASH; /* XXX config_red() can fail... */ if (fs->fs.flags & DN_IS_RED) config_red(fs); } /* update all flowsets which may refer to this scheduler */ static void update_fs(struct dn_schk *s) { struct dn_fsk *fs, *tmp; SLIST_FOREACH_SAFE(fs, &dn_cfg.fsu, sch_chain, tmp) { if (s->sch.sched_nr != fs->fs.sched_nr) { D("fs %d for sch %d not %d still unlinked", fs->fs.fs_nr, fs->fs.sched_nr, s->sch.sched_nr); continue; } fsk_attach(fs, s); } } /* * Configuration -- to preserve backward compatibility we use * the following scheme (N is 65536) * NUMBER SCHED LINK FLOWSET * 1 .. N-1 (1)WFQ (2)WFQ (3)queue * N+1 .. 2N-1 (4)FIFO (5)FIFO (6)FIFO for sched 1..N-1 * 2N+1 .. 
3N-1 -- -- (7)FIFO for sched N+1..2N-1 * * "pipe i config" configures #1, #2 and #3 * "sched i config" configures #1 and possibly #6 * "queue i config" configures #3 * #1 is configured with 'pipe i config' or 'sched i config' * #2 is configured with 'pipe i config', and created if not * existing with 'sched i config' * #3 is configured with 'queue i config' * #4 is automatically configured after #1, can only be FIFO * #5 is automatically configured after #2 * #6 is automatically created when #1 is !MULTIQUEUE, * and can be updated. * #7 is automatically configured after #2 */ /* * configure a link (and its FIFO instance) */ static int config_link(struct dn_link *p, struct dn_id *arg) { int i; if (p->oid.len != sizeof(*p)) { D("invalid pipe len %d", p->oid.len); return EINVAL; } i = p->link_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* * The config program passes parameters as follows: * bw = bits/second (0 means no limits), * delay = ms, must be translated into ticks. * qsize = slots/bytes * burst ??? */ p->delay = (p->delay * hz) / 1000; /* Scale burst size: bytes -> bits * hz */ p->burst *= 8 * hz; DN_BH_WLOCK(); /* do it twice, base link and FIFO link */ for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { struct dn_schk *s = locate_scheduler(i); if (s == NULL) { DN_BH_WUNLOCK(); D("sched %d not found", i); return EINVAL; } /* remove profile if exists */ if (s->profile) { free(s->profile, M_DUMMYNET); s->profile = NULL; } /* copy all parameters */ s->link.oid = p->oid; s->link.link_nr = i; s->link.delay = p->delay; if (s->link.bandwidth != p->bandwidth) { /* XXX bandwidth changes, need to update red params */ s->link.bandwidth = p->bandwidth; update_red(s); } s->link.burst = p->burst; schk_reset_credit(s); } dn_cfg.id++; DN_BH_WUNLOCK(); return 0; } /* * configure a flowset. 
Can be called from inside with locked=1, */ static struct dn_fsk * config_fs(struct dn_fs *nfs, struct dn_id *arg, int locked) { int i; struct dn_fsk *fs; if (nfs->oid.len != sizeof(*nfs)) { D("invalid flowset len %d", nfs->oid.len); return NULL; } i = nfs->fs_nr; if (i <= 0 || i >= 3*DN_MAX_ID) return NULL; ND("flowset %d", i); /* XXX other sanity checks */ if (nfs->flags & DN_QSIZE_BYTES) { ipdn_bound_var(&nfs->qsize, 16384, 1500, dn_cfg.byte_limit, NULL); // "queue byte size"); } else { ipdn_bound_var(&nfs->qsize, 50, 1, dn_cfg.slot_limit, NULL); // "queue slot size"); } if (nfs->flags & DN_HAVE_MASK) { /* make sure we have some buckets */ ipdn_bound_var((int *)&nfs->buckets, dn_cfg.hash_size, 1, dn_cfg.max_hash_size, "flowset buckets"); } else { nfs->buckets = 1; /* we only need 1 */ } if (!locked) DN_BH_WLOCK(); do { /* exit with break when done */ struct dn_schk *s; int flags = nfs->sched_nr ? DNHT_INSERT : 0; int j; int oldc = dn_cfg.fsk_count; fs = dn_ht_find(dn_cfg.fshash, i, flags, NULL); if (fs == NULL) { D("missing sched for flowset %d", i); break; } /* grab some defaults from the existing one */ if (nfs->sched_nr == 0) /* reuse */ nfs->sched_nr = fs->fs.sched_nr; for (j = 0; j < sizeof(nfs->par)/sizeof(nfs->par[0]); j++) { if (nfs->par[j] == -1) /* reuse */ nfs->par[j] = fs->fs.par[j]; } if (bcmp(&fs->fs, nfs, sizeof(*nfs)) == 0) { ND("flowset %d unchanged", i); break; /* no change, nothing to do */ } if (oldc != dn_cfg.fsk_count) /* new item */ dn_cfg.id++; s = locate_scheduler(nfs->sched_nr); /* detach from old scheduler if needed, preserving * queues if we need to reattach. Then update the * configuration, and possibly attach to the new sched. */ DX(2, "fs %d changed sched %d@%p to %d@%p", fs->fs.fs_nr, fs->fs.sched_nr, fs->sched, nfs->sched_nr, s); if (fs->sched) { int flags = s ? 
DN_DETACH : (DN_DETACH | DN_DESTROY); flags |= DN_DESTROY; /* XXX temporary */ fsk_detach(fs, flags); } fs->fs = *nfs; /* copy configuration */ if (s != NULL) fsk_attach(fs, s); } while (0); if (!locked) DN_BH_WUNLOCK(); return fs; } /* * config/reconfig a scheduler and its FIFO variant. * For !MULTIQUEUE schedulers, also set up the flowset. * * On reconfigurations (detected because s->fp is set), * detach existing flowsets preserving traffic, preserve link, * and delete the old scheduler creating a new one. */ static int config_sched(struct dn_sch *_nsch, struct dn_id *arg) { struct dn_schk *s; struct schk_new_arg a; /* argument for schk_new */ int i; struct dn_link p; /* copy of oldlink */ struct dn_profile *pf = NULL; /* copy of old link profile */ /* Used to preserv mask parameter */ struct ipfw_flow_id new_mask; int new_buckets = 0; int new_flags = 0; int pipe_cmd; int err = ENOMEM; a.sch = _nsch; if (a.sch->oid.len != sizeof(*a.sch)) { D("bad sched len %d", a.sch->oid.len); return EINVAL; } i = a.sch->sched_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* make sure we have some buckets */ if (a.sch->flags & DN_HAVE_MASK) ipdn_bound_var((int *)&a.sch->buckets, dn_cfg.hash_size, 1, dn_cfg.max_hash_size, "sched buckets"); /* XXX other sanity checks */ bzero(&p, sizeof(p)); pipe_cmd = a.sch->flags & DN_PIPE_CMD; a.sch->flags &= ~DN_PIPE_CMD; //XXX do it even if is not set? if (pipe_cmd) { /* Copy mask parameter */ new_mask = a.sch->sched_mask; new_buckets = a.sch->buckets; new_flags = a.sch->flags; } DN_BH_WLOCK(); again: /* run twice, for wfq and fifo */ /* * lookup the type. If not supplied, use the previous one * or default to WF2Q+. Otherwise, return an error. */ dn_cfg.id++; a.fp = find_sched_type(a.sch->oid.subtype, a.sch->name); if (a.fp != NULL) { /* found. Lookup or create entry */ s = dn_ht_find(dn_cfg.schedhash, i, DNHT_INSERT, &a); } else if (a.sch->oid.subtype == 0 && !a.sch->name[0]) { /* No type. 
search existing s* or retry with WF2Q+ */ s = dn_ht_find(dn_cfg.schedhash, i, 0, &a); if (s != NULL) { a.fp = s->fp; /* Scheduler exists, skip to FIFO scheduler * if command was pipe config... */ if (pipe_cmd) goto next; } else { /* New scheduler, create a wf2q+ with no mask * if command was pipe config... */ if (pipe_cmd) { /* clear mask parameter */ bzero(&a.sch->sched_mask, sizeof(new_mask)); a.sch->buckets = 0; a.sch->flags &= ~DN_HAVE_MASK; } a.sch->oid.subtype = DN_SCHED_WF2QP; goto again; } } else { D("invalid scheduler type %d %s", a.sch->oid.subtype, a.sch->name); err = EINVAL; goto error; } /* normalize name and subtype */ a.sch->oid.subtype = a.fp->type; bzero(a.sch->name, sizeof(a.sch->name)); strlcpy(a.sch->name, a.fp->name, sizeof(a.sch->name)); if (s == NULL) { D("cannot allocate scheduler %d", i); goto error; } /* restore existing link if any */ if (p.link_nr) { s->link = p; if (!pf || pf->link_nr != p.link_nr) { /* no saved value */ s->profile = NULL; /* XXX maybe not needed */ } else { s->profile = malloc(sizeof(struct dn_profile), M_DUMMYNET, M_NOWAIT | M_ZERO); if (s->profile == NULL) { D("cannot allocate profile"); goto error; //XXX } bcopy(pf, s->profile, sizeof(*pf)); } } p.link_nr = 0; if (s->fp == NULL) { DX(2, "sched %d new type %s", i, a.fp->name); } else if (s->fp != a.fp || bcmp(a.sch, &s->sch, sizeof(*a.sch)) ) { /* already existing. */ DX(2, "sched %d type changed from %s to %s", i, s->fp->name, a.fp->name); DX(4, " type/sub %d/%d -> %d/%d", s->sch.oid.type, s->sch.oid.subtype, a.sch->oid.type, a.sch->oid.subtype); if (s->link.link_nr == 0) D("XXX WARNING link 0 for sched %d", i); p = s->link; /* preserve link */ if (s->profile) {/* preserve profile */ if (!pf) pf = malloc(sizeof(*pf), M_DUMMYNET, M_NOWAIT | M_ZERO); if (pf) /* XXX should issue a warning otherwise */ bcopy(s->profile, pf, sizeof(*pf)); } /* remove from the hash */ dn_ht_find(dn_cfg.schedhash, i, DNHT_REMOVE, NULL); /* Detach flowsets, preserve queues. 
*/ // schk_delete_cb(s, NULL); // XXX temporarily, kill queues schk_delete_cb(s, (void *)DN_DESTROY); goto again; } else { DX(4, "sched %d unchanged type %s", i, a.fp->name); } /* complete initialization */ s->sch = *a.sch; s->fp = a.fp; s->cfg = arg; // XXX schk_reset_credit(s); /* create the internal flowset if needed, * trying to reuse existing ones if available */ if (!(s->fp->flags & DN_MULTIQUEUE) && !s->fs) { s->fs = dn_ht_find(dn_cfg.fshash, i, 0, NULL); if (!s->fs) { struct dn_fs fs; bzero(&fs, sizeof(fs)); set_oid(&fs.oid, DN_FS, sizeof(fs)); fs.fs_nr = i + DN_MAX_ID; fs.sched_nr = i; s->fs = config_fs(&fs, NULL, 1 /* locked */); } if (!s->fs) { schk_delete_cb(s, (void *)DN_DESTROY); D("error creating internal fs for %d", i); goto error; } } /* call init function after the flowset is created */ if (s->fp->config) s->fp->config(s); update_fs(s); next: if (i < DN_MAX_ID) { /* now configure the FIFO instance */ i += DN_MAX_ID; if (pipe_cmd) { /* Restore mask parameter for FIFO */ a.sch->sched_mask = new_mask; a.sch->buckets = new_buckets; a.sch->flags = new_flags; } else { /* sched config shouldn't modify the FIFO scheduler */ if (dn_ht_find(dn_cfg.schedhash, i, 0, &a) != NULL) { /* FIFO already exist, don't touch it */ err = 0; /* and this is not an error */ goto error; } } a.sch->sched_nr = i; a.sch->oid.subtype = DN_SCHED_FIFO; bzero(a.sch->name, sizeof(a.sch->name)); goto again; } err = 0; error: DN_BH_WUNLOCK(); if (pf) free(pf, M_DUMMYNET); return err; } /* * attach a profile to a link */ static int config_profile(struct dn_profile *pf, struct dn_id *arg) { struct dn_schk *s; int i, olen, err = 0; if (pf->oid.len < sizeof(*pf)) { D("short profile len %d", pf->oid.len); return EINVAL; } i = pf->link_nr; if (i <= 0 || i >= DN_MAX_ID) return EINVAL; /* XXX other sanity checks */ DN_BH_WLOCK(); for (; i < 2*DN_MAX_ID; i += DN_MAX_ID) { s = locate_scheduler(i); if (s == NULL) { err = EINVAL; break; } dn_cfg.id++; /* * If we had a profile and the new one 
does not fit, * or it is deleted, then we need to free memory. */ if (s->profile && (pf->samples_no == 0 || s->profile->oid.len < pf->oid.len)) { free(s->profile, M_DUMMYNET); s->profile = NULL; } if (pf->samples_no == 0) continue; /* * new profile, possibly allocate memory * and copy data. */ if (s->profile == NULL) s->profile = malloc(pf->oid.len, M_DUMMYNET, M_NOWAIT | M_ZERO); if (s->profile == NULL) { D("no memory for profile %d", i); err = ENOMEM; break; } /* preserve larger length XXX double check */ olen = s->profile->oid.len; if (olen < pf->oid.len) olen = pf->oid.len; bcopy(pf, s->profile, pf->oid.len); s->profile->oid.len = olen; } DN_BH_WUNLOCK(); return err; } /* * Delete all objects: */ static void dummynet_flush(void) { /* delete all schedulers and related links/queues/flowsets */ dn_ht_scan(dn_cfg.schedhash, schk_delete_cb, (void *)(uintptr_t)DN_DELETE_FS); /* delete all remaining (unlinked) flowsets */ DX(4, "still %d unlinked fs", dn_cfg.fsk_count); dn_ht_free(dn_cfg.fshash, DNHT_REMOVE); fsk_detach_list(&dn_cfg.fsu, DN_DELETE_FS); /* Reinitialize system heap... */ heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); } /* * Main handler for configuration. We are guaranteed to be called * with an oid which is at least a dn_id. * - the first object is the command (config, delete, flush, ...) * - config_link must be issued after the corresponding config_sched * - parameters (DN_TXT) for an object must preceed the object * processed on a config_sched. 
*/ int do_config(void *p, int l) { struct dn_id *next, *o; int err = 0, err2 = 0; struct dn_id *arg = NULL; uintptr_t *a; o = p; if (o->id != DN_API_VERSION) { D("invalid api version got %d need %d", o->id, DN_API_VERSION); return EINVAL; } for (; l >= sizeof(*o); o = next) { struct dn_id *prev = arg; if (o->len < sizeof(*o) || l < o->len) { D("bad len o->len %d len %d", o->len, l); err = EINVAL; break; } l -= o->len; next = (struct dn_id *)((char *)o + o->len); err = 0; switch (o->type) { default: D("cmd %d not implemented", o->type); break; #ifdef EMULATE_SYSCTL /* sysctl emulation. * if we recognize the command, jump to the correct * handler and return */ case DN_SYSCTL_SET: err = kesysctl_emu_set(p, l); return err; #endif case DN_CMD_CONFIG: /* simply a header */ break; case DN_CMD_DELETE: /* the argument is in the first uintptr_t after o */ a = (uintptr_t *)(o+1); if (o->len < sizeof(*o) + sizeof(*a)) { err = EINVAL; break; } switch (o->subtype) { case DN_LINK: /* delete base and derived schedulers */ DN_BH_WLOCK(); err = delete_schk(*a); err2 = delete_schk(*a + DN_MAX_ID); DN_BH_WUNLOCK(); if (!err) err = err2; break; default: D("invalid delete type %d", o->subtype); err = EINVAL; break; case DN_FS: err = (*a <1 || *a >= DN_MAX_ID) ? 
EINVAL : delete_fs(*a, 0) ; break; } break; case DN_CMD_FLUSH: DN_BH_WLOCK(); dummynet_flush(); DN_BH_WUNLOCK(); break; case DN_TEXT: /* store argument the next block */ prev = NULL; arg = o; break; case DN_LINK: err = config_link((struct dn_link *)o, arg); break; case DN_PROFILE: err = config_profile((struct dn_profile *)o, arg); break; case DN_SCH: err = config_sched((struct dn_sch *)o, arg); break; case DN_FS: err = (NULL==config_fs((struct dn_fs *)o, arg, 0)); break; } if (prev) arg = NULL; if (err != 0) break; } return err; } static int compute_space(struct dn_id *cmd, struct copy_args *a) { int x = 0, need = 0; int profile_size = sizeof(struct dn_profile) - ED_MAX_SAMPLES_NO*sizeof(int); /* NOTE about compute space: * NP = dn_cfg.schk_count * NSI = dn_cfg.si_count * NF = dn_cfg.fsk_count * NQ = dn_cfg.queue_count * - ipfw pipe show * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler * link, scheduler template, flowset * integrated in scheduler and header * for flowset list * (NSI)*(dn_flow) all scheduler instance (includes * the queue instance) * - ipfw sched show * (NP/2)*(dn_link + dn_sch + dn_id + dn_fs) only half scheduler * link, scheduler template, flowset * integrated in scheduler and header * for flowset list * (NSI * dn_flow) all scheduler instances * (NF * sizeof(uint_32)) space for flowset list linked to scheduler * (NQ * dn_queue) all queue [XXXfor now not listed] * - ipfw queue show * (NF * dn_fs) all flowset * (NQ * dn_queue) all queues */ switch (cmd->subtype) { default: return -1; /* XXX where do LINK and SCH differ ? */ /* 'ipfw sched show' could list all queues associated to * a scheduler. 
This feature for now is disabled */ case DN_LINK: /* pipe show */ x = DN_C_LINK | DN_C_SCH | DN_C_FLOW; need += dn_cfg.schk_count * (sizeof(struct dn_fs) + profile_size) / 2; need += dn_cfg.fsk_count * sizeof(uint32_t); break; case DN_SCH: /* sched show */ need += dn_cfg.schk_count * (sizeof(struct dn_fs) + profile_size) / 2; need += dn_cfg.fsk_count * sizeof(uint32_t); x = DN_C_SCH | DN_C_LINK | DN_C_FLOW; break; case DN_FS: /* queue show */ x = DN_C_FS | DN_C_QUEUE; break; case DN_GET_COMPAT: /* compatibility mode */ need = dn_compat_calc_size(); break; } a->flags = x; if (x & DN_C_SCH) { need += dn_cfg.schk_count * sizeof(struct dn_sch) / 2; /* NOT also, each fs might be attached to a sched */ need += dn_cfg.schk_count * sizeof(struct dn_id) / 2; } if (x & DN_C_FS) need += dn_cfg.fsk_count * sizeof(struct dn_fs); if (x & DN_C_LINK) { need += dn_cfg.schk_count * sizeof(struct dn_link) / 2; } /* * When exporting a queue to userland, only pass up the * struct dn_flow, which is the only visible part. */ if (x & DN_C_QUEUE) need += dn_cfg.queue_count * sizeof(struct dn_flow); if (x & DN_C_FLOW) need += dn_cfg.si_count * (sizeof(struct dn_flow)); return need; } /* * If compat != NULL dummynet_get is called in compatibility mode. * *compat will be the pointer to the buffer to pass to ipfw */ int dummynet_get(struct sockopt *sopt, void **compat) { int have, i, need, error; char *start = NULL, *buf; size_t sopt_valsize; struct dn_id *cmd; struct copy_args a; struct copy_range r; int l = sizeof(struct dn_id); bzero(&a, sizeof(a)); bzero(&r, sizeof(r)); /* save and restore original sopt_valsize around copyin */ sopt_valsize = sopt->sopt_valsize; cmd = &r.o; if (!compat) { /* copy at least an oid, and possibly a full object */ error = sooptcopyin(sopt, cmd, sizeof(r), sizeof(*cmd)); sopt->sopt_valsize = sopt_valsize; if (error) goto done; l = cmd->len; #ifdef EMULATE_SYSCTL /* sysctl emulation. 
*/ if (cmd->type == DN_SYSCTL_GET) return kesysctl_emu_get(sopt); #endif if (l > sizeof(r)) { /* request larger than default, allocate buffer */ cmd = malloc(l, M_DUMMYNET, M_WAITOK); error = sooptcopyin(sopt, cmd, l, l); sopt->sopt_valsize = sopt_valsize; if (error) goto done; } } else { /* compatibility */ error = 0; cmd->type = DN_CMD_GET; cmd->len = sizeof(struct dn_id); cmd->subtype = DN_GET_COMPAT; // cmd->id = sopt_valsize; D("compatibility mode"); } a.extra = (struct copy_range *)cmd; if (cmd->len == sizeof(*cmd)) { /* no range, create a default */ uint32_t *rp = (uint32_t *)(cmd + 1); cmd->len += 2* sizeof(uint32_t); rp[0] = 1; rp[1] = DN_MAX_ID - 1; if (cmd->subtype == DN_LINK) { rp[0] += DN_MAX_ID; rp[1] += DN_MAX_ID; } } /* Count space (under lock) and allocate (outside lock). * Exit with lock held if we manage to get enough buffer. * Try a few times then give up. */ for (have = 0, i = 0; i < 10; i++) { DN_BH_WLOCK(); need = compute_space(cmd, &a); /* if there is a range, ignore value from compute_space() */ if (l > sizeof(*cmd)) need = sopt_valsize - sizeof(*cmd); if (need < 0) { DN_BH_WUNLOCK(); error = EINVAL; goto done; } need += sizeof(*cmd); cmd->id = need; if (have >= need) break; DN_BH_WUNLOCK(); if (start) free(start, M_DUMMYNET); start = NULL; if (need > sopt_valsize) break; have = need; start = malloc(have, M_DUMMYNET, M_WAITOK | M_ZERO); } if (start == NULL) { if (compat) { *compat = NULL; error = 1; // XXX } else { error = sooptcopyout(sopt, cmd, sizeof(*cmd)); } goto done; } ND("have %d:%d sched %d, %d:%d links %d, %d:%d flowsets %d, " "%d:%d si %d, %d:%d queues %d", dn_cfg.schk_count, sizeof(struct dn_sch), DN_SCH, dn_cfg.schk_count, sizeof(struct dn_link), DN_LINK, dn_cfg.fsk_count, sizeof(struct dn_fs), DN_FS, dn_cfg.si_count, sizeof(struct dn_flow), DN_SCH_I, dn_cfg.queue_count, sizeof(struct dn_queue), DN_QUEUE); sopt->sopt_valsize = sopt_valsize; a.type = cmd->subtype; if (compat == NULL) { bcopy(cmd, start, sizeof(*cmd)); ((struct 
dn_id*)(start))->len = sizeof(struct dn_id); buf = start + sizeof(*cmd); } else buf = start; a.start = &buf; a.end = start + have; /* start copying other objects */ if (compat) { a.type = DN_COMPAT_PIPE; dn_ht_scan(dn_cfg.schedhash, copy_data_helper_compat, &a); a.type = DN_COMPAT_QUEUE; dn_ht_scan(dn_cfg.fshash, copy_data_helper_compat, &a); } else if (a.type == DN_FS) { dn_ht_scan(dn_cfg.fshash, copy_data_helper, &a); } else { dn_ht_scan(dn_cfg.schedhash, copy_data_helper, &a); } DN_BH_WUNLOCK(); if (compat) { *compat = start; sopt->sopt_valsize = buf - start; /* free() is done by ip_dummynet_compat() */ start = NULL; //XXX hack } else { error = sooptcopyout(sopt, start, buf - start); } done: if (cmd && cmd != &r.o) free(cmd, M_DUMMYNET); if (start) free(start, M_DUMMYNET); return error; } /* Callback called on scheduler instance to delete it if idle */ static int drain_scheduler_cb(void *_si, void *arg) { struct dn_sch_inst *si = _si; if ((si->kflags & DN_ACTIVE) || si->dline.mq.head != NULL) return 0; if (si->sched->fp->flags & DN_MULTIQUEUE) { if (si->q_count == 0) return si_destroy(si, NULL); else return 0; } else { /* !DN_MULTIQUEUE */ if ((si+1)->ni.length == 0) return si_destroy(si, NULL); else return 0; } return 0; /* unreachable */ } /* Callback called on scheduler to check if it has instances */ static int drain_scheduler_sch_cb(void *_s, void *arg) { struct dn_schk *s = _s; if (s->sch.flags & DN_HAVE_MASK) { dn_ht_scan_bucket(s->siht, &s->drain_bucket, drain_scheduler_cb, NULL); s->drain_bucket++; } else { if (s->siht) { if (drain_scheduler_cb(s->siht, NULL) == DNHT_SCAN_DEL) s->siht = NULL; } } return 0; } /* Called every tick, try to delete a 'bucket' of scheduler */ void dn_drain_scheduler(void) { dn_ht_scan_bucket(dn_cfg.schedhash, &dn_cfg.drain_sch, drain_scheduler_sch_cb, NULL); dn_cfg.drain_sch++; } /* Callback called on queue to delete if it is idle */ static int drain_queue_cb(void *_q, void *arg) { struct dn_queue *q = _q; if (q->ni.length == 
0) { dn_delete_queue(q, DN_DESTROY); return DNHT_SCAN_DEL; /* queue is deleted */ } return 0; /* queue isn't deleted */ } /* Callback called on flowset used to check if it has queues */ static int drain_queue_fs_cb(void *_fs, void *arg) { struct dn_fsk *fs = _fs; if (fs->fs.flags & DN_QHT_HASH) { /* Flowset has a hash table for queues */ dn_ht_scan_bucket(fs->qht, &fs->drain_bucket, drain_queue_cb, NULL); fs->drain_bucket++; } else { /* No hash table for this flowset, null the pointer * if the queue is deleted */ if (fs->qht) { if (drain_queue_cb(fs->qht, NULL) == DNHT_SCAN_DEL) fs->qht = NULL; } } return 0; } /* Called every tick, try to delete a 'bucket' of queue */ void dn_drain_queue(void) { /* scan a bucket of flowset */ dn_ht_scan_bucket(dn_cfg.fshash, &dn_cfg.drain_fs, drain_queue_fs_cb, NULL); dn_cfg.drain_fs++; } /* * Handler for the various dummynet socket options */ static int ip_dn_ctl(struct sockopt *sopt) { void *p = NULL; int error, l; error = priv_check(sopt->sopt_td, PRIV_NETINET_DUMMYNET); if (error) return (error); /* Disallow sets in really-really secure mode. */ if (sopt->sopt_dir == SOPT_SET) { error = securelevel_ge(sopt->sopt_td->td_ucred, 3); if (error) return (error); } switch (sopt->sopt_name) { default : D("dummynet: unknown option %d", sopt->sopt_name); error = EINVAL; break; case IP_DUMMYNET_FLUSH: case IP_DUMMYNET_CONFIGURE: case IP_DUMMYNET_DEL: /* remove a pipe or queue */ case IP_DUMMYNET_GET: D("dummynet: compat option %d", sopt->sopt_name); error = ip_dummynet_compat(sopt); break; case IP_DUMMYNET3 : if (sopt->sopt_dir == SOPT_GET) { error = dummynet_get(sopt, NULL); break; } l = sopt->sopt_valsize; if (l < sizeof(struct dn_id) || l > 12000) { D("argument len %d invalid", l); break; } p = malloc(l, M_TEMP, M_WAITOK); // XXX can it fail ? 
error = sooptcopyin(sopt, p, l, l); if (error) break ; error = do_config(p, l); break; } if (p != NULL) free(p, M_TEMP); return error ; } static void ip_dn_init(void) { if (dn_cfg.init_done) return; printf("DUMMYNET %p with IPv6 initialized (100409)\n", curvnet); dn_cfg.init_done = 1; /* Set defaults here. MSVC does not accept initializers, * and this is also useful for vimages */ /* queue limits */ dn_cfg.slot_limit = 100; /* Foot shooting limit for queues. */ dn_cfg.byte_limit = 1024 * 1024; dn_cfg.expire = 1; /* RED parameters */ dn_cfg.red_lookup_depth = 256; /* default lookup table depth */ dn_cfg.red_avg_pkt_size = 512; /* default medium packet size */ dn_cfg.red_max_pkt_size = 1500; /* default max packet size */ /* hash tables */ dn_cfg.max_hash_size = 65536; /* max in the hash tables */ dn_cfg.hash_size = 64; /* default hash size */ /* create hash tables for schedulers and flowsets. * In both we search by key and by pointer. */ dn_cfg.schedhash = dn_ht_init(NULL, dn_cfg.hash_size, offsetof(struct dn_schk, schk_next), schk_hash, schk_match, schk_new); dn_cfg.fshash = dn_ht_init(NULL, dn_cfg.hash_size, offsetof(struct dn_fsk, fsk_next), fsk_hash, fsk_match, fsk_new); /* bucket index to drain object */ dn_cfg.drain_fs = 0; dn_cfg.drain_sch = 0; heap_init(&dn_cfg.evheap, 16, offsetof(struct dn_id, id)); SLIST_INIT(&dn_cfg.fsu); SLIST_INIT(&dn_cfg.schedlist); DN_LOCK_INIT(); TASK_INIT(&dn_task, 0, dummynet_task, curvnet); dn_tq = taskqueue_create("dummynet", M_WAITOK, taskqueue_thread_enqueue, &dn_tq); taskqueue_start_threads(&dn_tq, 1, PI_NET, "dummynet"); callout_init(&dn_timeout, CALLOUT_MPSAFE); callout_reset(&dn_timeout, 1, dummynet, NULL); /* Initialize curr_time adjustment mechanics. 
*/ getmicrouptime(&dn_cfg.prev_t); } #ifdef KLD_MODULE static void ip_dn_destroy(int last) { callout_drain(&dn_timeout); DN_BH_WLOCK(); if (last) { ND("removing last instance\n"); ip_dn_ctl_ptr = NULL; ip_dn_io_ptr = NULL; } dummynet_flush(); DN_BH_WUNLOCK(); taskqueue_drain(dn_tq, &dn_task); taskqueue_free(dn_tq); dn_ht_free(dn_cfg.schedhash, 0); dn_ht_free(dn_cfg.fshash, 0); heap_free(&dn_cfg.evheap); DN_LOCK_DESTROY(); } #endif /* KLD_MODULE */ static int dummynet_modevent(module_t mod, int type, void *data) { if (type == MOD_LOAD) { if (ip_dn_io_ptr) { printf("DUMMYNET already loaded\n"); return EEXIST ; } ip_dn_init(); ip_dn_ctl_ptr = ip_dn_ctl; ip_dn_io_ptr = dummynet_io; return 0; } else if (type == MOD_UNLOAD) { #if !defined(KLD_MODULE) printf("dummynet statically compiled, cannot unload\n"); return EINVAL ; #else ip_dn_destroy(1 /* last */); return 0; #endif } else return EOPNOTSUPP; } /* modevent helpers for the modules */ static int load_dn_sched(struct dn_alg *d) { struct dn_alg *s; if (d == NULL) return 1; /* error */ ip_dn_init(); /* just in case, we need the lock */ /* Check that mandatory funcs exists */ if (d->enqueue == NULL || d->dequeue == NULL) { D("missing enqueue or dequeue for %s", d->name); return 1; } /* Search if scheduler already exists */ DN_BH_WLOCK(); SLIST_FOREACH(s, &dn_cfg.schedlist, next) { if (strcmp(s->name, d->name) == 0) { D("%s already loaded", d->name); break; /* scheduler already exists */ } } if (s == NULL) SLIST_INSERT_HEAD(&dn_cfg.schedlist, d, next); DN_BH_WUNLOCK(); D("dn_sched %s %sloaded", d->name, s ? "not ":""); return s ? 1 : 0; } static int unload_dn_sched(struct dn_alg *s) { struct dn_alg *tmp, *r; int err = EINVAL; ND("called for %s", s->name); DN_BH_WLOCK(); SLIST_FOREACH_SAFE(r, &dn_cfg.schedlist, next, tmp) { if (strcmp(s->name, r->name) != 0) continue; ND("ref_count = %d", r->ref_count); err = (r->ref_count != 0) ? 
EBUSY : 0; if (err == 0) SLIST_REMOVE(&dn_cfg.schedlist, r, dn_alg, next); break; } DN_BH_WUNLOCK(); D("dn_sched %s %sunloaded", s->name, err ? "not ":""); return err; } int dn_sched_modevent(module_t mod, int cmd, void *arg) { struct dn_alg *sch = arg; if (cmd == MOD_LOAD) return load_dn_sched(sch); else if (cmd == MOD_UNLOAD) return unload_dn_sched(sch); else return EINVAL; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL }; #define DN_SI_SUB SI_SUB_PROTO_IFATTACHDOMAIN #define DN_MODEV_ORD (SI_ORDER_ANY - 128) /* after ipfw */ DECLARE_MODULE(dummynet, dummynet_mod, DN_SI_SUB, DN_MODEV_ORD); MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); MODULE_VERSION(dummynet, 3); /* * Starting up. Done in order after dummynet_modevent() has been called. * VNET_SYSINIT is also called for each existing vnet and each new vnet. */ //VNET_SYSINIT(vnet_dn_init, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_init, NULL); /* * Shutdown handlers up shop. These are done in REVERSE ORDER, but still * after dummynet_modevent() has been called. Not called on reboot. * VNET_SYSUNINIT is also called for each exiting vnet as it exits. * or when the module is unloaded. */ //VNET_SYSUNINIT(vnet_dn_uninit, DN_SI_SUB, DN_MODEV_ORD+2, ip_dn_destroy, NULL); /* end of file */ ipfw-user/sys/net/pfil.h000644 000423 000000 00000007733 12006744005 015754 0ustar00luigiwheel000000 000000 /* $FreeBSD: head/sys/net/pfil.h 210121 2010-07-15 14:41:06Z luigi $ */ /* $NetBSD: pfil.h,v 1.22 2003/06/23 12:57:08 martin Exp $ */ /*- * Copyright (c) 1996 Matthew R. Green * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NET_PFIL_H_ #define _NET_PFIL_H_ #include #include #include #include #include #include struct mbuf; struct ifnet; struct inpcb; /* * The packet filter hooks are designed for anything to call them to * possibly intercept the packet. 
*/ struct packet_filter_hook { TAILQ_ENTRY(packet_filter_hook) pfil_link; int (*pfil_func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *); void *pfil_arg; }; #define PFIL_IN 0x00000001 #define PFIL_OUT 0x00000002 #define PFIL_WAITOK 0x00000004 #define PFIL_ALL (PFIL_IN|PFIL_OUT) typedef TAILQ_HEAD(pfil_list, packet_filter_hook) pfil_list_t; #define PFIL_TYPE_AF 1 /* key is AF_* type */ #define PFIL_TYPE_IFNET 2 /* key is ifnet pointer */ struct pfil_head { pfil_list_t ph_in; pfil_list_t ph_out; int ph_type; int ph_nhooks; #if defined( __linux__ ) || defined( _WIN32 ) rwlock_t ph_mtx; #else struct rmlock ph_lock; #endif union { u_long phu_val; void *phu_ptr; } ph_un; #define ph_af ph_un.phu_val #define ph_ifnet ph_un.phu_ptr LIST_ENTRY(pfil_head) ph_list; }; int pfil_add_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *, int, struct pfil_head *); int pfil_remove_hook(int (*func)(void *, struct mbuf **, struct ifnet *, int, struct inpcb *), void *, int, struct pfil_head *); int pfil_run_hooks(struct pfil_head *, struct mbuf **, struct ifnet *, int, struct inpcb *inp); int pfil_head_register(struct pfil_head *); int pfil_head_unregister(struct pfil_head *); struct pfil_head *pfil_head_get(int, u_long); #define PFIL_HOOKED(p) ((p)->ph_nhooks > 0) #define PFIL_LOCK_INIT(p) \ rm_init_flags(&(p)->ph_lock, "PFil hook read/write mutex", RM_RECURSE) #define PFIL_LOCK_DESTROY(p) rm_destroy(&(p)->ph_lock) #define PFIL_RLOCK(p, t) rm_rlock(&(p)->ph_lock, (t)) #define PFIL_WLOCK(p) rm_wlock(&(p)->ph_lock) #define PFIL_RUNLOCK(p, t) rm_runlock(&(p)->ph_lock, (t)) #define PFIL_WUNLOCK(p) rm_wunlock(&(p)->ph_lock) #define PFIL_LIST_LOCK() mtx_lock(&pfil_global_lock) #define PFIL_LIST_UNLOCK() mtx_unlock(&pfil_global_lock) static __inline struct packet_filter_hook * pfil_hook_get(int dir, struct pfil_head *ph) { if (dir == PFIL_IN) return (TAILQ_FIRST(&ph->ph_in)); else if (dir == PFIL_OUT) return (TAILQ_FIRST(&ph->ph_out)); else 
return (NULL); } #endif /* _NET_PFIL_H_ */ ipfw-user/sys/net/radix.c000644 000423 000000 00000076435 12006744005 016131 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.c 8.5 (Berkeley) 5/19/95 * $FreeBSD: head/sys/net/radix.c 210122 2010-07-15 14:41:59Z luigi $ */ /* * Routines to build and maintain radix trees for routing lookups. 
*/ #include #ifdef _KERNEL #include #include #include #include #include #include #include #include "opt_mpath.h" #ifdef RADIX_MPATH #include #endif #else /* !_KERNEL */ #include #include #include #define log(x, arg...) fprintf(stderr, ## arg) #define panic(x) fprintf(stderr, "PANIC: %s", x), exit(1) #define min(a, b) ((a) < (b) ? (a) : (b) ) #include #endif /* !_KERNEL */ static int rn_walktree_from(struct radix_node_head *h, void *a, void *m, walktree_f_t *f, void *w); static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *); static struct radix_node *rn_insert(void *, struct radix_node_head *, int *, struct radix_node [2]), *rn_newpair(void *, int, struct radix_node[2]), *rn_search(void *, struct radix_node *), *rn_search_m(void *, struct radix_node *, void *); static int max_keylen; static struct radix_mask *rn_mkfreelist; static struct radix_node_head *mask_rnhead; /* * Work area -- the following point to 3 buffers of size max_keylen, * allocated in this order in a block of memory malloc'ed by rn_init. * rn_zeros, rn_ones are set in rn_init and used in readonly afterwards. * addmask_key is used in rn_addmask in rw mode and not thread-safe. */ static char *rn_zeros, *rn_ones, *addmask_key; #define MKGet(m) { \ if (rn_mkfreelist) { \ m = rn_mkfreelist; \ rn_mkfreelist = (m)->rm_mklist; \ } else \ R_Malloc(m, struct radix_mask *, sizeof (struct radix_mask)); } #define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} #define rn_masktop (mask_rnhead->rnh_treetop) static int rn_lexobetter(void *m_arg, void *n_arg); static struct radix_mask * rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next); static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip); /* * The data structure for the keys is a radix tree with one way * branching removed. The index rn_bit at an internal node n represents a bit * position to be tested. 
The tree is arranged so that all descendants * of a node n have keys whose bits all agree up to position rn_bit - 1. * (We say the index of n is rn_bit.) * * There is at least one descendant which has a one bit at position rn_bit, * and at least one with a zero there. * * A route is determined by a pair of key and mask. We require the * bit-wise logical AND of the key and mask to be the key. * We define the index of a route associated with the mask to be * the first bit number in the mask where 0 occurs (with bit number 0 * representing the highest order bit). * * We say a mask is normal if every bit is 0, past the index of the mask. * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit, * and m is a normal mask, then the route applies to every descendant of n. * If the index(m) < rn_bit, this implies the trailing last few bits of k * before bit b are all 0, (and hence consequently true of every descendant * of n), so the route applies to all descendants of the node as well. * * Similar logic shows that a non-normal mask m such that * index(m) <= index(n) could potentially apply to many children of n. * Thus, for each non-host route, we attach its mask to a list at an internal * node as high in the tree as we can go. * * The present version of the code makes use of normal routes in short- * circuiting an explicit mask and compare operation when testing whether * a key satisfies a normal route, and also in remembering the unique leaf * that governs a subtree. */ /* * Most of the functions in this code assume that the key/mask arguments * are sockaddr-like structures, where the first byte is a u_char * indicating the size of the entire structure. * * To make the assumption more explicit, we use the LEN() macro to access * this field. It is safe to pass an expression with side effects * to LEN() as the argument is evaluated only once. * We cast the result to int as this is the dominant usage.
*/ #define LEN(x) ( (int) (*(const u_char *)(x)) ) /* * XXX THIS NEEDS TO BE FIXED * In the code, pointers to keys and masks are passed as either * 'void *' (because callers use to pass pointers of various kinds), or * 'caddr_t' (which is fine for pointer arithmetics, but not very * clean when you dereference it to access data). Furthermore, caddr_t * is really 'char *', while the natural type to operate on keys and * masks would be 'u_char'. This mismatch require a lot of casts and * intermediate variables to adapt types that clutter the code. */ /* * Search a node in the tree matching the key. */ static struct radix_node * rn_search(v_arg, head) void *v_arg; struct radix_node *head; { register struct radix_node *x; register caddr_t v; for (x = head, v = v_arg; x->rn_bit >= 0;) { if (x->rn_bmask & v[x->rn_offset]) x = x->rn_right; else x = x->rn_left; } return (x); } /* * Same as above, but with an additional mask. * XXX note this function is used only once. */ static struct radix_node * rn_search_m(v_arg, head, m_arg) struct radix_node *head; void *v_arg, *m_arg; { register struct radix_node *x; register caddr_t v = v_arg, m = m_arg; for (x = head; x->rn_bit >= 0;) { if ((x->rn_bmask & m[x->rn_offset]) && (x->rn_bmask & v[x->rn_offset])) x = x->rn_right; else x = x->rn_left; } return x; } int rn_refines(m_arg, n_arg) void *m_arg, *n_arg; { register caddr_t m = m_arg, n = n_arg; register caddr_t lim, lim2 = lim = n + LEN(n); int longer = LEN(n++) - LEN(m++); int masks_are_equal = 1; if (longer > 0) lim -= longer; while (n < lim) { if (*n & ~(*m)) return 0; if (*n++ != *m++) masks_are_equal = 0; } while (n < lim2) if (*n++) return 0; if (masks_are_equal && (longer < 0)) for (lim2 = m - longer; m < lim2; ) if (*m++) return 1; return (!masks_are_equal); } struct radix_node * rn_lookup(v_arg, m_arg, head) void *v_arg, *m_arg; struct radix_node_head *head; { register struct radix_node *x; caddr_t netmask = 0; if (m_arg) { x = rn_addmask(m_arg, 1, 
head->rnh_treetop->rn_offset); if (x == 0) return (0); netmask = x->rn_key; } x = rn_match(v_arg, head); if (x && netmask) { while (x && x->rn_mask != netmask) x = x->rn_dupedkey; } return x; } static int rn_satisfies_leaf(trial, leaf, skip) char *trial; register struct radix_node *leaf; int skip; { register char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; char *cplim; int length = min(LEN(cp), LEN(cp2)); if (cp3 == NULL) cp3 = rn_ones; else length = min(length, LEN(cp3)); cplim = cp + length; cp3 += skip; cp2 += skip; for (cp += skip; cp < cplim; cp++, cp2++, cp3++) if ((*cp ^ *cp2) & *cp3) return 0; return 1; } struct radix_node * rn_match(v_arg, head) void *v_arg; struct radix_node_head *head; { caddr_t v = v_arg; register struct radix_node *t = head->rnh_treetop, *x; register caddr_t cp = v, cp2; caddr_t cplim; struct radix_node *saved_t, *top = t; int off = t->rn_offset, vlen = LEN(cp), matched_off; register int test, b, rn_bit; /* * Open code rn_search(v, top) to avoid overhead of extra * subroutine call. */ for (; t->rn_bit >= 0; ) { if (t->rn_bmask & cp[t->rn_offset]) t = t->rn_right; else t = t->rn_left; } /* * See if we match exactly as a host destination * or at least learn how many bits match, for normal mask finesse. * * It doesn't hurt us to limit how many bytes to check * to the length of the mask, since if it matches we had a genuine * match and the leaf we have is the most specific one anyway; * if it didn't match with a shorter length it would fail * with a long one. This wins big for class B&C netmasks which * are probably the most common case... */ if (t->rn_mask) vlen = *(u_char *)t->rn_mask; cp += off; cp2 = t->rn_key + off; cplim = v + vlen; for (; cp < cplim; cp++, cp2++) if (*cp != *cp2) goto on1; /* * This extra grot is in case we are explicitly asked * to look up the default. Ugh! * * Never return the root node itself, it seems to cause a * lot of confusion. 
*/ if (t->rn_flags & RNF_ROOT) t = t->rn_dupedkey; return t; on1: test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ for (b = 7; (test >>= 1) > 0;) b--; matched_off = cp - v; b += matched_off << 3; rn_bit = -1 - b; /* * If there is a host route in a duped-key chain, it will be first. */ if ((saved_t = t)->rn_mask == 0) t = t->rn_dupedkey; for (; t; t = t->rn_dupedkey) /* * Even if we don't match exactly as a host, * we may match if the leaf we wound up at is * a route to a net. */ if (t->rn_flags & RNF_NORMAL) { if (rn_bit <= t->rn_bit) return t; } else if (rn_satisfies_leaf(v, t, matched_off)) return t; t = saved_t; /* start searching up the tree */ do { register struct radix_mask *m; t = t->rn_parent; m = t->rn_mklist; /* * If non-contiguous masks ever become important * we can restore the masking and open coding of * the search and satisfaction test and put the * calculation of "off" back before the "do". */ while (m) { if (m->rm_flags & RNF_NORMAL) { if (rn_bit <= m->rm_bit) return (m->rm_leaf); } else { off = min(t->rn_offset, matched_off); x = rn_search_m(v, t, m->rm_mask); while (x && x->rn_mask != m->rm_mask) x = x->rn_dupedkey; if (x && rn_satisfies_leaf(v, x, off)) return x; } m = m->rm_mklist; } } while (t != top); return 0; } #ifdef RN_DEBUG int rn_nodenum; struct radix_node *rn_clist; int rn_saveinfo; int rn_debug = 1; #endif /* * Whenever we add a new leaf to the tree, we also add a parent node, * so we allocate them as an array of two elements: the first one must be * the leaf (see RNTORT() in route.c), the second one is the parent. * This routine initializes the relevant fields of the nodes, so that * the leaf is the left child of the parent node, and both nodes have * (almost) all all fields filled as appropriate. * (XXX some fields are left unset, see the '#if 0' section). * The function returns a pointer to the parent node. 
*/ static struct radix_node * rn_newpair(v, b, nodes) void *v; int b; struct radix_node nodes[2]; { register struct radix_node *tt = nodes, *t = tt + 1; t->rn_bit = b; t->rn_bmask = 0x80 >> (b & 7); t->rn_left = tt; t->rn_offset = b >> 3; #if 0 /* XXX perhaps we should fill these fields as well. */ t->rn_parent = t->rn_right = NULL; tt->rn_mask = NULL; tt->rn_dupedkey = NULL; tt->rn_bmask = 0; #endif tt->rn_bit = -1; tt->rn_key = (caddr_t)v; tt->rn_parent = t; tt->rn_flags = t->rn_flags = RNF_ACTIVE; tt->rn_mklist = t->rn_mklist = 0; #ifdef RN_DEBUG tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; #endif return t; } static struct radix_node * rn_insert(v_arg, head, dupentry, nodes) void *v_arg; struct radix_node_head *head; int *dupentry; struct radix_node nodes[2]; { caddr_t v = v_arg; struct radix_node *top = head->rnh_treetop; int head_off = top->rn_offset, vlen = LEN(v); register struct radix_node *t = rn_search(v_arg, top); register caddr_t cp = v + head_off; register int b; struct radix_node *tt; /* * Find first bit at which v and t->rn_key differ */ { register caddr_t cp2 = t->rn_key + head_off; register int cmp_res; caddr_t cplim = v + vlen; while (cp < cplim) if (*cp2++ != *cp++) goto on1; *dupentry = 1; return t; on1: *dupentry = 0; cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; for (b = (cp - v) << 3; cmp_res; b--) cmp_res >>= 1; } { register struct radix_node *p, *x = top; cp = v; do { p = x; if (cp[x->rn_offset] & x->rn_bmask) x = x->rn_right; else x = x->rn_left; } while (b > (unsigned) x->rn_bit); /* x->rn_bit < b && x->rn_bit >= 0 */ #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "rn_insert: Going In:\n"), traverse(p); #endif t = rn_newpair(v_arg, b, nodes); tt = t->rn_left; if ((cp[p->rn_offset] & p->rn_bmask) == 0) p->rn_left = t; else p->rn_right = t; x->rn_parent = t; t->rn_parent = p; /* frees x, p as temp vars below */ if ((cp[t->rn_offset] & t->rn_bmask) == 0) { t->rn_right = x; } else { 
t->rn_right = tt; t->rn_left = x; } #ifdef RN_DEBUG if (rn_debug) log(LOG_DEBUG, "rn_insert: Coming Out:\n"), traverse(p); #endif } return (tt); } struct radix_node * rn_addmask(n_arg, search, skip) int search, skip; void *n_arg; { caddr_t netmask = (caddr_t)n_arg; register struct radix_node *x; register caddr_t cp, cplim; register int b = 0, mlen, j; int maskduplicated, m0, isnormal; struct radix_node *saved_x; static int last_zeroed = 0; if ((mlen = LEN(netmask)) > max_keylen) mlen = max_keylen; if (skip == 0) skip = 1; if (mlen <= skip) return (mask_rnhead->rnh_nodes); if (skip > 1) bcopy(rn_ones + 1, addmask_key + 1, skip - 1); if ((m0 = mlen) > skip) bcopy(netmask + skip, addmask_key + skip, mlen - skip); /* * Trim trailing zeroes. */ for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;) cp--; mlen = cp - addmask_key; if (mlen <= skip) { if (m0 >= last_zeroed) last_zeroed = mlen; return (mask_rnhead->rnh_nodes); } if (m0 < last_zeroed) bzero(addmask_key + m0, last_zeroed - m0); *addmask_key = last_zeroed = mlen; x = rn_search(addmask_key, rn_masktop); if (bcmp(addmask_key, x->rn_key, mlen) != 0) x = 0; if (x || search) return (x); R_Zalloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x)); if ((saved_x = x) == 0) return (0); netmask = cp = (caddr_t)(x + 2); bcopy(addmask_key, cp, mlen); x = rn_insert(cp, mask_rnhead, &maskduplicated, x); if (maskduplicated) { log(LOG_ERR, "rn_addmask: mask impossibly already in tree"); Free(saved_x); return (x); } /* * Calculate index of mask, and check for normalcy. * First find the first byte with a 0 bit, then if there are * more bits left (remember we already trimmed the trailing 0's), * the pattern must be one of those in normal_chars[], or we have * a non-contiguous mask. 
*/ cplim = netmask + mlen; isnormal = 1; for (cp = netmask + skip; (cp < cplim) && *(u_char *)cp == 0xff;) cp++; if (cp != cplim) { static char normal_chars[] = { 0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; for (j = 0x80; (j & *cp) != 0; j >>= 1) b++; if (*cp != normal_chars[b] || cp != (cplim - 1)) isnormal = 0; } b += (cp - netmask) << 3; x->rn_bit = -1 - b; if (isnormal) x->rn_flags |= RNF_NORMAL; return (x); } static int /* XXX: arbitrary ordering for non-contiguous masks */ rn_lexobetter(m_arg, n_arg) void *m_arg, *n_arg; { register u_char *mp = m_arg, *np = n_arg, *lim; if (LEN(mp) > LEN(np)) return 1; /* not really, but need to check longer one first */ if (LEN(mp) == LEN(np)) for (lim = mp + LEN(mp); mp < lim;) if (*mp++ > *np++) return 1; return 0; } static struct radix_mask * rn_new_radix_mask(tt, next) register struct radix_node *tt; register struct radix_mask *next; { register struct radix_mask *m; MKGet(m); if (m == 0) { log(LOG_ERR, "Mask for route not entered\n"); return (0); } bzero(m, sizeof *m); m->rm_bit = tt->rn_bit; m->rm_flags = tt->rn_flags; if (tt->rn_flags & RNF_NORMAL) m->rm_leaf = tt; else m->rm_mask = tt->rn_mask; m->rm_mklist = next; tt->rn_mklist = m; return m; } struct radix_node * rn_addroute(v_arg, n_arg, head, treenodes) void *v_arg, *n_arg; struct radix_node_head *head; struct radix_node treenodes[2]; { caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg; register struct radix_node *t, *x = 0, *tt; struct radix_node *saved_tt, *top = head->rnh_treetop; short b = 0, b_leaf = 0; int keyduplicated; caddr_t mmask; struct radix_mask *m, **mp; /* * In dealing with non-contiguous masks, there may be * many different routes which have the same mask. * We will find it useful to have a unique pointer to * the mask to speed avoiding duplicate references at * nodes and possibly save time in calculating indices. 
*/ if (netmask) { if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0) return (0); b_leaf = x->rn_bit; b = -1 - x->rn_bit; netmask = x->rn_key; } /* * Deal with duplicated keys: attach node to previous instance */ saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes); if (keyduplicated) { for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) { #ifdef RADIX_MPATH /* permit multipath, if enabled for the family */ if (rn_mpath_capable(head) && netmask == tt->rn_mask) { /* * go down to the end of multipaths, so that * new entry goes into the end of rn_dupedkey * chain. */ do { t = tt; tt = tt->rn_dupedkey; } while (tt && t->rn_mask == tt->rn_mask); break; } #endif if (tt->rn_mask == netmask) return (0); if (netmask == 0 || (tt->rn_mask && ((b_leaf < tt->rn_bit) /* index(netmask) > node */ || rn_refines(netmask, tt->rn_mask) || rn_lexobetter(netmask, tt->rn_mask)))) break; } /* * If the mask is not duplicated, we wouldn't * find it among possible duplicate key entries * anyway, so the above test doesn't hurt. * * We sort the masks for a duplicated key the same way as * in a masklist -- most specific to least specific. * This may require the unfortunate nuisance of relocating * the head of the list. * * We also reverse, or doubly link the list through the * parent pointer. 
*/ if (tt == saved_tt) { struct radix_node *xx = x; /* link in at head of list */ (tt = treenodes)->rn_dupedkey = t; tt->rn_flags = t->rn_flags; tt->rn_parent = x = t->rn_parent; t->rn_parent = tt; /* parent */ if (x->rn_left == t) x->rn_left = tt; else x->rn_right = tt; saved_tt = tt; x = xx; } else { (tt = treenodes)->rn_dupedkey = t->rn_dupedkey; t->rn_dupedkey = tt; tt->rn_parent = t; /* parent */ if (tt->rn_dupedkey) /* parent */ tt->rn_dupedkey->rn_parent = tt; /* parent */ } #ifdef RN_DEBUG t=tt+1; tt->rn_info = rn_nodenum++; t->rn_info = rn_nodenum++; tt->rn_twin = t; tt->rn_ybro = rn_clist; rn_clist = tt; #endif tt->rn_key = (caddr_t) v; tt->rn_bit = -1; tt->rn_flags = RNF_ACTIVE; } /* * Put mask in tree. */ if (netmask) { tt->rn_mask = netmask; tt->rn_bit = x->rn_bit; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } t = saved_tt->rn_parent; if (keyduplicated) goto on2; b_leaf = -1 - t->rn_bit; if (t->rn_right == saved_tt) x = t->rn_left; else x = t->rn_right; /* Promote general routes from below */ if (x->rn_bit < 0) { for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { *mp = m = rn_new_radix_mask(x, 0); if (m) mp = &m->rm_mklist; } } else if (x->rn_mklist) { /* * Skip over masks whose index is > that of new node */ for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m->rm_bit >= b_leaf) break; t->rn_mklist = m; *mp = 0; } on2: /* Add new route to highest possible ancestor's list */ if ((netmask == 0) || (b > t->rn_bit )) return tt; /* can't lift at all */ b_leaf = tt->rn_bit; do { x = t; t = t->rn_parent; } while (b <= t->rn_bit && x != top); /* * Search through routes associated with node to * insert new route according to index. * Need same criteria as when sorting dupedkeys to avoid * double loop on deletion. 
*/ for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) { if (m->rm_bit < b_leaf) continue; if (m->rm_bit > b_leaf) break; if (m->rm_flags & RNF_NORMAL) { mmask = m->rm_leaf->rn_mask; if (tt->rn_flags & RNF_NORMAL) { #if !defined(RADIX_MPATH) log(LOG_ERR, "Non-unique normal route, mask not entered\n"); #endif return tt; } } else mmask = m->rm_mask; if (mmask == netmask) { m->rm_refs++; tt->rn_mklist = m; return tt; } if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask)) break; } *mp = rn_new_radix_mask(tt, *mp); return tt; } struct radix_node * rn_delete(v_arg, netmask_arg, head) void *v_arg, *netmask_arg; struct radix_node_head *head; { register struct radix_node *t, *p, *x, *tt; struct radix_mask *m, *saved_m, **mp; struct radix_node *dupedkey, *saved_tt, *top; caddr_t v, netmask; int b, head_off, vlen; v = v_arg; netmask = netmask_arg; x = head->rnh_treetop; tt = rn_search(v, x); head_off = x->rn_offset; vlen = LEN(v); saved_tt = tt; top = x; if (tt == 0 || bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off)) return (0); /* * Delete our route from mask lists. 
*/ if (netmask) { if ((x = rn_addmask(netmask, 1, head_off)) == 0) return (0); netmask = x->rn_key; while (tt->rn_mask != netmask) if ((tt = tt->rn_dupedkey) == 0) return (0); } if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0) goto on1; if (tt->rn_flags & RNF_NORMAL) { if (m->rm_leaf != tt || m->rm_refs > 0) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); return 0; /* dangling ref could cause disaster */ } } else { if (m->rm_mask != tt->rn_mask) { log(LOG_ERR, "rn_delete: inconsistent annotation\n"); goto on1; } if (--m->rm_refs >= 0) goto on1; } b = -1 - tt->rn_bit; t = saved_tt->rn_parent; if (b > t->rn_bit) goto on1; /* Wasn't lifted at all */ do { x = t; t = t->rn_parent; } while (b <= t->rn_bit && x != top); for (mp = &x->rn_mklist; (m = *mp); mp = &m->rm_mklist) if (m == saved_m) { *mp = m->rm_mklist; MKFree(m); break; } if (m == 0) { log(LOG_ERR, "rn_delete: couldn't find our annotation\n"); if (tt->rn_flags & RNF_NORMAL) return (0); /* Dangling ref to us */ } on1: /* * Eliminate us from tree */ if (tt->rn_flags & RNF_ROOT) return (0); #ifdef RN_DEBUG /* Get us out of the creation list */ for (t = rn_clist; t && t->rn_ybro != tt; t = t->rn_ybro) {} if (t) t->rn_ybro = tt->rn_ybro; #endif t = tt->rn_parent; dupedkey = saved_tt->rn_dupedkey; if (dupedkey) { /* * Here, tt is the deletion target and * saved_tt is the head of the dupekey chain. 
*/ if (tt == saved_tt) { /* remove from head of chain */ x = dupedkey; x->rn_parent = t; if (t->rn_left == tt) t->rn_left = x; else t->rn_right = x; } else { /* find node in front of tt on the chain */ for (x = p = saved_tt; p && p->rn_dupedkey != tt;) p = p->rn_dupedkey; if (p) { p->rn_dupedkey = tt->rn_dupedkey; if (tt->rn_dupedkey) /* parent */ tt->rn_dupedkey->rn_parent = p; /* parent */ } else log(LOG_ERR, "rn_delete: couldn't find us\n"); } t = tt + 1; if (t->rn_flags & RNF_ACTIVE) { #ifndef RN_DEBUG *++x = *t; p = t->rn_parent; #else b = t->rn_info; *++x = *t; t->rn_info = b; p = t->rn_parent; #endif if (p->rn_left == t) p->rn_left = x; else p->rn_right = x; x->rn_left->rn_parent = x; x->rn_right->rn_parent = x; } goto out; } if (t->rn_left == tt) x = t->rn_right; else x = t->rn_left; p = t->rn_parent; if (p->rn_right == t) p->rn_right = x; else p->rn_left = x; x->rn_parent = p; /* * Demote routes attached to us. */ if (t->rn_mklist) { if (x->rn_bit >= 0) { for (mp = &x->rn_mklist; (m = *mp);) mp = &m->rm_mklist; *mp = t->rn_mklist; } else { /* If there are any key,mask pairs in a sibling duped-key chain, some subset will appear sorted in the same order attached to our mklist */ for (m = t->rn_mklist; m && x; x = x->rn_dupedkey) if (m == x->rn_mklist) { struct radix_mask *mm = m->rm_mklist; x->rn_mklist = 0; if (--(m->rm_refs) < 0) MKFree(m); m = mm; } if (m) log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n", m, x); } } /* * We may be holding an active internal node in the tree. */ x = tt + 1; if (t != x) { #ifndef RN_DEBUG *t = *x; #else b = t->rn_info; *t = *x; t->rn_info = b; #endif t->rn_left->rn_parent = t; t->rn_right->rn_parent = t; p = x->rn_parent; if (p->rn_left == x) p->rn_left = t; else p->rn_right = t; } out: tt->rn_flags &= ~RNF_ACTIVE; tt[1].rn_flags &= ~RNF_ACTIVE; return (tt); } /* * This is the same as rn_walktree() except for the parameters and the * exit. 
*/ static int rn_walktree_from(h, a, m, f, w) struct radix_node_head *h; void *a, *m; walktree_f_t *f; void *w; { int error; struct radix_node *base, *next; u_char *xa = (u_char *)a; u_char *xm = (u_char *)m; register struct radix_node *rn, *last = 0 /* shut up gcc */; int stopping = 0; int lastb; /* * rn_search_m is sort-of-open-coded here. We cannot use the * function because we need to keep track of the last node seen. */ /* printf("about to search\n"); */ for (rn = h->rnh_treetop; rn->rn_bit >= 0; ) { last = rn; /* printf("rn_bit %d, rn_bmask %x, xm[rn_offset] %x\n", rn->rn_bit, rn->rn_bmask, xm[rn->rn_offset]); */ if (!(rn->rn_bmask & xm[rn->rn_offset])) { break; } if (rn->rn_bmask & xa[rn->rn_offset]) { rn = rn->rn_right; } else { rn = rn->rn_left; } } /* printf("done searching\n"); */ /* * Two cases: either we stepped off the end of our mask, * in which case last == rn, or we reached a leaf, in which * case we want to start from the last node we looked at. * Either way, last is the node we want to start from. */ rn = last; lastb = rn->rn_bit; /* printf("rn %p, lastb %d\n", rn, lastb);*/ /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ while (rn->rn_bit >= 0) rn = rn->rn_left; while (!stopping) { /* printf("node %p (%d)\n", rn, rn->rn_bit); */ base = rn; /* If at right child go back up, otherwise, go right */ while (rn->rn_parent->rn_right == rn && !(rn->rn_flags & RNF_ROOT)) { rn = rn->rn_parent; /* if went up beyond last, stop */ if (rn->rn_bit <= lastb) { stopping = 1; /* printf("up too far\n"); */ /* * XXX we should jump to the 'Process leaves' * part, because the values of 'rn' and 'next' * we compute will not be used. Not a big deal * because this loop will terminate, but it is * inefficient and hard to understand! 
*/ } } /* * At the top of the tree, no need to traverse the right * half, prevent the traversal of the entire tree in the * case of default route. */ if (rn->rn_parent->rn_flags & RNF_ROOT) stopping = 1; /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) rn = rn->rn_left; next = rn; /* Process leaves */ while ((rn = base) != 0) { base = rn->rn_dupedkey; /* printf("leaf %p\n", rn); */ if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return (error); } rn = next; if (rn->rn_flags & RNF_ROOT) { /* printf("root, stopping"); */ stopping = 1; } } return 0; } static int rn_walktree(h, f, w) struct radix_node_head *h; walktree_f_t *f; void *w; { int error; struct radix_node *base, *next; register struct radix_node *rn = h->rnh_treetop; /* * This gets complicated because we may delete the node * while applying the function f to it, so we need to calculate * the successor node in advance. */ /* First time through node, go left */ while (rn->rn_bit >= 0) rn = rn->rn_left; for (;;) { base = rn; /* If at right child go back up, otherwise, go right */ while (rn->rn_parent->rn_right == rn && (rn->rn_flags & RNF_ROOT) == 0) rn = rn->rn_parent; /* Find the next *leaf* since next node might vanish, too */ for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0;) rn = rn->rn_left; next = rn; /* Process leaves */ while ((rn = base)) { base = rn->rn_dupedkey; if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w))) return (error); } rn = next; if (rn->rn_flags & RNF_ROOT) return (0); } /* NOTREACHED */ } /* * Allocate and initialize an empty tree. This has 3 nodes, which are * part of the radix_node_head (in the order ) and are * marked RNF_ROOT so they cannot be freed. * The leaves have all-zero and all-one keys, with significant * bits starting at 'off'. * Return 1 on success, 0 on error. 
*/ int rn_inithead(head, off) void **head; int off; { register struct radix_node_head *rnh; register struct radix_node *t, *tt, *ttt; if (*head) return (1); R_Zalloc(rnh, struct radix_node_head *, sizeof (*rnh)); if (rnh == 0) return (0); #ifdef _KERNEL RADIX_NODE_HEAD_LOCK_INIT(rnh); #endif *head = rnh; t = rn_newpair(rn_zeros, off, rnh->rnh_nodes); ttt = rnh->rnh_nodes + 2; t->rn_right = ttt; t->rn_parent = t; tt = t->rn_left; /* ... which in turn is rnh->rnh_nodes */ tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE; tt->rn_bit = -1 - off; *ttt = *tt; ttt->rn_key = rn_ones; rnh->rnh_addaddr = rn_addroute; rnh->rnh_deladdr = rn_delete; rnh->rnh_matchaddr = rn_match; rnh->rnh_lookup = rn_lookup; rnh->rnh_walktree = rn_walktree; rnh->rnh_walktree_from = rn_walktree_from; rnh->rnh_treetop = t; return (1); } int rn_detachhead(void **head) { struct radix_node_head *rnh; KASSERT((head != NULL && *head != NULL), ("%s: head already freed", __func__)); rnh = *head; /* Free nodes. */ Free(rnh); *head = NULL; return (1); } void rn_init(int maxk) { char *cp, *cplim; max_keylen = maxk; if (max_keylen == 0) { log(LOG_ERR, "rn_init: radix functions require max_keylen be set\n"); return; } R_Malloc(rn_zeros, char *, 3 * max_keylen); if (rn_zeros == NULL) panic("rn_init"); bzero(rn_zeros, 3 * max_keylen); rn_ones = cp = rn_zeros + max_keylen; addmask_key = cplim = rn_ones + max_keylen; while (cp < cplim) *cp++ = -1; if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0) panic("rn_init 2"); } ipfw-user/sys/net/radix.h000644 000423 000000 00000015370 12006744005 016125 0ustar00luigiwheel000000 000000 /*- * Copyright (c) 1988, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)radix.h 8.2 (Berkeley) 10/31/94 * $FreeBSD: head/sys/net/radix.h 225698 2011-09-20 20:27:26Z kmacy $ */ #ifndef _RADIX_H_ #define _RADIX_H_ #ifdef _KERNEL #include #include #include #endif #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_RTABLE); #endif /* * Radix search tree node layout. 
*/ struct radix_node { struct radix_mask *rn_mklist; /* list of masks contained in subtree */ struct radix_node *rn_parent; /* parent */ short rn_bit; /* bit offset; -1-index(netmask) */ char rn_bmask; /* node: mask for bit test*/ u_char rn_flags; /* enumerated next */ #define RNF_NORMAL 1 /* leaf contains normal route */ #define RNF_ROOT 2 /* leaf is root leaf for tree */ #define RNF_ACTIVE 4 /* This node is alive (for rtfree) */ union { struct { /* leaf only data: */ caddr_t rn_Key; /* object of search */ caddr_t rn_Mask; /* netmask, if present */ struct radix_node *rn_Dupedkey; } rn_leaf; struct { /* node only data: */ int rn_Off; /* where to start compare */ struct radix_node *rn_L;/* progeny */ struct radix_node *rn_R;/* progeny */ } rn_node; } rn_u; #ifdef RN_DEBUG int rn_info; struct radix_node *rn_twin; struct radix_node *rn_ybro; #endif }; #define rn_dupedkey rn_u.rn_leaf.rn_Dupedkey #define rn_key rn_u.rn_leaf.rn_Key #define rn_mask rn_u.rn_leaf.rn_Mask #define rn_offset rn_u.rn_node.rn_Off #define rn_left rn_u.rn_node.rn_L #define rn_right rn_u.rn_node.rn_R /* * Annotations to tree concerning potential routes applying to subtrees. */ struct radix_mask { short rm_bit; /* bit offset; -1-index(netmask) */ char rm_unused; /* cf. rn_bmask */ u_char rm_flags; /* cf. rn_flags */ struct radix_mask *rm_mklist; /* more masks to try */ union { caddr_t rmu_mask; /* the mask */ struct radix_node *rmu_leaf; /* for normal routes */ } rm_rmu; int rm_refs; /* # of references to this struct */ }; #define rm_mask rm_rmu.rmu_mask #define rm_leaf rm_rmu.rmu_leaf /* extra field would make 32 bytes */ typedef int walktree_f_t(struct radix_node *, void *); struct radix_node_head { struct radix_node *rnh_treetop; u_int rnh_gen; /* generation counter */ int rnh_multipath; /* multipath capable ? 
*/ int rnh_addrsize; /* permit, but not require fixed keys */ int rnh_pktsize; /* permit, but not require fixed keys */ struct radix_node *(*rnh_addaddr) /* add based on sockaddr */ (void *v, void *mask, struct radix_node_head *head, struct radix_node nodes[]); struct radix_node *(*rnh_addpkt) /* add based on packet hdr */ (void *v, void *mask, struct radix_node_head *head, struct radix_node nodes[]); struct radix_node *(*rnh_deladdr) /* remove based on sockaddr */ (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_delpkt) /* remove based on packet hdr */ (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ (void *v, struct radix_node_head *head); struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ (void *v, struct radix_node_head *head); int (*rnh_walktree) /* traverse tree */ (struct radix_node_head *head, walktree_f_t *f, void *w); int (*rnh_walktree_from) /* traverse tree below a */ (struct radix_node_head *head, void *a, void *m, walktree_f_t *f, void *w); void (*rnh_close) /* do something when the last ref drops */ (struct radix_node *rn, struct radix_node_head *head); struct radix_node rnh_nodes[3]; /* empty tree for common case */ #ifdef _KERNEL struct rwlock rnh_lock; /* locks entire radix tree */ #endif }; #ifndef _KERNEL #define R_Malloc(p, t, n) (p = (t) malloc((unsigned int)(n))) #define R_Zalloc(p, t, n) (p = (t) calloc(1,(unsigned int)(n))) #define Free(p) free((char *)p); #else #define R_Malloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT)) #define R_Zalloc(p, t, n) (p = (t) malloc((unsigned long)(n), M_RTABLE, M_NOWAIT | M_ZERO)) #define Free(p) free((caddr_t)p, M_RTABLE); #define RADIX_NODE_HEAD_LOCK_INIT(rnh) \ rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0) #define RADIX_NODE_HEAD_LOCK(rnh) 
rw_wlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_UNLOCK(rnh) rw_wunlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_RLOCK(rnh) rw_rlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_RUNLOCK(rnh) rw_runlock(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh) rw_try_upgrade(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_DESTROY(rnh) rw_destroy(&(rnh)->rnh_lock) #define RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED) #define RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED) #endif /* _KERNEL */ void rn_init(int); int rn_inithead(void **, int); int rn_detachhead(void **); int rn_refines(void *, void *); struct radix_node *rn_addmask(void *, int, int), *rn_addroute (void *, void *, struct radix_node_head *, struct radix_node [2]), *rn_delete(void *, void *, struct radix_node_head *), *rn_lookup (void *v_arg, void *m_arg, struct radix_node_head *head), *rn_match(void *, struct radix_node_head *); #endif /* _RADIX_H_ */