summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/ipf
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/ipf')
-rw-r--r--usr/src/uts/common/inet/ipf/cfw.c659
-rw-r--r--usr/src/uts/common/inet/ipf/fil.c5
-rw-r--r--usr/src/uts/common/inet/ipf/ip_fil_solaris.c304
-rw-r--r--usr/src/uts/common/inet/ipf/ip_log.c4
-rw-r--r--usr/src/uts/common/inet/ipf/ip_state.c19
-rw-r--r--usr/src/uts/common/inet/ipf/ipf.conf5
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/Makefile7
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/ip_fil.h46
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/ip_state.h4
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h69
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/ipf_stack.h15
-rw-r--r--usr/src/uts/common/inet/ipf/solaris.c10
12 files changed, 1089 insertions, 58 deletions
diff --git a/usr/src/uts/common/inet/ipf/cfw.c b/usr/src/uts/common/inet/ipf/cfw.c
new file mode 100644
index 0000000000..941aeac328
--- /dev/null
+++ b/usr/src/uts/common/inet/ipf/cfw.c
@@ -0,0 +1,659 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019, Joyent, Inc.
+ */
+
+/* IPF oddness for compilation in userland for IPF tests. */
+#if defined(KERNEL) || defined(_KERNEL)
+#undef KERNEL
+#undef _KERNEL
+#define KERNEL 1
+#define _KERNEL 1
+#endif
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include "netinet/ip_compat.h"
+#ifdef USE_INET6
+#include <netinet/icmp6.h>
+#endif
+#include <netinet/tcpip.h>
+#include "netinet/ip_fil.h"
+#include "netinet/ip_nat.h"
+#include "netinet/ip_frag.h"
+#include "netinet/ip_state.h"
+#include "netinet/ip_proxy.h"
+#include "netinet/ip_auth.h"
+#include "netinet/ipf_stack.h"
+#ifdef IPFILTER_SCAN
+#include "netinet/ip_scan.h"
+#endif
+#ifdef IPFILTER_SYNC
+#include "netinet/ip_sync.h"
+#endif
+#include "netinet/ip_pool.h"
+#include "netinet/ip_htable.h"
+#ifdef IPFILTER_COMPILED
+#include "netinet/ip_rules.h"
+#endif
+#if defined(_KERNEL)
+#include <sys/sunddi.h>
+#endif
+
+#include "netinet/ipf_cfw.h"
+#include <sys/file.h>
+#include <sys/uio.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+
+/*
+ * cfw == Cloud Firewall ==> routines for a global-zone data collector about
+ * ipf events for SmartOS. The only ones that CFW cares about are ones
+ * enforced by global-zone-controlled rulesets.
+ *
+ * The variable below is tied into the GZ-only ipf device /dev/ipfev, that
+ * flips this on when there is an open instance. This feature will also
+ * consume an fr_flag to have per-rule granularity.
+ */
+boolean_t ipf_cfwlog_enabled;
+
+/*
+ * Because ipf's test tools in $SRC/cmd insert all of these files, we need to
+ * stub out what we can vs. drag in even more headers and who knows what else.
+ */
+#ifdef _KERNEL
+
+/*
+ * CFW event ring buffer. Remember, this is for ALL ZONES because only a
+ * global-zone event-reader will be consuming these. In other words, it's
+ * not something to instantiate per-netstack.
+ *
+ * We may want to get more sophisticated and performant (e.g. per-processor),
+ * but for now keep the ring buffer simple and stupid.
+ * Must be a power of 2, to be bitmaskable, and must be countable by a uint_t
+ *
+ * Resizeable, see ipf_cfw_ring_resize() below.
+ */
+#define IPF_CFW_DEFAULT_RING_BUFS 1024
+#define IPF_CFW_MIN_RING_BUFS 8
+#define IPF_CFW_MAX_RING_BUFS (1U << 31U)
+
+/* Assume C's init-to-zero is sufficient for these types... */
+static kmutex_t cfw_ringlock;
+static kcondvar_t cfw_ringcv;
+
+static cfwev_t *cfw_ring; /* NULL by default. */
+static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */
+static uint32_t cfw_ringmask; /* 0 by default. */
+
+/* If these are equal, we're either empty or full. */
+static uint_t cfw_ringstart, cfw_ringend;
+static boolean_t cfw_ringfull; /* Tell the difference here! */
+/* Bean-counters. */
+static uint64_t cfw_evreports;
+static uint64_t cfw_evdrops;
+
+/*
+ * Place an event in the CFW event ring buffer.
+ *
+ * For now, be simple and drop the oldest event if we overflow. We may wish to
+ * selectively drop older events based on type in the future.
+ */
+static void
+ipf_cfwev_report(cfwev_t *event)
+{
+ mutex_enter(&cfw_ringlock);
+ cfw_ring[cfw_ringend] = *event;
+ cfw_ringend++;
+ cfw_ringend &= cfw_ringmask;
+ if (cfw_ringfull) {
+ cfw_ringstart++;
+ cfw_ringstart &= cfw_ringmask;
+ ASSERT3U(cfw_ringstart, ==, cfw_ringend);
+ DTRACE_PROBE(ipf__cfw__evdrop);
+ cfw_evdrops++;
+ } else {
+ cfw_ringfull = (cfw_ringend == cfw_ringstart);
+ }
+ cfw_evreports++;
+ cv_broadcast(&cfw_ringcv);
+ mutex_exit(&cfw_ringlock);
+}
+
+/*
+ * Provide access to multiple CFW events that can allow copying straight from
+ * the ring buffer up to userland. Requires a callback (which could call
+ * uiomove() directly, OR to a local still-in-kernel buffer) that must do the
+ * data copying-out.
+ *
+ * Callback function is of the form:
+ *
+ * uint_t cfw_many_cb(cfwev_t *evptr, uint_t num_avail, void *cbarg);
+ *
+ * The function must return how many events got consumed, which MUST be <= the
+ * number available. The function must ALSO UNDERSTAND that cfw_ringlock is
+ * held and must not be released during this time. The function may be called
+ * more than once, if the available buffers wrap-around OR "block" is set and
+ * we don't have enough buffers. If any callback returns 0, exit the function
+ * with however many were consumed.
+ *
+ * This function, like the callback, returns the number of events *CONSUMED*.
+ *
+ * . . .
+ *
+ * Tunables for ipf_cfwev_consume_many().
+ *
+ * If you wish to attempt to coalesce reads (to reduce the likelihood of one
+ * event at a time during high load) change the number of tries below to
+ * something not 0. Early experiments set this to 10.
+ *
+ * The wait between tries is in usecs in cfw_timeout_wait. The pessimal
+ * case for this is a timeout_wait-spaced trickle of one event at a time.
+ */
+uint_t cfw_timeout_tries = 0;
+uint_t cfw_timeout_wait = 10000; /* 10ms wait. */
+
+typedef struct uio_error_s {
+ struct uio *ue_uio;
+ int ue_error;
+} uio_error_t;
+
+static uint_t
+ipf_cfwev_consume_many(uint_t num_requested, boolean_t block,
+ cfwmanycb_t cfw_many_cb, void *cbarg)
+{
+ uint_t consumed = 0, cb_consumed, contig_size;
+ uint_t timeout_tries = cfw_timeout_tries;
+ boolean_t eintr = B_FALSE;
+
+ mutex_enter(&cfw_ringlock);
+
+ while (num_requested > 0) {
+ clock_t delta;
+
+ /* Silly reality checks */
+ ASSERT3U(cfw_ringstart, <, cfw_ringsize);
+ ASSERT3U(cfw_ringend, <, cfw_ringsize);
+
+ if (cfw_ringstart > cfw_ringend || cfw_ringfull) {
+ /* We have from ringstart to the buffer's end. */
+ contig_size = cfw_ringsize - cfw_ringstart;
+ } else if (cfw_ringstart < cfw_ringend) {
+ /* We have no potential wrapping at this time. */
+ contig_size = cfw_ringend - cfw_ringstart;
+ } else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
+ /* Maybe something to consume now, try again. */
+ continue;
+ } else {
+ /* Nothing (more) to consume, return! */
+ eintr = (block && consumed == 0);
+ break;
+ }
+
+ /* Requested fewer than are contiguously available; clamp. */
+ if (num_requested < contig_size)
+ contig_size = num_requested;
+
+ cb_consumed =
+ cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size, cbarg);
+ ASSERT3U(cb_consumed, <=, contig_size);
+
+ cfw_ringstart += cb_consumed;
+ ASSERT3U(cfw_ringstart, <=, cfw_ringmask + 1);
+ cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */
+ consumed += cb_consumed;
+ cfw_ringfull = (cfw_ringfull && cb_consumed == 0);
+ if (cb_consumed < contig_size) {
+ /*
+ * Callback returned less than given.
+ * This is likely a uio error, but we have
+ * something. Get out of here.
+ */
+ break;
+ }
+ ASSERT3U(cb_consumed, ==, contig_size);
+ num_requested -= contig_size;
+
+ if (num_requested == 0) {
+ /* All done! */
+ break;
+ }
+
+ if (cfw_ringstart != cfw_ringend) {
+ /*
+ * We wrapped around the end of the buffer, and
+ * we have more available to fill our request.
+ */
+ ASSERT0(cfw_ringstart);
+ ASSERT(!cfw_ringfull);
+ continue;
+ }
+
+ /*
+ * We obtained some of the events we requested, but not all.
+ * Since we have nothing to consume, wait *a little* longer.
+ */
+ if (timeout_tries == 0)
+ break; /* Don't bother... */
+ delta = drv_usectohz(cfw_timeout_wait);
+ timeout_tries--;
+
+ switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta,
+ TR_CLOCK_TICK)) {
+ case 0:
+ /*
+ * Received signal! Return what we have OR if we have
+ * nothing, EINTR.
+ */
+ DTRACE_PROBE1(ipf__cfw__timedsignal, int, consumed);
+ eintr = (consumed == 0);
+ num_requested = 0;
+ break;
+ case -1:
+ /* Time reached! Bail with what we got. */
+ DTRACE_PROBE(ipf__cfw__timedexpired);
+ num_requested = 0;
+ break;
+ default:
+ /* Aha! We've got more! */
+ DTRACE_PROBE(ipf__cfw__moredata);
+ break;
+ }
+ }
+
+ mutex_exit(&cfw_ringlock);
+ if (eintr)
+ ((uio_error_t *)cbarg)->ue_error = EINTR;
+ return (consumed);
+}
+
+/*
+ * SmartOS likes using the zone's debug id. Make sure we squirrel that away in
+ * the ipf netstack instance if it's not there.
+ */
+static inline zoneid_t
+ifs_to_did(ipf_stack_t *ifs)
+{
+ if (ifs->ifs_zone_did == 0) {
+ zone_t *zone;
+
+ /*
+ * We can't get the zone_did at initialization time because
+ * most zone data isn't readily available then, cement the did
+ * in place now.
+ */
+ VERIFY3U(ifs->ifs_zone, !=, GLOBAL_ZONEID);
+ zone = zone_find_by_id(ifs->ifs_zone);
+ if (zone != NULL) {
+ ifs->ifs_zone_did = zone->zone_did;
+ zone_rele(zone);
+ }
+ /* Else we are either in shutdown or something weirder. */
+ }
+ return (ifs->ifs_zone_did);
+}
+
+/*
+ * ipf_block_cfwlog()
+ *
+ * Called by fr_check(). Record drop events for the global-zone data
+ * collector. Use rest-of-ipf-style names for the parameters.
+ */
+void
+ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs)
+{
+ cfwev_t event = {0};
+
+ /*
+ * We need a rule.
+ * Capture failure by using dtrace on this function's entry.
+ * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }'
+ */
+ if (fr == NULL)
+ return;
+
+ event.cfwev_type = CFWEV_BLOCK;
+ event.cfwev_length = sizeof (event);
+ /*
+ * IPF code elsewhere does the cheesy single-flag check, even though
+ * there are two flags in a rule (one for in, one for out).
+ */
+ event.cfwev_direction = (fr->fr_flags & FR_INQUE) ?
+ CFWDIR_IN : CFWDIR_OUT;
+
+ event.cfwev_protocol = fin->fin_p;
+ /*
+ * NOTE: fin_*port is in host/native order, and ICMP info is here too.
+ */
+ event.cfwev_sport = htons(fin->fin_sport);
+ event.cfwev_dport = htons(fin->fin_dport);
+
+ switch (fin->fin_v) {
+ case IPV4_VERSION:
+ IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr);
+ IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr);
+ break;
+ case IPV6_VERSION:
+ event.cfwev_saddr = fin->fin_src6.in6;
+ event.cfwev_daddr = fin->fin_dst6.in6;
+ break;
+ default:
+ /* We should never reach here, but mark it if we do. */
+ DTRACE_PROBE1(ipf__cfw__frinfo__badipversion, frinfo_t *, fin);
+ return;
+ }
+
+ /*
+ * uniqtime() is what ipf's GETKTIME() uses.
+ * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
+ */
+ uniqtime(&event.cfwev_tstamp);
+ event.cfwev_zonedid = ifs_to_did(ifs);
+ event.cfwev_ruleid = fin->fin_rule;
+ memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t));
+
+ ipf_cfwev_report(&event);
+}
+
+/*
+ * ipf_log_cfwlog()
+ *
+ * Twin of ipstate_log(), but records state events for the global-zone data
+ * collector.
+ */
+void
+ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs)
+{
+ cfwev_t event = {0};
+
+ switch (type) {
+ case ISL_NEW:
+ case ISL_CLONE:
+ event.cfwev_type = CFWEV_BEGIN;
+ break;
+ case ISL_EXPIRE:
+ case ISL_FLUSH:
+ case ISL_REMOVE:
+ case ISL_KILLED:
+ case ISL_ORPHAN:
+ /*
+ * We don't care about session disappearances in CFW logging
+ * for now. (Possible future: CFWEV_END)
+ */
+ return;
+ default:
+ event.cfwev_type = CFWEV_BLOCK;
+ break;
+ }
+
+ /*
+ * IPF code elsewhere does the cheesy single-flag check, even though
+ * there are two flags in a rule (one for in, one for out). Follow
+ * suit here.
+ */
+ event.cfwev_length = sizeof (event);
+ ASSERT(is->is_rule != NULL);
+ event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ?
+ CFWDIR_IN : CFWDIR_OUT;
+ event.cfwev_protocol = is->is_p;
+ switch (is->is_p) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ /* NOTE: is_*port is in network order. */
+ event.cfwev_sport = is->is_sport;
+ event.cfwev_dport = is->is_dport;
+ break;
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ /* Scribble the ICMP type in sport... */
+ event.cfwev_sport = is->is_icmp.ici_type;
+ break;
+ /* Other protocols leave the event's port fields empty. */
+ }
+
+ switch(is->is_v) {
+ case IPV4_VERSION:
+ IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr);
+ IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr);
+ break;
+ case IPV6_VERSION:
+ event.cfwev_saddr = is->is_src.in6;
+ event.cfwev_daddr = is->is_dst.in6;
+ break;
+ default:
+ /* Can't parse addresses if we don't know the version. Drop. */
+ DTRACE_PROBE1(ipf__cfw__ipstate__badipversion,
+ struct ipstate *, is);
+ return;
+ }
+
+ /*
+ * uniqtime() is what ipf's GETKTIME() uses.
+ * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
+ */
+ uniqtime(&event.cfwev_tstamp);
+ event.cfwev_zonedid = ifs_to_did(ifs);
+ event.cfwev_ruleid = is->is_rulen;
+ memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t));
+
+ ipf_cfwev_report(&event);
+}
+
+/*
+ * Callback routine we use for ipf_cfwev_consume_many().
+ * Returning 0 means error indication.
+ */
+static uint_t
+cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
+{
+ uio_error_t *ue = (uio_error_t *)cbarg;
+
+ ASSERT(MUTEX_HELD(&cfw_ringlock));
+
+ if (ue->ue_error != 0)
+ return (0);
+
+ ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr),
+ UIO_READ, ue->ue_uio);
+ if (ue->ue_error != 0)
+ return (0);
+
+ return (num_avail);
+}
+
+/*
+ * Resize the CFW event ring buffer.
+ *
+ * The caller must ensure the new size is a power of 2 between
+ * IPF_CFW_{MIN,MAX}_RING_BUFS (inclusive) or the special values
+ * IPF_CFW_RING_ALLOCATE (first-time creation) or IPF_CFW_RING_DESTROY
+ * (netstack-unload destruction).
+ *
+ * Everything in the current ring will be destroyed (and reported as a drop)
+ * upon resize.
+ */
+int
+ipf_cfw_ring_resize(uint32_t newsize)
+{
+ ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE ||
+ newsize == IPF_CFW_RING_DESTROY);
+
+ if (newsize == IPF_CFW_RING_ALLOCATE) {
+ if (cfw_ring != NULL)
+ return (EBUSY);
+ newsize = IPF_CFW_DEFAULT_RING_BUFS;
+ /* Fall through to allocating a new ring buffer. */
+ } else {
+ /* We may be called during error cleanup, so be liberal here. */
+ if ((cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY) ||
+ newsize == cfw_ringsize) {
+ return (0);
+ }
+ kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t));
+ cfw_ring = NULL;
+ if (cfw_ringfull) {
+ cfw_evdrops += cfw_ringsize;
+ } else if (cfw_ringstart > cfw_ringend) {
+ cfw_evdrops += cfw_ringend +
+ (cfw_ringsize - cfw_ringstart);
+ } else {
+ cfw_evdrops += cfw_ringend - cfw_ringstart;
+ }
+ cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0;
+ cfw_ringfull = B_FALSE;
+
+ if (newsize == IPF_CFW_RING_DESTROY)
+ return (0);
+ /*
+ * Keep the reports & drops around because if we're just
+ * resizing, we need to know what we lost.
+ */
+ }
+
+ ASSERT(ISP2(newsize));
+ cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP);
+ /* KM_SLEEP means we always succeed. */
+ cfw_ringsize = newsize;
+ cfw_ringmask = cfw_ringsize - 1;
+
+ return (0);
+}
+
+/*
+ * ioctl handler for /dev/ipfev. Only supports SIOCIPFCFWCFG (get data
+ * collector statistics and configuration), and SIOCIPFCFWNEWSZ (resize the
+ * event ring buffer).
+ */
+/* ARGSUSED */
+int
+ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp,
+ int *rp)
+{
+ ipfcfwcfg_t cfginfo;
+ int error;
+
+ if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ)
+ return (EIO);
+
+ if (crgetzoneid(cp) != GLOBAL_ZONEID)
+ return (EACCES);
+
+ error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo));
+ if (error != 0)
+ return (EFAULT);
+
+ cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t);
+ mutex_enter(&cfw_ringlock);
+ cfginfo.ipfcfwc_evreports = cfw_evreports;
+ if (cmd == SIOCIPFCFWNEWSZ) {
+ uint32_t newsize = cfginfo.ipfcfwc_evringsize;
+
+ /* Do ioctl parameter checking here, then call the resizer. */
+ if (newsize < IPF_CFW_MIN_RING_BUFS ||
+ newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) {
+ error = EINVAL;
+ } else {
+ error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize);
+ }
+ } else {
+ error = 0;
+ }
+ /* Both cfw_evdrops and cfw_ringsize are affected by resize. */
+ cfginfo.ipfcfwc_evdrops = cfw_evdrops;
+ cfginfo.ipfcfwc_evringsize = cfw_ringsize;
+ mutex_exit(&cfw_ringlock);
+
+ if (error != 0)
+ return (error);
+
+ error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo));
+ if (error != 0)
+ return (EFAULT);
+
+ return (0);
+}
+
+/*
+ * Send events up via /dev/ipfev reads. Will return only complete events.
+ */
+/* ARGSUSED */
+int
+ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp)
+{
+ uint_t requested, consumed;
+ uio_error_t ue = {uio, 0};
+ boolean_t block;
+
+ if (uio->uio_resid == 0)
+ return (0);
+ if (uio->uio_resid < sizeof (cfwev_t))
+ return (EINVAL);
+
+ block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0);
+ requested = uio->uio_resid / sizeof (cfwev_t);
+
+ /*
+ * As stated earlier, ipf_cfwev_consume_many() takes a callback.
+ * The callback may be called multiple times before we return.
+ * The callback will execute uiomove().
+ */
+ consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb,
+ &ue);
+ ASSERT3U(consumed, <=, requested);
+ if (!block && consumed == 0 && ue.ue_error == 0) {
+ /* No data available. */
+ ue.ue_error = EWOULDBLOCK;
+ } else if (ue.ue_error != 0 && ue.ue_error != EINTR) {
+ /*
+ * We had a problem that wasn't simply a
+ * case of cv_wait_sig() receiving a signal.
+ */
+ DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed);
+ mutex_enter(&cfw_ringlock);
+ cfw_evdrops += consumed;
+ mutex_exit(&cfw_ringlock);
+ }
+ return (ue.ue_error);
+}
+
+#else /* _KERNEL */
+
+/* Blank stubs to satisfy userland's test compilations. */
+
+int
+ipf_cfw_ring_resize(uint32_t a)
+{
+ return (0);
+}
+
+void
+ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c)
+{
+}
+
+void
+ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c)
+{
+}
+
+#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ipf/fil.c b/usr/src/uts/common/inet/ipf/fil.c
index 78980be106..48fa6e7325 100644
--- a/usr/src/uts/common/inet/ipf/fil.c
+++ b/usr/src/uts/common/inet/ipf/fil.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#if defined(KERNEL) || defined(_KERNEL)
@@ -2588,6 +2588,9 @@ ipf_stack_t *ifs;
}
#endif
+ if (IFS_CFWLOG(ifs, fr) && FR_ISBLOCK(pass))
+ ipf_block_cfwlog(fr, fin, ifs);
+
/*
* The FI_STATE flag is cleared here so that calling fr_checkstate
* will work when called from inside of fr_fastroute. Although
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index c9d5f03e13..0d34e0fce3 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#if !defined(lint)
@@ -85,6 +85,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+ void *));
static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *));
static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t,
@@ -116,7 +124,7 @@ u_long *ip_forwarding = NULL;
#endif
vmem_t *ipf_minor; /* minor number arena */
-void *ipf_state; /* DDI state */
+void *ipf_state; /* DDI state */
/*
* GZ-controlled and per-zone stacks:
@@ -141,28 +149,38 @@ void *ipf_state; /* DDI state */
*/
/* IPv4 hook names */
-char *hook4_nicevents = "ipfilter_hook4_nicevents";
-char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz";
-char *hook4_in = "ipfilter_hook4_in";
-char *hook4_in_gz = "ipfilter_hook4_in_gz";
-char *hook4_out = "ipfilter_hook4_out";
-char *hook4_out_gz = "ipfilter_hook4_out_gz";
-char *hook4_loop_in = "ipfilter_hook4_loop_in";
-char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz";
-char *hook4_loop_out = "ipfilter_hook4_loop_out";
-char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz";
+char *hook4_nicevents = "ipfilter_hook4_nicevents";
+char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz";
+char *hook4_in = "ipfilter_hook4_in";
+char *hook4_in_gz = "ipfilter_hook4_in_gz";
+char *hook4_out = "ipfilter_hook4_out";
+char *hook4_out_gz = "ipfilter_hook4_out_gz";
+char *hook4_loop_in = "ipfilter_hook4_loop_in";
+char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz";
+char *hook4_loop_out = "ipfilter_hook4_loop_out";
+char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz";
/* IPv6 hook names */
-char *hook6_nicevents = "ipfilter_hook6_nicevents";
-char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz";
-char *hook6_in = "ipfilter_hook6_in";
-char *hook6_in_gz = "ipfilter_hook6_in_gz";
-char *hook6_out = "ipfilter_hook6_out";
-char *hook6_out_gz = "ipfilter_hook6_out_gz";
-char *hook6_loop_in = "ipfilter_hook6_loop_in";
-char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
-char *hook6_loop_out = "ipfilter_hook6_loop_out";
-char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+char *hook6_nicevents = "ipfilter_hook6_nicevents";
+char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz";
+char *hook6_in = "ipfilter_hook6_in";
+char *hook6_in_gz = "ipfilter_hook6_in_gz";
+char *hook6_out = "ipfilter_hook6_out";
+char *hook6_out_gz = "ipfilter_hook6_out_gz";
+char *hook6_loop_in = "ipfilter_hook6_loop_in";
+char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
+char *hook6_loop_out = "ipfilter_hook6_loop_out";
+char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in";
+char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz";
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in";
+char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz";
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out";
+char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz";
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out";
+char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz";
/* viona hook names */
char *hook_viona_in = "ipfilter_hookviona_in";
@@ -170,6 +188,39 @@ char *hook_viona_in_gz = "ipfilter_hookviona_in_gz";
char *hook_viona_out = "ipfilter_hookviona_out";
char *hook_viona_out_gz = "ipfilter_hookviona_out_gz";
+/*
+ * For VIONA. The net_{instance,protocol}_notify_register() functions only
+ * deal with per-callback-function granularity. We need two wrapper functions
+ * for GZ-controlled and per-zone instances.
+ */
+static int
+ipf_hook_instance_notify_gz(hook_notify_cmd_t command, void *arg,
+ const char *netid, const char *dummy, const char *instance)
+{
+ return (ipf_hook_instance_notify(command, arg, netid, dummy, instance));
+}
+
+static int
+ipf_hook_instance_notify_ngz(hook_notify_cmd_t command, void *arg,
+ const char *netid, const char *dummy, const char *instance)
+{
+ return (ipf_hook_instance_notify(command, arg, netid, dummy, instance));
+}
+
+static int
+ipf_hook_protocol_notify_gz(hook_notify_cmd_t command, void *arg,
+ const char *name, const char *dummy, const char *he_name)
+{
+ return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name));
+}
+
+static int
+ipf_hook_protocol_notify_ngz(hook_notify_cmd_t command, void *arg,
+ const char *name, const char *dummy, const char *he_name)
+{
+ return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name));
+}
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -267,10 +318,36 @@ ipf_stack_t *ifs;
}
/*
+ * Remove VND hooks
+ */
+ if (ifs->ifs_ipf_vndl3v4 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v4 = NULL;
+ }
+
+ if (ifs->ifs_ipf_vndl3v6 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v6 = NULL;
+ }
+
+ /*
* Remove notification of viona hooks
*/
net_instance_notify_unregister(ifs->ifs_netid,
- ipf_hook_instance_notify);
+ ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz :
+ ipf_hook_instance_notify_ngz);
#undef UNDO_HOOK
@@ -278,6 +355,10 @@ ipf_stack_t *ifs;
* Normally, viona will unregister itself before ipldetach() is called,
* so these will be no-ops, but out of caution, we try to make sure
* we've removed any of our references.
+ *
+ * For now, the _gz and _ngz versions are both wrappers to what's
+ * below. Just call it directly, but if that changes fix here as
+ * well.
*/
(void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL,
NH_PHYSICAL_IN);
@@ -295,6 +376,10 @@ ipf_stack_t *ifs;
* traced, we pass the same value the nethook framework would
* pass, even though the callback does not currently use the
* value.
+ *
+ * For now, the _gz and _ngz versions are both wrappers to
+ * what's below. Just call it directly, but if that changes
+ * fix here as well.
*/
(void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr,
NULL, Hn_VIONA);
@@ -495,6 +580,49 @@ ipf_stack_t *ifs;
}
/*
+ * Add VND INET hooks
+ */
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+ if (ifs->ifs_ipf_vndl3v4 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+ hook4_vnd_in, hook4_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+ hook4_vnd_out, hook4_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_out)
+ goto hookup_failed;
+
+
+ /*
+ * VND INET6 hooks
+ */
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+ if (ifs->ifs_ipf_vndl3v6 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+ hook6_vnd_in, hook6_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+ hook6_vnd_out, hook6_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_out)
+ goto hookup_failed;
+
+ /*
* VIONA INET hooks. While the nethook framework allows us to register
* hooks for events that haven't been registered yet, we instead
* register and unregister our hooks in response to notifications
@@ -504,9 +632,15 @@ ipf_stack_t *ifs;
* is unloaded, the viona module cannot later re-register them if it
* gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded
* even on DEBUG kernels, they do not experience this issue.
+ *
+ * Today, the per-zone ones don't matter for a BHYVE-branded zone, BUT
+ * the ipf_hook_protocol_notify() function is GZ vs. per-zone aware.
+ * Employ two different versions of ipf_hook_instance_notify(), one for
+ * the GZ-controlled, and one for the per-zone one.
*/
- if (net_instance_notify_register(id, ipf_hook_instance_notify,
- ifs) != 0)
+ if (net_instance_notify_register(id, ifs->ifs_gz_controlled ?
+ ipf_hook_instance_notify_gz : ipf_hook_instance_notify_ngz, ifs) !=
+ 0)
goto hookup_failed;
/*
@@ -688,6 +822,7 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
{
ipf_stack_t *ifs = arg;
int ret = 0;
+ const boolean_t gz = ifs->ifs_gz_controlled;
/* We currently only care about viona hooks */
if (strcmp(instance, Hn_VIONA) != 0)
@@ -705,14 +840,16 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
return (EPROTONOSUPPORT);
ret = net_protocol_notify_register(ifs->ifs_ipf_viona,
- ipf_hook_protocol_notify, ifs);
+ gz ? ipf_hook_protocol_notify_gz :
+ ipf_hook_protocol_notify_ngz, ifs);
VERIFY(ret == 0 || ret == ESHUTDOWN);
break;
case HN_UNREGISTER:
if (ifs->ifs_ipf_viona == NULL)
break;
VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona,
- ipf_hook_protocol_notify));
+ gz ? ipf_hook_protocol_notify_gz :
+ ipf_hook_protocol_notify_ngz));
VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
ifs->ifs_ipf_viona = NULL;
break;
@@ -821,6 +958,9 @@ int *rp;
return ENXIO;
unit = isp->ipfs_minor;
+ if (unit == IPL_LOGEV)
+ return (ipf_cfwlog_ioctl(dev, cmd, data, mode, cp, rp));
+
zid = crgetzoneid(cp);
if (cmd == SIOCIPFZONESET) {
if (zid == GLOBAL_ZONEID)
@@ -1129,14 +1269,14 @@ ipf_stack_t *ifs;
{
net_handle_t nif;
- if (v == 4)
- nif = ifs->ifs_ipf_ipv4;
- else if (v == 6)
- nif = ifs->ifs_ipf_ipv6;
- else
- return 0;
-
- return (net_phylookup(nif, name));
+ if (v == 4)
+ nif = ifs->ifs_ipf_ipv4;
+ else if (v == 6)
+ nif = ifs->ifs_ipf_ipv6;
+ else
+ return 0;
+
+ return (net_phylookup(nif, name));
}
/*
@@ -1161,11 +1301,35 @@ cred_t *cred;
if (IPL_LOGMAX < min)
return ENXIO;
+ /* Special-case ipfev: global-zone-open only. */
+ if (min == IPL_LOGEV) {
+ if (crgetzoneid(cred) != GLOBAL_ZONEID)
+ return (ENXIO);
+ /*
+ * Else enable the CFW logging of events.
+ * NOTE: For now, we only allow one open at a time.
+ * Use atomic_cas to confirm/deny. And also for now,
+ * assume sizeof (boolean_t) == sizeof (uint_t).
+ *
+ * Per the *_{refrele,REFRELE}() in other parts of inet,
+ * ensure all loads/stores complete before calling cas.
+ * membar_exit() does this.
+ */
+ membar_exit();
+ if (atomic_cas_uint(&ipf_cfwlog_enabled, 0, 1) != 0)
+ return (EBUSY);
+ }
+
minor = (minor_t)(uintptr_t)vmem_alloc(ipf_minor, 1,
VM_BESTFIT | VM_SLEEP);
if (ddi_soft_state_zalloc(ipf_state, minor) != 0) {
vmem_free(ipf_minor, (void *)(uintptr_t)minor, 1);
+ if (min == IPL_LOGEV) {
+ /* See above... */
+ membar_exit();
+ VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+ }
return ENXIO;
}
@@ -1187,6 +1351,7 @@ int flags, otype;
cred_t *cred;
{
minor_t min = getminor(dev);
+ ipf_devstate_t *isp;
#ifdef IPFDEBUG
cmn_err(CE_CONT, "iplclose(%x,%x,%x,%x)\n", dev, flags, otype, cred);
@@ -1195,6 +1360,15 @@ cred_t *cred;
if (IPL_LOGMAX < min)
return ENXIO;
+ isp = ddi_get_soft_state(ipf_state, min);
+ if (isp != NULL && isp->ipfs_minor == IPL_LOGEV) {
+ /*
+ * Disable CFW logging. See iplopen() for details.
+ */
+ membar_exit();
+ VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+ }
+
ddi_soft_state_free(ipf_state, min);
vmem_free(ipf_minor, (void *)(uintptr_t)min, 1);
@@ -1225,6 +1399,8 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
+ if (unit == IPL_LOGEV)
+ return (ipf_cfwlog_read(dev, uio, cp));
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
@@ -1277,6 +1453,9 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
+ if (unit == IPL_LOGEV)
+ return (EIO); /* ipfev doesn't support write yet. */
+
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
*/
@@ -2068,8 +2247,11 @@ frdest_t *fdp;
return (-1);
}
- /* Check the src here, fin_ifp is the src interface. */
- if (!(fin->fin_flx & FI_GENERATED) &&
+ /*
+ * If we're forwarding (vs. injecting), check the src here, fin_ifp is
+ * the src interface.
+ */
+ if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
!fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) {
return (-1);
}
@@ -2138,8 +2320,8 @@ frdest_t *fdp;
inj->ni_physical = net_routeto(net_data_p, sinp, NULL);
}
- /* we're checking the destination here */
- if (!(fin->fin_flx & FI_GENERATED) &&
+	/* If we're forwarding (vs. injecting), check the destination here. */
+ if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
!fr_forwarding_enabled(inj->ni_physical, net_data_p)) {
goto bad_fastroute;
}
@@ -2355,6 +2537,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
}
/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookvndl3_in */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The vnd hooks are private hooks to ON. They represent a layer 2  */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the */
+/* traditional packet hook flags. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
/* Function: ipf_hookviona_{in,out} */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
/* Parameters: event(I) - pointer to event */
@@ -3120,16 +3338,16 @@ fr_info_t *fin;
/* both IP versions. The details are going to be explained here. */
/* */
/* The packet looks as follows: */
-/* xxx | IP hdr | IP payload ... | */
-/* ^ ^ ^ ^ */
-/* | | | | */
+/* xxx | IP hdr | IP payload ... | */
+/* ^ ^ ^ ^ */
+/* | | | | */
/* | | | fin_m->b_wptr = fin->fin_dp + fin->fin_dlen */
/* | | | */
/* | | `- fin_m->fin_dp (in case of IPv4 points to L4 header) */
/* | | */
/* | `- fin_m->b_rptr + fin_ipoff (fin_ipoff is most likely 0 in case */
/* | of loopback) */
-/* | */
+/* | */
/* `- fin_m->b_rptr - points to L2 header in case of physical NIC */
/* */
/* All relevant IP headers are pulled up into the first mblk. It happened */
diff --git a/usr/src/uts/common/inet/ipf/ip_log.c b/usr/src/uts/common/inet/ipf/ip_log.c
index 584ee42d9a..b70e320def 100644
--- a/usr/src/uts/common/inet/ipf/ip_log.c
+++ b/usr/src/uts/common/inet/ipf/ip_log.c
@@ -8,7 +8,7 @@
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/param.h>
@@ -373,9 +373,11 @@ u_int flags;
if (fin->fin_fr != NULL) {
ipfl.fl_loglevel = fin->fin_fr->fr_loglevel;
ipfl.fl_logtag = fin->fin_fr->fr_logtag;
+ bcopy(fin->fin_fr->fr_uuid, ipfl.fl_uuid, sizeof (uuid_t));
} else {
ipfl.fl_loglevel = 0xffff;
ipfl.fl_logtag = FR_NOLOGTAG;
+ bzero(ipfl.fl_uuid, sizeof (uuid_t));
}
if (fin->fin_nattag != NULL)
bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag,
diff --git a/usr/src/uts/common/inet/ipf/ip_state.c b/usr/src/uts/common/inet/ipf/ip_state.c
index 184f8775b6..a45bcbfdaf 100644
--- a/usr/src/uts/common/inet/ipf/ip_state.c
+++ b/usr/src/uts/common/inet/ipf/ip_state.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#if defined(KERNEL) || defined(_KERNEL)
@@ -108,6 +108,7 @@ struct file;
# include <sys/systm.h>
# endif
#endif
+#include <sys/uuid.h>
/* END OF INCLUDES */
@@ -1445,6 +1446,7 @@ u_int flags;
is->is_sti.tqe_flags |= TQE_RULEBASED;
}
is->is_tag = fr->fr_logtag;
+ memcpy(is->is_uuid, fr->fr_uuid, sizeof (uuid_t));
is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1];
is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2];
@@ -1524,6 +1526,9 @@ u_int flags;
if (ifs->ifs_ipstate_logging)
ipstate_log(is, ISL_NEW, ifs);
+ if (IFS_CFWLOG(ifs, is->is_rule))
+ ipf_log_cfwlog(is, ISL_NEW, ifs);
+
RWLOCK_EXIT(&ifs->ifs_ipf_state);
fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr);
fin->fin_flx |= FI_STATE;
@@ -2314,6 +2319,8 @@ u_32_t cmask;
is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT);
if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging)
ipstate_log(is, ISL_CLONE, ifs);
+ if ((flags & SI_CLONED) && IFS_CFWLOG(ifs, is->is_rule))
+ ipf_log_cfwlog(is, ISL_CLONE, ifs);
}
ret = -1;
@@ -3397,6 +3404,15 @@ ipf_stack_t *ifs;
if (ifs->ifs_ipstate_logging != 0 && why != 0)
ipstate_log(is, why, ifs);
+ /*
+ * For now, ipf_log_cfwlog() copes with all "why" values. Strictly
+ * speaking, though, they all map to one event (CFWEV_END), which for
+ * now is not supported, hence no code calling ipf_log_cfwlog() like
+ * below:
+ *
+ * if (why != 0 && IFS_CFWLOG(ifs, is->is_rule))
+ * ipf_log_cfwlog(is, why, ifs);
+ */
if (is->is_rule != NULL) {
is->is_rule->fr_statecnt--;
@@ -3931,7 +3947,6 @@ int flags;
return rval;
}
-
/* ------------------------------------------------------------------------ */
/* Function: ipstate_log */
/* Returns: Nil */
diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf
index 6b36f9fdbf..f49e024a72 100644
--- a/usr/src/uts/common/inet/ipf/ipf.conf
+++ b/usr/src/uts/common/inet/ipf/ipf.conf
@@ -1,3 +1,8 @@
#
#
name="ipf" parent="pseudo" instance=0;
+
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize,
+# and both should be prime numbers
+fr_statesize=151007;
+fr_statemax=113279;
diff --git a/usr/src/uts/common/inet/ipf/netinet/Makefile b/usr/src/uts/common/inet/ipf/netinet/Makefile
index cca3b48ac4..88f91e633f 100644
--- a/usr/src/uts/common/inet/ipf/netinet/Makefile
+++ b/usr/src/uts/common/inet/ipf/netinet/Makefile
@@ -1,16 +1,15 @@
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
+# Copyright 2019 Joyent, Inc.
#
# uts/common/inet/ipf/netinet/Makefile
#
# include global definitions
include ../../../../../Makefile.master
-HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h \
- ip_frag.h ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h
+HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h ip_frag.h \
+ ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h ipf_cfw.h
ROOTDIRS= $(ROOT)/usr/include/netinet
diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h
index 4c3c5683b5..bb5ce7bd6c 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h
@@ -8,7 +8,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef __IP_FIL_H__
@@ -16,6 +16,7 @@
#include "netinet/ip_compat.h"
#include <sys/zone.h>
+#include <sys/uuid.h>
#ifdef SOLARIS
#undef SOLARIS
@@ -115,6 +116,8 @@
#define SIOCDELFR SIOCRMAFR
#define SIOCINSFR SIOCINAFR
# define SIOCIPFZONESET _IOWR('r', 97, struct ipfzoneobj)
+# define SIOCIPFCFWCFG _IOR('r', 98, struct ipfcfwcfg)
+# define SIOCIPFCFWNEWSZ _IOWR('r', 99, struct ipfcfwcfg)
/*
* What type of table is getting flushed?
@@ -600,6 +603,7 @@ typedef struct frentry {
u_32_t fr_flags; /* per-rule flags && options (see below) */
u_32_t fr_logtag; /* user defined log tag # */
u_32_t fr_collect; /* collection number */
+ uuid_t fr_uuid; /* user defined uuid */
u_int fr_arg; /* misc. numeric arg for rule */
u_int fr_loglevel; /* syslog log facility + priority */
u_int fr_age[2]; /* non-TCP timeouts */
@@ -728,6 +732,7 @@ typedef struct frentry {
#define FR_NEWISN 0x400000 /* new ISN for outgoing TCP */
#define FR_NOICMPERR 0x800000 /* do not match ICMP errors in state */
#define FR_STATESYNC 0x1000000 /* synchronize state to slave */
+#define FR_CFWLOG 0x2000000 /* Global CFW logging enabled */
#define FR_NOMATCH 0x8000000 /* no match occured */
/* 0x10000000 FF_LOGPASS */
/* 0x20000000 FF_LOGBLOCK */
@@ -883,6 +888,7 @@ typedef struct ipflog {
u_32_t fl_lflags;
u_32_t fl_logtag;
ipftag_t fl_nattag;
+ uuid_t fl_uuid;
u_short fl_plen; /* extra data after hlen */
u_short fl_loglevel; /* syslog log level */
char fl_group[FR_GROUPLEN];
@@ -931,6 +937,7 @@ typedef struct ipflog {
#define IPSYNC_NAME "/dev/ipsync"
#define IPSCAN_NAME "/dev/ipscan"
#define IPLOOKUP_NAME "/dev/iplookup"
+#define IPFEV_NAME "/dev/ipfev"
#define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */
#define IPL_LOGNAT 1
@@ -939,8 +946,9 @@ typedef struct ipflog {
#define IPL_LOGSYNC 4
#define IPL_LOGSCAN 5
#define IPL_LOGLOOKUP 6
-#define IPL_LOGCOUNT 7
-#define IPL_LOGMAX 7
+#define IPL_LOGEV 7
+#define IPL_LOGCOUNT 8
+#define IPL_LOGMAX 8
#define IPL_LOGSIZE (IPL_LOGMAX + 1)
#define IPL_LOGALL -1
#define IPL_LOGNONE -2
@@ -1181,6 +1189,21 @@ typedef struct ipfzoneobj {
char ipfz_zonename[ZONENAME_MAX]; /* zone to act on */
} ipfzoneobj_t;
+/* ioctl to grab CFW logging parameters */
+typedef struct ipfcfwcfg {
+ /* CFG => Max event size, NEWSZ => ignored in, like CFG out. */
+ uint32_t ipfcfwc_maxevsize;
+ /*
+ * CFG => Current ring size,
+ * NEWSZ => New ring size, must be 2^N for 3 <= N <= 31.
+ */
+ uint32_t ipfcfwc_evringsize;
+ /* CFG => Number of event reports, NEWSZ => ignored in, like CFG out. */
+ uint64_t ipfcfwc_evreports;
+ /* CFG => Number of event drops, NEWSZ => ignored in, like CFG out. */
+ uint64_t ipfcfwc_evdrops;
+} ipfcfwcfg_t;
+
#if defined(_KERNEL)
/* Set ipfs_zoneid to this if no zone has been set: */
#define IPFS_ZONE_UNSET -2
@@ -1560,6 +1583,23 @@ extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int,
ipf_stack_t *));
extern void fr_logunload __P((ipf_stack_t *));
+/* SmartOS single-FD global-zone state accumulator (see cfw.c) */
+extern boolean_t ipf_cfwlog_enabled;
+struct ipstate; /* Ugggh. */
+extern void ipf_log_cfwlog __P((struct ipstate *, uint_t, ipf_stack_t *));
+extern void ipf_block_cfwlog __P((frentry_t *, fr_info_t *, ipf_stack_t *));
+#define IFS_CFWLOG(ifs, fr) ((ifs)->ifs_gz_controlled && ipf_cfwlog_enabled &&\
+ fr != NULL && ((fr)->fr_flags & FR_CFWLOG))
+struct cfwev_s; /* See ipf_cfw.h */
+extern boolean_t ipf_cfwev_consume __P((struct cfwev_s *, boolean_t));
+/* See cfw.c's ipf_cfwev_consume_many() for details. */
+typedef uint_t (*cfwmanycb_t) __P((struct cfwev_s *, uint_t, void *));
+extern int ipf_cfwlog_read __P((dev_t, struct uio *, struct cred *));
+extern int ipf_cfwlog_ioctl __P((dev_t, int, intptr_t, int, cred_t *, int *));
+#define IPF_CFW_RING_ALLOCATE 0
+#define IPF_CFW_RING_DESTROY 1
+extern int ipf_cfw_ring_resize(uint32_t);
+
extern frentry_t *fr_acctpkt __P((fr_info_t *, u_32_t *));
extern int fr_copytolog __P((int, char *, int));
extern u_short fr_cksum __P((mb_t *, ip_t *, int, void *));
diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_state.h b/usr/src/uts/common/inet/ipf/netinet/ip_state.h
index 4c605c1b89..ef315d5ef1 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ip_state.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ip_state.h
@@ -8,11 +8,14 @@
*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef __IP_STATE_H__
#define __IP_STATE_H__
+#include <sys/uuid.h>
+
#if defined(__STDC__) || defined(__GNUC__) || defined(_AIX51)
# define SIOCDELST _IOW('r', 61, struct ipfobj)
#else
@@ -66,6 +69,7 @@ typedef struct ipstate {
/* in both directions */
u_32_t is_optmsk[2]; /* " " mask */
/* in both directions */
+ uuid_t is_uuid;
u_short is_sec; /* security options set */
u_short is_secmsk; /* " " mask */
u_short is_auth; /* authentication options set */
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h
new file mode 100644
index 0000000000..1972d2b3f7
--- /dev/null
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef __IPF_CFW_H__
+#define __IPF_CFW_H__
+
+#include <sys/types.h>
+#include <inet/ip6.h>
+#include <sys/uuid.h>
+
+/* Because ipf compiles this kernel file in userland testing... */
+#ifndef ASSERT3U
+#define ASSERT3U(a, b, c) ASSERT((a) b (c));
+#endif /* ASSERT3U */
+
+/*
+ * CFW Event, which is emitted to a global-zone listener. The global-zone
+ * listener solves the one-fd-per-zone problem of using each zone's ipmon.
+ *
+ * These must be 64-bit aligned because they form an array in-kernel. There
+ * might be reserved fields to ensure that alignment.
+ */
+#define CFWEV_BLOCK 1
+#define CFWEV_BEGIN 2
+#define CFWEV_END 3
+#define CFWDIR_IN 1
+#define CFWDIR_OUT 2
+
+typedef struct cfwev_s {
+ uint16_t cfwev_type; /* BEGIN, END, BLOCK */
+ uint16_t cfwev_length; /* in bytes, so capped to 65535 bytes */
+ zoneid_t cfwev_zonedid; /* Pullable from ipf_stack_t. */
+
+ uint32_t cfwev_ruleid; /* Pullable from fr_info_t. */
+ uint16_t cfwev_sport; /* Source port (network order) */
+ uint16_t cfwev_dport; /* Dest. port (network order) */
+
+ uint8_t cfwev_protocol; /* IPPROTO_* */
+ /* "direction" informs if src/dst are local/remote or remote/local. */
+ uint8_t cfwev_direction;
+ uint8_t cfwev_reserved[6]; /* Ensures 64-bit alignment. */
+
+ in6_addr_t cfwev_saddr; /* IPv4 addresses are V4MAPPED. */
+ in6_addr_t cfwev_daddr;
+
+ /*
+ * Because of 'struct timeval' being different between 32-bit and
+ * 64-bit ABIs, this interface is only usable by 64-bit binaries.
+ */
+ struct timeval cfwev_tstamp;
+
+ uuid_t cfwev_ruleuuid; /* Pullable from fr_info_t. */
+} cfwev_t;
+
+
+
+#endif /* __IPF_CFW_H__ */
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index 0ceea1e921..0b2a8d826f 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -6,7 +6,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2018 Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef __IPF_STACK_H__
@@ -46,6 +46,7 @@ struct ipf_stack {
struct ipf_stack *ifs_gz_cont_ifs;
netid_t ifs_netid;
zoneid_t ifs_zone;
+ zoneid_t ifs_zone_did;
boolean_t ifs_gz_controlled;
/* ipf module */
@@ -126,6 +127,11 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookvndl3v4_in;
+ hook_t *ifs_ipfhookvndl3v6_in;
+ hook_t *ifs_ipfhookvndl3v4_out;
+ hook_t *ifs_ipfhookvndl3v6_out;
+
hook_t *ifs_ipfhookviona_in;
hook_t *ifs_ipfhookviona_out;
@@ -140,12 +146,18 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookvndl3v4_physical_in;
+ boolean_t ifs_hookvndl3v6_physical_in;
+ boolean_t ifs_hookvndl3v4_physical_out;
+ boolean_t ifs_hookvndl3v6_physical_out;
boolean_t ifs_hookviona_physical_in;
boolean_t ifs_hookviona_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_vndl3v4;
+ net_handle_t ifs_ipf_vndl3v6;
net_handle_t ifs_ipf_viona;
/* ip_auth.c */
@@ -305,6 +317,7 @@ struct ipf_stack {
char *ifs_addmask_key;
char *ifs_rn_zeros;
char *ifs_rn_ones;
+
#ifdef KERNEL
/* kstats for inbound and outbound */
kstat_t *ifs_kstatp[2];
diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c
index c541f4dddc..5ccbfa3188 100644
--- a/usr/src/uts/common/inet/ipf/solaris.c
+++ b/usr/src/uts/common/inet/ipf/solaris.c
@@ -6,7 +6,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -116,7 +116,7 @@ static void ipf_stack_shutdown __P((const netid_t, void *));
static int ipf_property_g_update __P((dev_info_t *));
static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME,
IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME,
- IPLOOKUP_NAME, NULL };
+ IPLOOKUP_NAME, IPFEV_NAME, NULL };
extern void *ipf_state; /* DDI state */
extern vmem_t *ipf_minor; /* minor number arena */
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
/*
* Destroy things for ipf for one stack.
*/
-/* ARGSUSED */
static void
ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
{
@@ -742,6 +741,9 @@ ddi_attach_cmd_t cmd;
ipf_dev_info = dip;
+ if (ipf_cfw_ring_resize(IPF_CFW_RING_ALLOCATE) != 0)
+ goto attach_failed;
+
ipfncb = net_instance_alloc(NETINFO_VERSION);
if (ipfncb == NULL)
goto attach_failed;
@@ -769,6 +771,7 @@ ddi_attach_cmd_t cmd;
}
attach_failed:
+ (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
ddi_remove_minor_node(dip, NULL);
ddi_prop_remove_all(dip);
ddi_soft_state_fini(&ipf_state);
@@ -796,6 +799,7 @@ ddi_detach_cmd_t cmd;
* framework guarantees we are not active with this devinfo
* node in any other entry points at this time.
*/
+ (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
ddi_prop_remove_all(dip);
i = ddi_get_instance(dip);
ddi_remove_minor_node(dip, NULL);