diff options
Diffstat (limited to 'usr/src/uts/common/inet/ipf')
-rw-r--r-- | usr/src/uts/common/inet/ipf/cfw.c | 659 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/fil.c | 5 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/ip_fil_solaris.c | 304 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/ip_log.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/ip_state.c | 19 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/ipf.conf | 5 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/netinet/Makefile | 7 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/netinet/ip_fil.h | 46 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/netinet/ip_state.h | 4 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h | 69 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/netinet/ipf_stack.h | 15 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipf/solaris.c | 10 |
12 files changed, 1089 insertions, 58 deletions
diff --git a/usr/src/uts/common/inet/ipf/cfw.c b/usr/src/uts/common/inet/ipf/cfw.c new file mode 100644 index 0000000000..941aeac328 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/cfw.c @@ -0,0 +1,659 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* IPF oddness for compilation in userland for IPF tests. */ +#if defined(KERNEL) || defined(_KERNEL) +#undef KERNEL +#undef _KERNEL +#define KERNEL 1 +#define _KERNEL 1 +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include "netinet/ip_compat.h" +#ifdef USE_INET6 +#include <netinet/icmp6.h> +#endif +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_auth.h" +#include "netinet/ipf_stack.h" +#ifdef IPFILTER_SCAN +#include "netinet/ip_scan.h" +#endif +#ifdef IPFILTER_SYNC +#include "netinet/ip_sync.h" +#endif +#include "netinet/ip_pool.h" +#include "netinet/ip_htable.h" +#ifdef IPFILTER_COMPILED +#include "netinet/ip_rules.h" +#endif +#if defined(_KERNEL) +#include <sys/sunddi.h> +#endif + +#include "netinet/ipf_cfw.h" +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/ddi.h> + +/* + * cfw == Cloud Firewall ==> routines for a global-zone data collector about + * ipf 
events for SmartOS. The only ones that CFW cares about are ones + * enforced by global-zone-controlled rulesets. + * + * The variable below is tied into the GZ-only ipf device /dev/ipfev, that + * flips this on when there is an open instance. This feature will also + * consume an fr_flag to have per-rule granularity. + */ +boolean_t ipf_cfwlog_enabled; + +/* + * Because ipf's test tools in $SRC/cmd insert all of these files, we need to + * stub out what we can vs. drag in even more headers and who knows what else. + */ +#ifdef _KERNEL + +/* + * CFW event ring buffer. Remember, this is for ALL ZONES because only a + * global-zone event-reader will be consuming these. In other words, it's + * not something to instantiate per-netstack. + * + * We may want to get more sophisticated and performant (e.g. per-processor), + * but for now keep the ring buffer simple and stupid. + * Must be a power of 2, to be bitmaskable, and must be countable by a uint_t + * + * Resizeable, see ipf_cfw_ring_resize() below. + */ +#define IPF_CFW_DEFAULT_RING_BUFS 1024 +#define IPF_CFW_MIN_RING_BUFS 8 +#define IPF_CFW_MAX_RING_BUFS (1U << 31U) + +/* Assume C's init-to-zero is sufficient for these types... */ +static kmutex_t cfw_ringlock; +static kcondvar_t cfw_ringcv; + +static cfwev_t *cfw_ring; /* NULL by default. */ +static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */ +static uint32_t cfw_ringmask; /* 0 by default. */ + +/* If these are equal, we're either empty or full. */ +static uint_t cfw_ringstart, cfw_ringend; +static boolean_t cfw_ringfull; /* Tell the difference here! */ +/* Bean-counters. */ +static uint64_t cfw_evreports; +static uint64_t cfw_evdrops; + +/* + * Place an event in the CFW event ring buffer. + * + * For now, be simple and drop the oldest event if we overflow. We may wish to + * selectively drop older events based on type in the future. 
+ */ +static void +ipf_cfwev_report(cfwev_t *event) +{ + mutex_enter(&cfw_ringlock); + cfw_ring[cfw_ringend] = *event; + cfw_ringend++; + cfw_ringend &= cfw_ringmask; + if (cfw_ringfull) { + cfw_ringstart++; + cfw_ringstart &= cfw_ringmask; + ASSERT3U(cfw_ringstart, ==, cfw_ringend); + DTRACE_PROBE(ipf__cfw__evdrop); + cfw_evdrops++; + } else { + cfw_ringfull = (cfw_ringend == cfw_ringstart); + } + cfw_evreports++; + cv_broadcast(&cfw_ringcv); + mutex_exit(&cfw_ringlock); +} + +/* + * Provide access to multiple CFW events that can allow copying straight from + * the ring buffer up to userland. Requires a callback (which could call + * uiomove() directly, OR to a local still-in-kernel buffer) that must do the + * data copying-out. + * + * Callback function is of the form: + * + * uint_t cfw_many_cb(cfwev_t *evptr, int num_avail, void *cbarg); + * + * The function must return how many events got consumed, which MUST be <= the + * number available. The function must ALSO UNDERSTAND that cfw_ringlock is + * held and must not be released during this time. The function may be called + * more than once, if the available buffers wrap-around OR "block" is set and + * we don't have enough buffers. If any callback returns 0, exit the function + * with however many were consumed. + * + * This function, like the callback, returns the number of events *CONSUMED*. + * + * . . . + * + * Tunables for ipf_cfwev_consume_many(). + * + * If you wish to attempt to coalesce reads (to reduce the likelihood of one + * event at a time during high load) change the number of tries below to + * something not 0. Early experiments set this to 10. + * + * The wait between tries is in usecs in cfw_timeout_wait. The pessimal + * case for this is a timeout_wait-spaced trickle of one event at a time. + */ +uint_t cfw_timeout_tries = 0; +uint_t cfw_timeout_wait = 10000; /* 10ms wait. 
*/ + +typedef struct uio_error_s { + struct uio *ue_uio; + int ue_error; +} uio_error_t; + +static uint_t +ipf_cfwev_consume_many(uint_t num_requested, boolean_t block, + cfwmanycb_t cfw_many_cb, void *cbarg) +{ + uint_t consumed = 0, cb_consumed, contig_size; + uint_t timeout_tries = cfw_timeout_tries; + boolean_t eintr = B_FALSE; + + mutex_enter(&cfw_ringlock); + + while (num_requested > 0) { + clock_t delta; + + /* Silly reality checks */ + ASSERT3U(cfw_ringstart, <, cfw_ringsize); + ASSERT3U(cfw_ringend, <, cfw_ringsize); + + if (cfw_ringstart > cfw_ringend || cfw_ringfull) { + /* We have from ringstart to the buffer's end. */ + contig_size = cfw_ringsize - cfw_ringstart; + } else if (cfw_ringstart < cfw_ringend) { + /* We have no potential wrapping at this time. */ + contig_size = cfw_ringend - cfw_ringstart; + } else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) { + /* Maybe something to consume now, try again. */ + continue; + } else { + /* Nothing (more) to consume, return! */ + eintr = (block && consumed == 0); + break; + } + + /* Less asked-for than what we needed. */ + if (num_requested < contig_size) + contig_size = num_requested; + + cb_consumed = + cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size, cbarg); + ASSERT3U(cb_consumed, <=, contig_size); + + cfw_ringstart += cb_consumed; + ASSERT3U(cfw_ringstart, <=, cfw_ringmask + 1); + cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */ + consumed += cb_consumed; + cfw_ringfull = (cfw_ringfull && cb_consumed == 0); + if (cb_consumed < contig_size) { + /* + * Callback returned less than given. + * This is likely a uio error, but we have + * something. Get out of here. + */ + break; + } + ASSERT3U(cb_consumed, ==, contig_size); + num_requested -= contig_size; + + if (num_requested == 0) { + /* All done! */ + break; + } + + if (cfw_ringstart != cfw_ringend) { + /* + * We wrapped around the end of the buffer, and + * we have more available to fill our request. 
+ */ + ASSERT0(cfw_ringstart); + ASSERT(!cfw_ringfull); + continue; + } + + /* + * We obtained some of the events we requested, but not all. + * Since we have nothing to consume, wait *a little* longer. + */ + if (timeout_tries == 0) + break; /* Don't bother... */ + delta = drv_usectohz(cfw_timeout_wait); + timeout_tries--; + + switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta, + TR_CLOCK_TICK)) { + case 0: + /* + * Received signal! Return what we have OR if we have + * nothing, EINTR. + */ + DTRACE_PROBE1(ipf__cfw__timedsignal, int, consumed); + eintr = (consumed == 0); + num_requested = 0; + break; + case -1: + /* Time reached! Bail with what we got. */ + DTRACE_PROBE(ipf__cfw__timedexpired); + num_requested = 0; + break; + default: + /* Aha! We've got more! */ + DTRACE_PROBE(ipf__cfw__moredata); + break; + } + } + + mutex_exit(&cfw_ringlock); + if (eintr) + ((uio_error_t *)cbarg)->ue_error = EINTR; + return (consumed); +} + +/* + * SmartOS likes using the zone's debug id. Make sure we squirrel that away in + * the ipf netstack instance if it's not there. + */ +static inline zoneid_t +ifs_to_did(ipf_stack_t *ifs) +{ + if (ifs->ifs_zone_did == 0) { + zone_t *zone; + + /* + * We can't get the zone_did at initialization time because + * most zone data isn't readily available then, cement the did + * in place now. + */ + VERIFY3U(ifs->ifs_zone, !=, GLOBAL_ZONEID); + zone = zone_find_by_id(ifs->ifs_zone); + if (zone != NULL) { + ifs->ifs_zone_did = zone->zone_did; + zone_rele(zone); + } + /* Else we are either in shutdown or something weirder. */ + } + return (ifs->ifs_zone_did); +} + +/* + * ipf_block_cfwlog() + * + * Called by fr_check(). Record drop events for the global-zone data + * collector. Use rest-of-ipf-style names for the parameters. + */ +void +ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + /* + * We need a rule. + * Capture failure by using dtrace on this function's entry. 
+ * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }' + */ + if (fr == NULL) + return; + + event.cfwev_type = CFWEV_BLOCK; + event.cfwev_length = sizeof (event); + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). + */ + event.cfwev_direction = (fr->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + + event.cfwev_protocol = fin->fin_p; + /* + * NOTE: fin_*port is in host/native order, and ICMP info is here too. + */ + event.cfwev_sport = htons(fin->fin_sport); + event.cfwev_dport = htons(fin->fin_dport); + + switch (fin->fin_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = fin->fin_src6.in6; + event.cfwev_daddr = fin->fin_dst6.in6; + break; + default: + /* We should never reach here, but mark it if we do. */ + DTRACE_PROBE1(ipf__cfw__frinfo__badipversion, frinfo_t *, fin); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = fin->fin_rule; + memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * ipf_log_cfwlog() + * + * Twin of ipstate_log(), but records state events for the global-zone data + * collector. + */ +void +ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + switch (type) { + case ISL_NEW: + case ISL_CLONE: + event.cfwev_type = CFWEV_BEGIN; + break; + case ISL_EXPIRE: + case ISL_FLUSH: + case ISL_REMOVE: + case ISL_KILLED: + case ISL_ORPHAN: + /* + * We don't care about session disappearances in CFW logging + * for now. 
(Possible future: CFWEV_END) + */ + return; + default: + event.cfwev_type = CFWEV_BLOCK; + break; + } + + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). Follow + * suit here. + */ + event.cfwev_length = sizeof (event); + ASSERT(is->is_rule != NULL); + event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + event.cfwev_protocol = is->is_p; + switch (is->is_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + /* NOTE: is_*port is in network order. */ + event.cfwev_sport = is->is_sport; + event.cfwev_dport = is->is_dport; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + /* Scribble the ICMP type in sport... */ + event.cfwev_sport = is->is_icmp.ici_type; + break; + /* Other protocols leave the event's port fields empty. */ + } + + switch(is->is_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = is->is_src.in6; + event.cfwev_daddr = is->is_dst.in6; + break; + default: + /* Can't parse addresses if we don't know the version. Drop. */ + DTRACE_PROBE1(ipf__cfw__ipstate__badipversion, + struct ipstate *, is); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = is->is_rulen; + memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * Callback routine we use for ipf_cfwev_consume_many(). + * Returning 0 means error indication. 
+ */ +static uint_t +cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg) +{ + uio_error_t *ue = (uio_error_t *)cbarg; + + ASSERT(MUTEX_HELD(&cfw_ringlock)); + + if (ue->ue_error != 0) + return (0); + + ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr), + UIO_READ, ue->ue_uio); + if (ue->ue_error != 0) + return (0); + + return (num_avail); +} + +/* + * Resize the CFW event ring buffer. + * + * The caller must ensure the new size is a power of 2 between + * IPF_CFW_{MIN,MAX}_RING_BUFS (inclusive) or the special values + * IPF_CFW_RING_ALLOCATE (first-time creation) or IPF_CFW_RING_DESTROY + * (netstack-unload destruction). + * + * Everything in the current ring will be destroyed (and reported as a drop) + * upon resize. + */ +int +ipf_cfw_ring_resize(uint32_t newsize) +{ + ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE || + newsize == IPF_CFW_RING_DESTROY); + + if (newsize == IPF_CFW_RING_ALLOCATE) { + if (cfw_ring != NULL) + return (EBUSY); + newsize = IPF_CFW_DEFAULT_RING_BUFS; + /* Fall through to allocating a new ring buffer. */ + } else { + /* We may be called during error cleanup, so be liberal here. */ + if ((cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY) || + newsize == cfw_ringsize) { + return (0); + } + kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t)); + cfw_ring = NULL; + if (cfw_ringfull) { + cfw_evdrops += cfw_ringsize; + } else if (cfw_ringstart > cfw_ringend) { + cfw_evdrops += cfw_ringend + + (cfw_ringsize - cfw_ringstart); + } else { + cfw_evdrops += cfw_ringend - cfw_ringstart; + } + cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0; + cfw_ringfull = B_FALSE; + + if (newsize == IPF_CFW_RING_DESTROY) + return (0); + /* + * Keep the reports & drops around because if we're just + * resizing, we need to know what we lost. + */ + } + + ASSERT(ISP2(newsize)); + cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP); + /* KM_SLEEP means we always succeed. 
*/ + cfw_ringsize = newsize; + cfw_ringmask = cfw_ringsize - 1; + + return (0); +} + +/* + * ioctl handler for /dev/ipfev. Only supports SIOCIPFCFWCFG (get data + * collector statistics and configuration), and SIOCIPFCFWNEWSZ (resize the + * event ring buffer). + */ +/* ARGSUSED */ +int +ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp, + int *rp) +{ + ipfcfwcfg_t cfginfo; + int error; + + if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ) + return (EIO); + + if (crgetzoneid(cp) != GLOBAL_ZONEID) + return (EACCES); + + error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t); + mutex_enter(&cfw_ringlock); + cfginfo.ipfcfwc_evreports = cfw_evreports; + if (cmd == SIOCIPFCFWNEWSZ) { + uint32_t newsize = cfginfo.ipfcfwc_evringsize; + + /* Do ioctl parameter checking here, then call the resizer. */ + if (newsize < IPF_CFW_MIN_RING_BUFS || + newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) { + error = EINVAL; + } else { + error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize); + } + } else { + error = 0; + } + /* Both cfw_evdrops and cfw_ringsize are affected by resize. */ + cfginfo.ipfcfwc_evdrops = cfw_evdrops; + cfginfo.ipfcfwc_evringsize = cfw_ringsize; + mutex_exit(&cfw_ringlock); + + if (error != 0) + return (error); + + error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + return (0); +} + +/* + * Send events up via /dev/ipfev reads. Will return only complete events. 
+ */ +/* ARGSUSED */ +int +ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp) +{ + uint_t requested, consumed; + uio_error_t ue = {uio, 0}; + boolean_t block; + + if (uio->uio_resid == 0) + return (0); + if (uio->uio_resid < sizeof (cfwev_t)) + return (EINVAL); + + block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0); + requested = uio->uio_resid / sizeof (cfwev_t); + + /* + * As stated earlier, ipf_cfwev_consume_many() takes a callback. + * The callback may be called multiple times before we return. + * The callback will execute uiomove(). + */ + consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb, + &ue); + ASSERT3U(consumed, <=, requested); + if (!block && consumed == 0 && ue.ue_error == 0) { + /* No data available. */ + ue.ue_error = EWOULDBLOCK; + } else if (ue.ue_error != 0 && ue.ue_error != EINTR) { + /* + * We had a problem that wasn't simply a + * case of cv_wait_sig() receiving a signal. + */ + DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed); + mutex_enter(&cfw_ringlock); + cfw_evdrops += consumed; + mutex_exit(&cfw_ringlock); + } + return (ue.ue_error); +} + +#else /* _KERNEL */ + +/* Blank stubs to satisfy userland's test compilations. */ + +int +ipf_cfw_ring_resize(uint32_t a) +{ + return (0); +} + +void +ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c) +{ +} + +void +ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c) +{ +} + +#endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ipf/fil.c b/usr/src/uts/common/inet/ipf/fil.c index 78980be106..48fa6e7325 100644 --- a/usr/src/uts/common/inet/ipf/fil.c +++ b/usr/src/uts/common/inet/ipf/fil.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #if defined(KERNEL) || defined(_KERNEL) @@ -2588,6 +2588,9 @@ ipf_stack_t *ifs; } #endif + if (IFS_CFWLOG(ifs, fr) && FR_ISBLOCK(pass)) + ipf_block_cfwlog(fr, fin, ifs); + /* * The FI_STATE flag is cleared here so that calling fr_checkstate * will work when called from inside of fr_fastroute. Although diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index c9d5f03e13..0d34e0fce3 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #if !defined(lint) @@ -85,6 +85,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, @@ -116,7 +124,7 @@ u_long *ip_forwarding = NULL; #endif vmem_t *ipf_minor; /* minor number arena */ -void *ipf_state; /* DDI state */ +void *ipf_state; /* DDI state */ /* * GZ-controlled and per-zone stacks: @@ -141,28 +149,38 @@ void *ipf_state; /* DDI state */ */ /* IPv4 hook names */ -char *hook4_nicevents = "ipfilter_hook4_nicevents"; -char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; -char *hook4_in = "ipfilter_hook4_in"; -char *hook4_in_gz = "ipfilter_hook4_in_gz"; -char *hook4_out = "ipfilter_hook4_out"; -char *hook4_out_gz = "ipfilter_hook4_out_gz"; 
-char *hook4_loop_in = "ipfilter_hook4_loop_in"; -char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; -char *hook4_loop_out = "ipfilter_hook4_loop_out"; -char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; +char *hook4_nicevents = "ipfilter_hook4_nicevents"; +char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; +char *hook4_in = "ipfilter_hook4_in"; +char *hook4_in_gz = "ipfilter_hook4_in_gz"; +char *hook4_out = "ipfilter_hook4_out"; +char *hook4_out_gz = "ipfilter_hook4_out_gz"; +char *hook4_loop_in = "ipfilter_hook4_loop_in"; +char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; +char *hook4_loop_out = "ipfilter_hook4_loop_out"; +char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; /* IPv6 hook names */ -char *hook6_nicevents = "ipfilter_hook6_nicevents"; -char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; -char *hook6_in = "ipfilter_hook6_in"; -char *hook6_in_gz = "ipfilter_hook6_in_gz"; -char *hook6_out = "ipfilter_hook6_out"; -char *hook6_out_gz = "ipfilter_hook6_out_gz"; -char *hook6_loop_in = "ipfilter_hook6_loop_in"; -char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; -char *hook6_loop_out = "ipfilter_hook6_loop_out"; -char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +char *hook6_nicevents = "ipfilter_hook6_nicevents"; +char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; +char *hook6_in = "ipfilter_hook6_in"; +char *hook6_in_gz = "ipfilter_hook6_in_gz"; +char *hook6_out = "ipfilter_hook6_out"; +char *hook6_out_gz = "ipfilter_hook6_out_gz"; +char *hook6_loop_in = "ipfilter_hook6_loop_in"; +char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; +char *hook6_loop_out = "ipfilter_hook6_loop_out"; +char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; + +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = 
"ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; /* viona hook names */ char *hook_viona_in = "ipfilter_hookviona_in"; @@ -170,6 +188,39 @@ char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; char *hook_viona_out = "ipfilter_hookviona_out"; char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; +/* + * For VIONA. The net_{instance,protocol}_notify_register() functions only + * deal with per-callback-function granularity. We need two wrapper functions + * for GZ-controlled and per-zone instances. + */ +static int +ipf_hook_instance_notify_gz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_instance_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_protocol_notify_gz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + +static int +ipf_hook_protocol_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. 
*/ @@ -267,10 +318,36 @@ ipf_stack_t *ifs; } /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + + /* * Remove notification of viona hooks */ net_instance_notify_unregister(ifs->ifs_netid, - ipf_hook_instance_notify); + ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz : + ipf_hook_instance_notify_ngz); #undef UNDO_HOOK @@ -278,6 +355,10 @@ ipf_stack_t *ifs; * Normally, viona will unregister itself before ipldetach() is called, * so these will be no-ops, but out of caution, we try to make sure * we've removed any of our references. + * + * For now, the _gz and _ngz versions are both wrappers to what's + * below. Just call it directly, but if that changes fix here as + * well. */ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, NH_PHYSICAL_IN); @@ -295,6 +376,10 @@ ipf_stack_t *ifs; * traced, we pass the same value the nethook framework would * pass, even though the callback does not currently use the * value. + * + * For now, the _gz and _ngz versions are both wrappers to + * what's below. Just call it directly, but if that changes + * fix here as well. 
*/ (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, NULL, Hn_VIONA); @@ -495,6 +580,49 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + + /* * VIONA INET hooks. While the nethook framework allows us to register * hooks for events that haven't been registered yet, we instead * register and unregister our hooks in response to notifications @@ -504,9 +632,15 @@ ipf_stack_t *ifs; * is unloaded, the viona module cannot later re-register them if it * gets reloaded. 
As the ip, vnd, and ipf modules are rarely unloaded * even on DEBUG kernels, they do not experience this issue. + * + * Today, the per-zone ones don't matter for a BHYVE-branded zone, BUT + * the ipf_hook_protocol_notify() function is GZ vs. per-zone aware. + * Employ two different versions of ipf_hook_instance_notify(), one for + * the GZ-controlled, and one for the per-zone one. */ - if (net_instance_notify_register(id, ipf_hook_instance_notify, - ifs) != 0) + if (net_instance_notify_register(id, ifs->ifs_gz_controlled ? + ipf_hook_instance_notify_gz : ipf_hook_instance_notify_ngz, ifs) != + 0) goto hookup_failed; /* @@ -688,6 +822,7 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, { ipf_stack_t *ifs = arg; int ret = 0; + const boolean_t gz = ifs->ifs_gz_controlled; /* We currently only care about viona hooks */ if (strcmp(instance, Hn_VIONA) != 0) @@ -705,14 +840,16 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, return (EPROTONOSUPPORT); ret = net_protocol_notify_register(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify, ifs); + gz ? ipf_hook_protocol_notify_gz : + ipf_hook_protocol_notify_ngz, ifs); VERIFY(ret == 0 || ret == ESHUTDOWN); break; case HN_UNREGISTER: if (ifs->ifs_ipf_viona == NULL) break; VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify)); + gz ? 
ipf_hook_protocol_notify_gz : + ipf_hook_protocol_notify_ngz)); VERIFY0(net_protocol_release(ifs->ifs_ipf_viona)); ifs->ifs_ipf_viona = NULL; break; @@ -821,6 +958,9 @@ int *rp; return ENXIO; unit = isp->ipfs_minor; + if (unit == IPL_LOGEV) + return (ipf_cfwlog_ioctl(dev, cmd, data, mode, cp, rp)); + zid = crgetzoneid(cp); if (cmd == SIOCIPFZONESET) { if (zid == GLOBAL_ZONEID) @@ -1129,14 +1269,14 @@ ipf_stack_t *ifs; { net_handle_t nif; - if (v == 4) - nif = ifs->ifs_ipf_ipv4; - else if (v == 6) - nif = ifs->ifs_ipf_ipv6; - else - return 0; - - return (net_phylookup(nif, name)); + if (v == 4) + nif = ifs->ifs_ipf_ipv4; + else if (v == 6) + nif = ifs->ifs_ipf_ipv6; + else + return 0; + + return (net_phylookup(nif, name)); } /* @@ -1161,11 +1301,35 @@ cred_t *cred; if (IPL_LOGMAX < min) return ENXIO; + /* Special-case ipfev: global-zone-open only. */ + if (min == IPL_LOGEV) { + if (crgetzoneid(cred) != GLOBAL_ZONEID) + return (ENXIO); + /* + * Else enable the CFW logging of events. + * NOTE: For now, we only allow one open at a time. + * Use atomic_cas to confirm/deny. And also for now, + * assume sizeof (boolean_t) == sizeof (uint_t). + * + * Per the *_{refrele,REFRELE}() in other parts of inet, + * ensure all loads/stores complete before calling cas. + * membar_exit() does this. + */ + membar_exit(); + if (atomic_cas_uint(&ipf_cfwlog_enabled, 0, 1) != 0) + return (EBUSY); + } + minor = (minor_t)(uintptr_t)vmem_alloc(ipf_minor, 1, VM_BESTFIT | VM_SLEEP); if (ddi_soft_state_zalloc(ipf_state, minor) != 0) { vmem_free(ipf_minor, (void *)(uintptr_t)minor, 1); + if (min == IPL_LOGEV) { + /* See above... 
*/ + membar_exit(); + VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1); + } return ENXIO; } @@ -1187,6 +1351,7 @@ int flags, otype; cred_t *cred; { minor_t min = getminor(dev); + ipf_devstate_t *isp; #ifdef IPFDEBUG cmn_err(CE_CONT, "iplclose(%x,%x,%x,%x)\n", dev, flags, otype, cred); @@ -1195,6 +1360,15 @@ cred_t *cred; if (IPL_LOGMAX < min) return ENXIO; + isp = ddi_get_soft_state(ipf_state, min); + if (isp != NULL && isp->ipfs_minor == IPL_LOGEV) { + /* + * Disable CFW logging. See iplopen() for details. + */ + membar_exit(); + VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1); + } + ddi_soft_state_free(ipf_state, min); vmem_free(ipf_minor, (void *)(uintptr_t)min, 1); @@ -1225,6 +1399,8 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; + if (unit == IPL_LOGEV) + return (ipf_cfwlog_read(dev, uio, cp)); /* * ipf_find_stack returns with a read lock on ifs_ipf_global @@ -1277,6 +1453,9 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; + if (unit == IPL_LOGEV) + return (EIO); /* ipfev doesn't support write yet. */ + /* * ipf_find_stack returns with a read lock on ifs_ipf_global */ @@ -2068,8 +2247,11 @@ frdest_t *fdp; return (-1); } - /* Check the src here, fin_ifp is the src interface. */ - if (!(fin->fin_flx & FI_GENERATED) && + /* + * If we're forwarding (vs. injecting), check the src here, fin_ifp is + * the src interface. + */ + if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) && !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) { return (-1); } @@ -2138,8 +2320,8 @@ frdest_t *fdp; inj->ni_physical = net_routeto(net_data_p, sinp, NULL); } - /* we're checking the destination here */ - if (!(fin->fin_flx & FI_GENERATED) && + /* If we're forwarding (vs. injecting), check the destination here. 
*/ + if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) && !fr_forwarding_enabled(inj->ni_physical, net_data_p)) { goto bad_fastroute; } @@ -2355,6 +2537,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg, } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represent a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hookviona_{in,out} */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ @@ -3120,16 +3338,16 @@ fr_info_t *fin; /* both IP versions. The details are going to be explained here. */ /* */ /* The packet looks as follows: */ -/* xxx | IP hdr | IP payload ... 
| */ -/* ^ ^ ^ ^ */ -/* | | | | */ +/* xxx | IP hdr | IP payload ... | */ +/* ^ ^ ^ ^ */ +/* | | | | */ /* | | | fin_m->b_wptr = fin->fin_dp + fin->fin_dlen */ /* | | | */ /* | | `- fin_m->fin_dp (in case of IPv4 points to L4 header) */ /* | | */ /* | `- fin_m->b_rptr + fin_ipoff (fin_ipoff is most likely 0 in case */ /* | of loopback) */ -/* | */ +/* | */ /* `- fin_m->b_rptr - points to L2 header in case of physical NIC */ /* */ /* All relevant IP headers are pulled up into the first mblk. It happened */ diff --git a/usr/src/uts/common/inet/ipf/ip_log.c b/usr/src/uts/common/inet/ipf/ip_log.c index 584ee42d9a..b70e320def 100644 --- a/usr/src/uts/common/inet/ipf/ip_log.c +++ b/usr/src/uts/common/inet/ipf/ip_log.c @@ -8,7 +8,7 @@ * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #include <sys/param.h> @@ -373,9 +373,11 @@ u_int flags; if (fin->fin_fr != NULL) { ipfl.fl_loglevel = fin->fin_fr->fr_loglevel; ipfl.fl_logtag = fin->fin_fr->fr_logtag; + bcopy(fin->fin_fr->fr_uuid, ipfl.fl_uuid, sizeof (uuid_t)); } else { ipfl.fl_loglevel = 0xffff; ipfl.fl_logtag = FR_NOLOGTAG; + bzero(ipfl.fl_uuid, sizeof (uuid_t)); } if (fin->fin_nattag != NULL) bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag, diff --git a/usr/src/uts/common/inet/ipf/ip_state.c b/usr/src/uts/common/inet/ipf/ip_state.c index 184f8775b6..a45bcbfdaf 100644 --- a/usr/src/uts/common/inet/ipf/ip_state.c +++ b/usr/src/uts/common/inet/ipf/ip_state.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #if defined(KERNEL) || defined(_KERNEL) @@ -108,6 +108,7 @@ struct file; # include <sys/systm.h> # endif #endif +#include <sys/uuid.h> /* END OF INCLUDES */ @@ -1445,6 +1446,7 @@ u_int flags; is->is_sti.tqe_flags |= TQE_RULEBASED; } is->is_tag = fr->fr_logtag; + memcpy(is->is_uuid, fr->fr_uuid, sizeof (uuid_t)); is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1]; is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2]; @@ -1524,6 +1526,9 @@ u_int flags; if (ifs->ifs_ipstate_logging) ipstate_log(is, ISL_NEW, ifs); + if (IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_NEW, ifs); + RWLOCK_EXIT(&ifs->ifs_ipf_state); fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr); fin->fin_flx |= FI_STATE; @@ -2314,6 +2319,8 @@ u_32_t cmask; is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT); if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging) ipstate_log(is, ISL_CLONE, ifs); + if ((flags & SI_CLONED) && IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_CLONE, ifs); } ret = -1; @@ -3397,6 +3404,15 @@ ipf_stack_t *ifs; if (ifs->ifs_ipstate_logging != 0 && why != 0) ipstate_log(is, why, ifs); + /* + * For now, ipf_log_cfwlog() copes with all "why" values. Strictly + * speaking, though, they all map to one event (CFWEV_END), which for + * now is not supported, hence no code calling ipf_log_cfwlog() like + * below: + * + * if (why != 0 && IFS_CFWLOG(ifs, is->is_rule)) + * ipf_log_cfwlog(is, why, ifs); + */ if (is->is_rule != NULL) { is->is_rule->fr_statecnt--; @@ -3931,7 +3947,6 @@ int flags; return rval; } - /* ------------------------------------------------------------------------ */ /* Function: ipstate_log */ /* Returns: Nil */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. 
fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/Makefile b/usr/src/uts/common/inet/ipf/netinet/Makefile index cca3b48ac4..88f91e633f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/Makefile +++ b/usr/src/uts/common/inet/ipf/netinet/Makefile @@ -1,16 +1,15 @@ # -#ident "%Z%%M% %I% %E% SMI" -# # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2019 Joyent, Inc. # # uts/common/inet/ipf/netinet/Makefile # # include global definitions include ../../../../../Makefile.master -HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h \ - ip_frag.h ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h +HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h ip_frag.h \ + ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h ipf_cfw.h ROOTDIRS= $(ROOT)/usr/include/netinet diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h index 4c3c5683b5..bb5ce7bd6c 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h @@ -8,7 +8,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019, Joyent, Inc. */ #ifndef __IP_FIL_H__ @@ -16,6 +16,7 @@ #include "netinet/ip_compat.h" #include <sys/zone.h> +#include <sys/uuid.h> #ifdef SOLARIS #undef SOLARIS @@ -115,6 +116,8 @@ #define SIOCDELFR SIOCRMAFR #define SIOCINSFR SIOCINAFR # define SIOCIPFZONESET _IOWR('r', 97, struct ipfzoneobj) +# define SIOCIPFCFWCFG _IOR('r', 98, struct ipfcfwcfg) +# define SIOCIPFCFWNEWSZ _IOWR('r', 99, struct ipfcfwcfg) /* * What type of table is getting flushed? 
@@ -600,6 +603,7 @@ typedef struct frentry { u_32_t fr_flags; /* per-rule flags && options (see below) */ u_32_t fr_logtag; /* user defined log tag # */ u_32_t fr_collect; /* collection number */ + uuid_t fr_uuid; /* user defined uuid */ u_int fr_arg; /* misc. numeric arg for rule */ u_int fr_loglevel; /* syslog log facility + priority */ u_int fr_age[2]; /* non-TCP timeouts */ @@ -728,6 +732,7 @@ typedef struct frentry { #define FR_NEWISN 0x400000 /* new ISN for outgoing TCP */ #define FR_NOICMPERR 0x800000 /* do not match ICMP errors in state */ #define FR_STATESYNC 0x1000000 /* synchronize state to slave */ +#define FR_CFWLOG 0x2000000 /* Global CFW logging enabled */ #define FR_NOMATCH 0x8000000 /* no match occured */ /* 0x10000000 FF_LOGPASS */ /* 0x20000000 FF_LOGBLOCK */ @@ -883,6 +888,7 @@ typedef struct ipflog { u_32_t fl_lflags; u_32_t fl_logtag; ipftag_t fl_nattag; + uuid_t fl_uuid; u_short fl_plen; /* extra data after hlen */ u_short fl_loglevel; /* syslog log level */ char fl_group[FR_GROUPLEN]; @@ -931,6 +937,7 @@ typedef struct ipflog { #define IPSYNC_NAME "/dev/ipsync" #define IPSCAN_NAME "/dev/ipscan" #define IPLOOKUP_NAME "/dev/iplookup" +#define IPFEV_NAME "/dev/ipfev" #define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */ #define IPL_LOGNAT 1 @@ -939,8 +946,9 @@ typedef struct ipflog { #define IPL_LOGSYNC 4 #define IPL_LOGSCAN 5 #define IPL_LOGLOOKUP 6 -#define IPL_LOGCOUNT 7 -#define IPL_LOGMAX 7 +#define IPL_LOGEV 7 +#define IPL_LOGCOUNT 8 +#define IPL_LOGMAX 8 #define IPL_LOGSIZE (IPL_LOGMAX + 1) #define IPL_LOGALL -1 #define IPL_LOGNONE -2 @@ -1181,6 +1189,21 @@ typedef struct ipfzoneobj { char ipfz_zonename[ZONENAME_MAX]; /* zone to act on */ } ipfzoneobj_t; +/* ioctl to grab CFW logging parameters */ +typedef struct ipfcfwcfg { + /* CFG => Max event size, NEWSZ => ignored in, like CFG out. */ + uint32_t ipfcfwc_maxevsize; + /* + * CFG => Current ring size, + * NEWSZ => New ring size, must be 2^N for 3 <= N <= 31. 
+ */ + uint32_t ipfcfwc_evringsize; + /* CFG => Number of event reports, NEWSZ => ignored in, like CFG out. */ + uint64_t ipfcfwc_evreports; + /* CFG => Number of event drops, NEWSZ => ignored in, like CFG out. */ + uint64_t ipfcfwc_evdrops; +} ipfcfwcfg_t; + #if defined(_KERNEL) /* Set ipfs_zoneid to this if no zone has been set: */ #define IPFS_ZONE_UNSET -2 @@ -1560,6 +1583,23 @@ extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int, ipf_stack_t *)); extern void fr_logunload __P((ipf_stack_t *)); +/* SmartOS single-FD global-zone state accumulator (see cfw.c) */ +extern boolean_t ipf_cfwlog_enabled; +struct ipstate; /* Ugggh. */ +extern void ipf_log_cfwlog __P((struct ipstate *, uint_t, ipf_stack_t *)); +extern void ipf_block_cfwlog __P((frentry_t *, fr_info_t *, ipf_stack_t *)); +#define IFS_CFWLOG(ifs, fr) ((ifs)->ifs_gz_controlled && ipf_cfwlog_enabled &&\ + fr != NULL && ((fr)->fr_flags & FR_CFWLOG)) +struct cfwev_s; /* See ipf_cfw.h */ +extern boolean_t ipf_cfwev_consume __P((struct cfwev_s *, boolean_t)); +/* See cfw.c's ipf_cfwev_consume_many() for details. */ +typedef uint_t (*cfwmanycb_t) __P((struct cfwev_s *, uint_t, void *)); +extern int ipf_cfwlog_read __P((dev_t, struct uio *, struct cred *)); +extern int ipf_cfwlog_ioctl __P((dev_t, int, intptr_t, int, cred_t *, int *)); +#define IPF_CFW_RING_ALLOCATE 0 +#define IPF_CFW_RING_DESTROY 1 +extern int ipf_cfw_ring_resize(uint32_t); + extern frentry_t *fr_acctpkt __P((fr_info_t *, u_32_t *)); extern int fr_copytolog __P((int, char *, int)); extern u_short fr_cksum __P((mb_t *, ip_t *, int, void *)); diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_state.h b/usr/src/uts/common/inet/ipf/netinet/ip_state.h index 4c605c1b89..ef315d5ef1 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_state.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_state.h @@ -8,11 +8,14 @@ * * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright 2019 Joyent, Inc. */ #ifndef __IP_STATE_H__ #define __IP_STATE_H__ +#include <sys/uuid.h> + #if defined(__STDC__) || defined(__GNUC__) || defined(_AIX51) # define SIOCDELST _IOW('r', 61, struct ipfobj) #else @@ -66,6 +69,7 @@ typedef struct ipstate { /* in both directions */ u_32_t is_optmsk[2]; /* " " mask */ /* in both directions */ + uuid_t is_uuid; u_short is_sec; /* security options set */ u_short is_secmsk; /* " " mask */ u_short is_auth; /* authentication options set */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h new file mode 100644 index 0000000000..1972d2b3f7 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h @@ -0,0 +1,69 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef __IPF_CFW_H__ +#define __IPF_CFW_H__ + +#include <sys/types.h> +#include <inet/ip6.h> +#include <sys/uuid.h> + +/* Because ipf compiles this kernel file in userland testing... */ +#ifndef ASSERT3U +#define ASSERT3U(a, b, c) ASSERT(a ## b ## c); +#endif /* ASSERT3U */ + +/* + * CFW Event, which is emitted to a global-zone listener. The global-zone + * listener solves the one-fd-per-zone problem of using each zone's ipmon. + * + * These must be 64-bit aligned because they form an array in-kernel. There + * might be reserved fields to ensure that alignment. 
+ */ +#define CFWEV_BLOCK 1 +#define CFWEV_BEGIN 2 +#define CFWEV_END 3 +#define CFWDIR_IN 1 +#define CFWDIR_OUT 2 + +typedef struct cfwev_s { + uint16_t cfwev_type; /* BEGIN, END, BLOCK */ + uint16_t cfwev_length; /* in bytes, so capped to 65535 bytes */ + zoneid_t cfwev_zonedid; /* Pullable from ipf_stack_t. */ + + uint32_t cfwev_ruleid; /* Pullable from fr_info_t. */ + uint16_t cfwev_sport; /* Source port (network order) */ + uint16_t cfwev_dport; /* Dest. port (network order) */ + + uint8_t cfwev_protocol; /* IPPROTO_* */ + /* "direction" informs if src/dst are local/remote or remote/local. */ + uint8_t cfwev_direction; + uint8_t cfwev_reserved[6]; /* Ensures 64-bit alignment. */ + + in6_addr_t cfwev_saddr; /* IPv4 addresses are V4MAPPED. */ + in6_addr_t cfwev_daddr; + + /* + * Because of 'struct timeval' being different between 32-bit and + * 64-bit ABIs, this interface is only usable by 64-bit binaries. + */ + struct timeval cfwev_tstamp; + + uuid_t cfwev_ruleuuid; /* Pullable from fr_info_t. */ +} cfwev_t; + + + +#endif /* __IPF_CFW_H__ */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index 0ceea1e921..0b2a8d826f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2018 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef __IPF_STACK_H__ @@ -46,6 +46,7 @@ struct ipf_stack { struct ipf_stack *ifs_gz_cont_ifs; netid_t ifs_netid; zoneid_t ifs_zone; + zoneid_t ifs_zone_did; boolean_t ifs_gz_controlled; /* ipf module */ @@ -126,6 +127,11 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; + hook_t *ifs_ipfhookviona_in; hook_t *ifs_ipfhookviona_out; @@ -140,12 +146,18 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; boolean_t ifs_hookviona_physical_in; boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; net_handle_t ifs_ipf_viona; /* ip_auth.c */ @@ -305,6 +317,7 @@ struct ipf_stack { char *ifs_addmask_key; char *ifs_rn_zeros; char *ifs_rn_ones; + #ifdef KERNEL /* kstats for inbound and outbound */ kstat_t *ifs_kstatp[2]; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5ccbfa3188 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -116,7 +116,7 @@ static void ipf_stack_shutdown __P((const netid_t, void *)); static int ipf_property_g_update __P((dev_info_t *)); static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME, IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME, - IPLOOKUP_NAME, NULL }; + IPLOOKUP_NAME, IPFEV_NAME, NULL }; extern void *ipf_state; /* DDI state */ extern vmem_t *ipf_minor; /* minor number arena */ @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { @@ -742,6 +741,9 @@ ddi_attach_cmd_t cmd; ipf_dev_info = dip; + if (ipf_cfw_ring_resize(IPF_CFW_RING_ALLOCATE) != 0) + goto attach_failed; + ipfncb = net_instance_alloc(NETINFO_VERSION); if (ipfncb == NULL) goto attach_failed; @@ -769,6 +771,7 @@ ddi_attach_cmd_t cmd; } attach_failed: + (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY); ddi_remove_minor_node(dip, NULL); ddi_prop_remove_all(dip); ddi_soft_state_fini(&ipf_state); @@ -796,6 +799,7 @@ ddi_detach_cmd_t cmd; * framework guarantees we are not active with this devinfo * node in any other entry points at this time. */ + (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY); ddi_prop_remove_all(dip); i = ddi_get_instance(dip); ddi_remove_minor_node(dip, NULL); |