summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/ip
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/ip')
-rw-r--r--usr/src/uts/common/inet/ip/conn_opt.c22
-rw-r--r--usr/src/uts/common/inet/ip/icmp.c117
-rw-r--r--usr/src/uts/common/inet/ip/icmp_opt_data.c6
-rw-r--r--usr/src/uts/common/inet/ip/ip.c94
-rw-r--r--usr/src/uts/common/inet/ip/ip_if.c177
-rw-r--r--usr/src/uts/common/inet/ip/ip_squeue.c2
-rw-r--r--usr/src/uts/common/inet/ip/ipclassifier.c165
7 files changed, 392 insertions, 191 deletions
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
index 7aac9b655a..eeec56b162 100644
--- a/usr/src/uts/common/inet/ip/conn_opt.c
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -644,6 +645,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
case SO_REUSEADDR:
*i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
break; /* goto sizeof (int) option return */
+ case SO_REUSEPORT:
+ *i1 = connp->conn_reuseport;
+ break; /* goto sizeof (int) option return */
case SO_TYPE:
*i1 = connp->conn_so_type;
break; /* goto sizeof (int) option return */
@@ -1214,8 +1218,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
int error;
- if (connp->conn_family != AF_INET)
+ if (connp->conn_family == AF_INET6 &&
+ connp->conn_ipversion == IPV4_VERSION) {
+ /*
+ * Allow certain IPv4 options to be set on an AF_INET6 socket
+ * if the connection is still IPv4.
+ */
+ switch (name) {
+ case IP_TOS:
+ case T_IP_TOS:
+ case IP_TTL:
+ case IP_DONTFRAG:
+ break;
+ default:
+ return (EINVAL);
+ }
+ } else if (connp->conn_family != AF_INET) {
return (EINVAL);
+ }
ifindex = UINT_MAX;
switch (name) {
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 57ee0c5585..46c791298a 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -81,6 +81,7 @@
#include <sys/tsol/tnet.h>
#include <inet/rawip_impl.h>
+#include <net/bpf.h>
#include <sys/disp.h>
@@ -1018,6 +1019,12 @@ icmp_close_free(conn_t *connp)
icmp->icmp_filter = NULL;
}
+ if (icmp->icmp_bpf_len != 0) {
+ kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
+ icmp->icmp_bpf_len = 0;
+ icmp->icmp_bpf_prog = NULL;
+ }
+
/*
* Clear any fields which the kmem_cache constructor clears.
* Only icmp_connp needs to be preserved.
@@ -1971,6 +1978,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
return (err);
}
+static int
+icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp)
+{
+ struct bpf_program prog;
+ ip_bpf_insn_t *insns = NULL;
+ unsigned int size;
+
+#ifdef _LP64
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ struct bpf_program32 *prog32;
+
+ if (inlen != sizeof (struct bpf_program32)) {
+ return (EINVAL);
+ }
+ prog32 = (struct bpf_program32 *)invalp;
+ prog.bf_len = prog32->bf_len;
+ prog.bf_insns = (void *)(uint64_t)prog32->bf_insns;
+ } else
+#endif
+ if (inlen == sizeof (struct bpf_program)) {
+ bcopy(invalp, &prog, sizeof (prog));
+ } else {
+ return (EINVAL);
+ }
+
+ if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) {
+ return (EINVAL);
+ }
+ size = prog.bf_len * sizeof (struct bpf_insn);
+ insns = kmem_alloc(size, KM_SLEEP);
+ if (copyin(prog.bf_insns, insns, size) != 0) {
+ kmem_free(insns, size);
+ return (EFAULT);
+ }
+ if (!ip_bpf_validate(insns, prog.bf_len)) {
+ kmem_free(insns, size);
+ return (EINVAL);
+ }
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
+ if (icmp->icmp_bpf_len != 0) {
+ ASSERT(icmp->icmp_bpf_prog != NULL);
+
+ kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
+ }
+ icmp->icmp_bpf_len = size;
+ icmp->icmp_bpf_prog = insns;
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (0);
+}
+
+static int
+icmp_detach_filter(icmp_t *icmp)
+{
+ int error;
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
+ if (icmp->icmp_bpf_len == 0) {
+ ASSERT(icmp->icmp_bpf_prog == NULL);
+ error = ENOENT;
+ } else {
+ kmem_free(icmp->icmp_bpf_prog,
+ icmp->icmp_bpf_len);
+ icmp->icmp_bpf_len = 0;
+ icmp->icmp_bpf_prog = NULL;
+ error = 0;
+ }
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (error);
+}
+
+static boolean_t
+icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira)
+{
+ boolean_t res;
+ uchar_t *buf = mp->b_rptr;
+ uint_t wirelen, len = MBLKL(mp);
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_READER);
+ if (icmp->icmp_bpf_len == 0) {
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (B_FALSE);
+ }
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)buf;
+
+ wirelen = ntohs(ipha->ipha_length);
+ } else {
+ ip6_t *ip6h = (ip6_t *)buf;
+
+ wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ }
+ res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len);
+ rw_exit(&icmp->icmp_bpf_lock);
+
+ return (res);
+}
+
/*
* This routine sets socket options.
*/
@@ -2060,6 +2165,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
return (ENOBUFS);
}
break;
+ case SO_ATTACH_FILTER:
+ return (icmp_attach_filter(icmp, inlen, invalp));
+ case SO_DETACH_FILTER:
+ return (icmp_detach_filter(icmp));
}
break;
@@ -2605,6 +2714,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
/* Initialize regardless of IP version */
ipps.ipp_fields = 0;
+ /* Apply socket filter, if needed */
+ if (icmp->icmp_bpf_len != 0) {
+ if (icmp_eval_filter(icmp, mp, ira)) {
+ freemsg(mp);
+ return;
+ }
+ }
+
if (ira->ira_flags & IRAF_IS_IPV4) {
ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
ASSERT(MBLKL(mp) >= sizeof (ipha_t));
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index ff0310de0c..d65d3164d3 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -41,6 +42,7 @@
#include <netinet/ip_mroute.h>
#include <inet/optcom.h>
#include <inet/rawip_impl.h>
+#include <net/bpf.h>
/*
* Table of all known options handled on a ICMP protocol stack.
@@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = {
0 },
{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0,
+ sizeof (struct bpf_program), 0 },
+{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 },
+
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 6063fa01d2..704f152bb9 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -8235,7 +8235,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
conn_t *connp = NULL;
t_uscalar_t paddrreq;
mblk_t *mp_hw;
- boolean_t success;
boolean_t ioctl_aborted = B_FALSE;
boolean_t log = B_TRUE;
@@ -8335,7 +8334,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
mutex_exit(&ill->ill_lock);
/*
- * Something went wrong with the bind. We presumably
+ * Something went wrong with the bind. If this was the
+ * result of a DL_NOTE_REPLUMB, then we presumably
* have an IOCTL hanging out waiting for completion.
* Find it, take down the interface that was coming
* up, and complete the IOCTL with the error noted.
@@ -8352,6 +8352,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
(void) ipif_down(ipif, NULL, NULL);
/* error is set below the switch */
+ } else {
+ /*
+ * There's no pending IOCTL, so the bind was
+ * most likely started by ill_dl_up(). We save
+ * the error and let it take care of responding
+ * to the IOCTL.
+ */
+ ill->ill_dl_bind_err = dlea->dl_unix_errno ?
+ dlea->dl_unix_errno : ENXIO;
}
break;
case DL_ENABMULTI_REQ:
@@ -8475,55 +8484,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
- /*
- * Now bring up the resolver; when that is complete, we'll
- * create IREs. Note that we intentionally mirror what
- * ipif_up() would have done, because we got here by way of
- * ill_dl_up(), which stopped ipif_up()'s processing.
- */
- if (ill->ill_isv6) {
- /*
- * v6 interfaces.
- * Unlike ARP which has to do another bind
- * and attach, once we get here we are
- * done with NDP
- */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
- err = ipif_up_done_v6(ipif);
- } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
- /*
- * ARP and other v4 external resolvers.
- * Leave the pending mblk intact so that
- * the ioctl completes in ip_rput().
- */
- if (connp != NULL)
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
- mutex_exit(&ill->ill_lock);
- if (connp != NULL)
- mutex_exit(&connp->conn_lock);
- if (success) {
- err = ipif_resolver_up(ipif, Res_act_initial);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- mp1 = ipsq_pending_mp_get(ipsq, &connp);
- } else {
- /* The conn has started closing */
- err = EINTR;
- }
- } else {
- /*
- * This one is complete. Reply to pending ioctl.
- */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- err = ipif_up_done(ipif);
- }
-
- if ((err == 0) && (ill->ill_up_ipifs)) {
+ if (ill->ill_up_ipifs) {
err = ill_up_ipifs(ill, q, mp1);
if (err == EINPROGRESS) {
freemsg(mp);
@@ -8531,25 +8492,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
}
- /*
- * If we have a moved ipif to bring up, and everything has
- * succeeded to this point, bring it up on the IPMP ill.
- * Otherwise, leave it down -- the admin can try to bring it
- * up by hand if need be.
- */
- if (ill->ill_move_ipif != NULL) {
- if (err != 0) {
- ill->ill_move_ipif = NULL;
- } else {
- ipif = ill->ill_move_ipif;
- ill->ill_move_ipif = NULL;
- err = ipif_up(ipif, q, mp1);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- }
- }
break;
case DL_NOTIFY_IND: {
@@ -12621,6 +12563,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
ip_ioctl_cmd_t *ipip = arg;
ip_extract_func_t *extract_funcp;
+ ill_t *ill;
cmd_info_t ci;
int err;
boolean_t entered_ipsq = B_FALSE;
@@ -12742,6 +12685,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
/*
+ * We need to cache the ill_t that we're going to use as the argument
+ * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be
+ * blown away by calling ipi_func.
+ */
+ ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill;
+
+ /*
* A return value of EINPROGRESS means the ioctl is
* either queued and waiting for some reason or has
* already completed.
@@ -12749,9 +12699,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
- int, ipip->ipi_cmd,
- ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
- ipif_t *, ci.ci_ipif);
+ int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
if (entered_ipsq)
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index cc67299a1b..2307837eb8 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -22,7 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1990 Mentat Inc.
* Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2016, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
@@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static void ill_delete_interface_type(ill_if_t *);
-static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
+static int ill_dl_up(ill_t *ill, ipif_t *ipif);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_down_ipifs(ill_t *, boolean_t);
@@ -1380,6 +1380,36 @@ ill_capability_probe(ill_t *ill)
ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
+static boolean_t
+ill_capability_wait(ill_t *ill)
+{
+ /*
+ * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can
+ * only be set by someone who is the writer. Since we
+ * drop-and-reacquire the squeue in this loop, we need to check for
+ * ILL_CONDEMNED, which if set means nothing can signal our capability
+ * condition variable.
+ */
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ while (ill->ill_capab_pending_cnt != 0 &&
+ (ill->ill_state_flags & ILL_CONDEMNED) == 0) {
+ /* This may enable blocked callers of ill_capability_done(). */
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+ /* Pause a bit (1msec) before we re-enter the squeue. */
+ delay(drv_usectohz(1000000));
+
+ /*
+ * If ipsq_enter() fails, someone set ILL_CONDEMNED
+ * while we dropped the squeue. Indicate such to the caller.
+ */
+ if (!ipsq_enter(ill, B_FALSE, CUR_OP))
+ return (B_FALSE);
+ }
+
+ return ((ill->ill_state_flags & ILL_CONDEMNED) == 0);
+}
+
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
@@ -1390,6 +1420,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg)
ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
+ ASSERT(ill->ill_capab_reset_mp != NULL);
+
ill_capability_send(ill, ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
/*
@@ -2109,6 +2141,49 @@ ill_capability_lso_enable(ill_t *ill)
}
}
+/*
+ * Check whether or not mac will prevent us from sending with a given IP
+ * address. This requires having the IPCHECK capability, which we should
+ * always be able to successfully negotiate, but if it's somehow missing
+ * then we just permit the caller to use the address, since mac does the
+ * actual enforcement and ip is just performing a courtesy check to help
+ * prevent users from unwittingly setting and attempting to use blocked
+ * addresses.
+ */
+static boolean_t
+ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr)
+{
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0)
+ return (B_TRUE);
+
+ ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck;
+ ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df;
+ return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr));
+}
+
+static void
+ill_capability_ipcheck_enable(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ ill_dld_ipcheck_t *idi = &idc->idc_ipcheck;
+ dld_capab_ipcheck_t spoof;
+ int rc;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ bzero(&spoof, sizeof (spoof));
+ if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
+ &spoof, DLD_ENABLE)) == 0) {
+ idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df;
+ idi->idi_allowed_dh = spoof.ipc_allowed_dh;
+ ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK;
+ } else {
+ cmn_err(CE_WARN, "warning: could not enable IPCHECK "
+ "capability, rc = %d\n", rc);
+ DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc);
+ }
+}
+
static void
ill_capability_dld_enable(ill_t *ill)
{
@@ -2121,6 +2196,8 @@ ill_capability_dld_enable(ill_t *ill)
ill_capability_direct_enable(ill);
ill_capability_poll_enable(ill);
}
+
+ ill_capability_ipcheck_enable(ill);
ill_capability_lso_enable(ill);
ill->ill_capabilities |= ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
@@ -2186,6 +2263,15 @@ ill_capability_dld_disable(ill_t *ill)
NULL, DLD_DISABLE);
}
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) {
+ ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL);
+ ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL);
+
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK;
+ (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
+ NULL, DLD_DISABLE);
+ }
+
ill->ill_capabilities &= ~ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
}
@@ -9676,7 +9762,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
in6_addr_t v6addr;
boolean_t need_up = B_FALSE;
ill_t *ill;
- int i;
ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -9751,20 +9836,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
}
- /*
- * verify that the address being configured is permitted by the
- * ill_allowed_ips[] for the interface.
- */
- if (ill->ill_allowed_ips_cnt > 0) {
- for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
- if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
- &v6addr))
- break;
- }
- if (i == ill->ill_allowed_ips_cnt) {
- pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
- return (EPERM);
- }
+ /* verify that the address being configured is permitted by mac */
+ if (!ill_ipcheck_addr(ill, &v6addr)) {
+ return (EPERM);
}
/*
* Even if there is no change we redo things just to rerun
@@ -12704,6 +12778,12 @@ ill_dl_down(ill_t *ill)
}
ill->ill_unbind_mp = NULL;
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_dl_up = 0;
+ ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
+ mutex_exit(&ill->ill_lock);
+
if (mp != NULL) {
ip1dbg(("ill_dl_down: %s (%u) for %s\n",
dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
@@ -12726,11 +12806,13 @@ ill_dl_down(ill_t *ill)
ill_capability_dld_disable(ill);
ill_capability_reset(ill, B_FALSE);
ill_dlpi_send(ill, mp);
+
+ /*
+ * Wait for the capability reset to finish.
+ * In this case, it doesn't matter WHY or HOW it finished.
+ */
+ (void) ill_capability_wait(ill);
}
- mutex_enter(&ill->ill_lock);
- ill->ill_dl_up = 0;
- ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
- mutex_exit(&ill->ill_lock);
}
void
@@ -12852,6 +12934,7 @@ void
ill_capability_done(ill_t *ill)
{
ASSERT(ill->ill_capab_pending_cnt != 0);
+ ASSERT(IAM_WRITER_ILL(ill));
ill_dlpi_done(ill, DL_CAPABILITY_REQ);
@@ -14480,7 +14563,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
* address/netmask etc cause a down/up dance, but
* does not cause an unbind (DL_UNBIND) with the driver
*/
- return (ill_dl_up(ill, ipif, mp, q));
+ if ((err = ill_dl_up(ill, ipif)) != 0) {
+ return (err);
+ }
+ }
+
+ /* Reject bringing up interfaces with unusable IP addresses */
+ if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) {
+ return (EPERM);
}
/*
@@ -14593,24 +14683,22 @@ ill_delete_ires(ill_t *ill)
/*
* Perform a bind for the physical device.
- * When the routine returns EINPROGRESS then mp has been consumed and
- * the ioctl will be acked from ip_rput_dlpi.
- * Allocate an unbind message and save it until ipif_down.
+ *
+ * When the routine returns successfully then dlpi has been bound and
+ * capabilities negotiated. An unbind message will have been allocated
+ * for later use in ipif_down.
*/
static int
-ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
+ill_dl_up(ill_t *ill, ipif_t *ipif)
{
mblk_t *bind_mp = NULL;
mblk_t *unbind_mp = NULL;
- conn_t *connp;
- boolean_t success;
int err;
DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(mp != NULL);
/*
* Make sure we have an IRE_MULTICAST in case we immediately
@@ -14645,19 +14733,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
if (unbind_mp == NULL)
goto bad;
}
- /*
- * Record state needed to complete this operation when the
- * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
- */
- connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
- ASSERT(connp != NULL || !CONN_Q(q));
- GRAB_CONN_LOCK(q);
- mutex_enter(&ipif->ipif_ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
- mutex_exit(&ipif->ipif_ill->ill_lock);
- RELEASE_CONN_LOCK(q);
- if (!success)
- goto bad;
/*
* Save the unbind message for ill_dl_down(); it will be consumed when
@@ -14669,6 +14744,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
ill_dlpi_send(ill, bind_mp);
/* Send down link-layer capabilities probe if not already done. */
ill_capability_probe(ill);
+ /*
+ * Wait for DLPI to be bound and the capability probe to finish.
+ * The call drops-and-reacquires the squeue. If it couldn't because
+ * ILL_CONDEMNED got set, bail.
+ */
+ if (!ill_capability_wait(ill))
+ return (ENXIO);
+
+ /* DLPI failed to bind. Return the saved error */
+ if (!ill->ill_dl_up) {
+ return (ill->ill_dl_bind_err);
+ }
/*
* Sysid used to rely on the fact that netboots set domainname
@@ -14686,11 +14773,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
cmn_err(CE_WARN, "no cached dhcp response");
}
- /*
- * This operation will complete in ip_rput_dlpi with either
- * a DL_BIND_ACK or DL_ERROR_ACK.
- */
- return (EINPROGRESS);
+ return (0);
bad:
ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 13e961333c..b6565d9c1f 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -153,7 +153,7 @@ ip_squeue_create(pri_t pri)
{
squeue_t *sqp;
- sqp = squeue_create(pri);
+ sqp = squeue_create(pri, B_TRUE);
ASSERT(sqp != NULL);
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 34832d56e5..d47997a4aa 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
* Copyright 2022 Joyent, Inc.
*/
@@ -871,67 +872,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
mutex_exit(&(connfp)->connf_lock); \
}
-#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
- conn_t *pconnp = NULL, *nconnp; \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- nconnp = (connfp)->connf_head; \
- while (nconnp != NULL && \
- !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
- pconnp = nconnp; \
- nconnp = nconnp->conn_next; \
- } \
- if (pconnp != NULL) { \
- pconnp->conn_next = (connp); \
- (connp)->conn_prev = pconnp; \
- } else { \
- (connfp)->connf_head = (connp); \
- } \
- if (nconnp != NULL) { \
- (connp)->conn_next = nconnp; \
- nconnp->conn_prev = (connp); \
- } \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF(connp); \
- mutex_exit(&(connfp)->connf_lock); \
-}
+/*
+ * When inserting bound or wildcard entries into the hash, ordering rules are
+ * used to facilitate timely and correct lookups. The order is as follows:
+ * 1. Entries bound to a specific address
+ * 2. Entries bound to INADDR_ANY
+ * 3. Entries bound to ADDR_UNSPECIFIED
+ * Entries in a category which share conn_lport (such as those using
+ * SO_REUSEPORT) will be ordered such that the newest inserted is first.
+ */
-#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
- conn_t **list, *prev, *next; \
- boolean_t isv4mapped = \
- IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- list = &(connfp)->connf_head; \
- prev = NULL; \
- while ((next = *list) != NULL) { \
- if (isv4mapped && \
- IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
- connp->conn_zoneid == next->conn_zoneid) { \
- (connp)->conn_next = next; \
- if (prev != NULL) \
- prev = next->conn_prev; \
- next->conn_prev = (connp); \
- break; \
- } \
- list = &next->conn_next; \
- prev = next; \
- } \
- (connp)->conn_prev = prev; \
- *list = (connp); \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF((connp)); \
- mutex_exit(&(connfp)->connf_lock); \
+void
+ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
+{
+ conn_t *pconnp, *nconnp;
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ /*
+ * Walk though entries associated with the fanout until one is
+ * found which fulfills any of these conditions:
+ * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
+ * 2. Listen port the same as connp
+ */
+ if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
+ connp->conn_lport == nconnp->conn_lport)
+ break;
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ conn_t **list, *prev, *next;
+ conn_t *pconnp = NULL, *nconnp;
+ boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
+ isv4mapped && connp->conn_lport == nconnp->conn_lport)
+ break;
+ if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
+ (isv4mapped ||
+ connp->conn_lport == nconnp->conn_lport))
+ break;
+
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
/*
@@ -1037,9 +1062,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
} else {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
}
} else {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
@@ -1208,9 +1233,9 @@ ipcl_bind_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (protocol == IPPROTO_RSVP)
ill_set_inputfn_all(ipst);
@@ -1222,9 +1247,9 @@ ipcl_bind_insert_v4(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
ASSERT(connp->conn_ipversion == IPV4_VERSION);
@@ -1274,9 +1299,9 @@ ipcl_bind_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
@@ -1286,9 +1311,9 @@ ipcl_bind_insert_v6(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
sa_family_t addr_family;
@@ -1419,9 +1444,9 @@ ipcl_conn_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -1507,9 +1532,9 @@ ipcl_conn_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -2095,6 +2120,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
connp->conn_flags = IPCL_RAWIPCONN;
connp->conn_proto = IPPROTO_ICMP;
icmp->icmp_connp = connp;
+ rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
if (connp->conn_ixa == NULL)
@@ -2119,6 +2145,7 @@ rawip_conn_destructor(void *buf, void *cdrarg)
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
rw_destroy(&connp->conn_ilg_lock);
+ rw_destroy(&icmp->icmp_bpf_lock);
/* Can be NULL if constructor failed */
if (connp->conn_ixa != NULL) {