diff options
Diffstat (limited to 'usr/src/uts/common/inet/ip')
-rw-r--r-- | usr/src/uts/common/inet/ip/conn_opt.c | 22 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/icmp.c | 117 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/icmp_opt_data.c | 6 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip.c | 94 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_if.c | 177 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_squeue.c | 2 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ipclassifier.c | 165 |
7 files changed, 392 insertions, 191 deletions
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index 7aac9b655a..eeec56b162 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -644,6 +645,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1214,8 +1218,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } ifindex = UINT_MAX; switch (name) { diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 57ee0c5585..46c791298a 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -81,6 +81,7 @@ #include <sys/tsol/tnet.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> #include <sys/disp.h> @@ -1018,6 +1019,12 @@ icmp_close_free(conn_t *connp) icmp->icmp_filter = NULL; } + if (icmp->icmp_bpf_len != 0) { + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + } + /* * Clear any fields which the kmem_cache constructor clears. * Only icmp_connp needs to be preserved. @@ -1971,6 +1978,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (err); } +static int +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp) +{ + struct bpf_program prog; + ip_bpf_insn_t *insns = NULL; + unsigned int size; + +#ifdef _LP64 + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct bpf_program32 *prog32; + + if (inlen != sizeof (struct bpf_program32)) { + return (EINVAL); + } + prog32 = (struct bpf_program32 *)invalp; + prog.bf_len = prog32->bf_len; + prog.bf_insns = (void *)(uint64_t)prog32->bf_insns; + } else +#endif + if (inlen == sizeof (struct bpf_program)) { + bcopy(invalp, &prog, sizeof (prog)); + } else { + return (EINVAL); + } + + if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) { + return (EINVAL); + } + size = prog.bf_len * sizeof (struct bpf_insn); + insns = kmem_alloc(size, KM_SLEEP); + if (copyin(prog.bf_insns, insns, size) != 0) { + kmem_free(insns, size); + return (EFAULT); + } + if (!ip_bpf_validate(insns, prog.bf_len)) { + kmem_free(insns, size); + return (EINVAL); + } + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len != 0) { + ASSERT(icmp->icmp_bpf_prog != NULL); + + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + } + icmp->icmp_bpf_len = size; + icmp->icmp_bpf_prog = insns; + rw_exit(&icmp->icmp_bpf_lock); + return (0); +} + +static int +icmp_detach_filter(icmp_t *icmp) +{ + int error; + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len == 0) { + ASSERT(icmp->icmp_bpf_prog == NULL); + error = ENOENT; + } else { + kmem_free(icmp->icmp_bpf_prog, + icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + error = 0; + } + rw_exit(&icmp->icmp_bpf_lock); + return (error); +} + +static boolean_t +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira) +{ + boolean_t res; + uchar_t *buf = mp->b_rptr; + uint_t wirelen, len = MBLKL(mp); + + rw_enter(&icmp->icmp_bpf_lock, RW_READER); + if (icmp->icmp_bpf_len == 0) { + rw_exit(&icmp->icmp_bpf_lock); + return (B_FALSE); + } + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)buf; + + wirelen = ntohs(ipha->ipha_length); + } else { + ip6_t *ip6h = (ip6_t *)buf; + + wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len); + rw_exit(&icmp->icmp_bpf_lock); + + return (res); +} + /* * This routine sets socket options. */ @@ -2060,6 +2165,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, return (ENOBUFS); } break; + case SO_ATTACH_FILTER: + return (icmp_attach_filter(icmp, inlen, invalp)); + case SO_DETACH_FILTER: + return (icmp_detach_filter(icmp)); } break; @@ -2605,6 +2714,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* Initialize regardless of IP version */ ipps.ipp_fields = 0; + /* Apply socket filter, if needed */ + if (icmp->icmp_bpf_len != 0) { + if (icmp_eval_filter(icmp, mp, ira)) { + freemsg(mp); + return; + } + } + if (ira->ira_flags & IRAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); ASSERT(MBLKL(mp) >= sizeof (ipha_t)); diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index ff0310de0c..d65d3164d3 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -41,6 +42,7 @@ #include <netinet/ip_mroute.h> #include <inet/optcom.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> /* * Table of all known options handled on a ICMP protocol stack. @@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = { 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, + sizeof (struct bpf_program), 0 }, +{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 6063fa01d2..704f152bb9 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -8235,7 +8235,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) conn_t *connp = NULL; t_uscalar_t paddrreq; mblk_t *mp_hw; - boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; @@ -8335,7 +8334,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Something went wrong with the bind. We presumably + * Something went wrong with the bind. If this was the + * result of a DL_NOTE_REPLUMB, then we presumably * have an IOCTL hanging out waiting for completion. * Find it, take down the interface that was coming * up, and complete the IOCTL with the error noted. @@ -8352,6 +8352,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ + } else { + /* + * There's no pending IOCTL, so the bind was + * most likely started by ill_dl_up(). We save + * the error and let it take care of responding + * to the IOCTL. + */ + ill->ill_dl_bind_err = dlea->dl_unix_errno ? + dlea->dl_unix_errno : ENXIO; } break; case DL_ENABMULTI_REQ: @@ -8475,55 +8484,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); - /* - * Now bring up the resolver; when that is complete, we'll - * create IREs. Note that we intentionally mirror what - * ipif_up() would have done, because we got here by way of - * ill_dl_up(), which stopped ipif_up()'s processing. - */ - if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } else if (ill->ill_net_type == IRE_IF_RESOLVER) { - /* - * ARP and other v4 external resolvers. - * Leave the pending mblk intact so that - * the ioctl completes in ip_rput(). - */ - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - mp1 = ipsq_pending_mp_get(ipsq, &connp); - } else { - /* The conn has started closing */ - err = EINTR; - } - } else { - /* - * This one is complete. Reply to pending ioctl. - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done(ipif); - } - - if ((err == 0) && (ill->ill_up_ipifs)) { + if (ill->ill_up_ipifs) { err = ill_up_ipifs(ill, q, mp1); if (err == EINPROGRESS) { freemsg(mp); @@ -8531,25 +8492,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - /* - * If we have a moved ipif to bring up, and everything has - * succeeded to this point, bring it up on the IPMP ill. - * Otherwise, leave it down -- the admin can try to bring it - * up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - if (err != 0) { - ill->ill_move_ipif = NULL; - } else { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - } - } break; case DL_NOTIFY_IND: { @@ -12621,6 +12563,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12742,6 +12685,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. @@ -12749,9 +12699,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index cc67299a1b..2307837eb8 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen, static int ill_alloc_ppa(ill_if_t *, ill_t *); static void ill_delete_interface_type(ill_if_t *); -static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); +static int ill_dl_up(ill_t *ill, ipif_t *ipif); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); static void ill_down_ipifs(ill_t *, boolean_t); @@ -1380,6 +1380,36 @@ ill_capability_probe(ill_t *ill) ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } +static boolean_t +ill_capability_wait(ill_t *ill) +{ + /* + * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can + * only be set by someone who is the writer. Since we + * drop-and-reacquire the squeue in this loop, we need to check for + * ILL_CONDEMNED, which if set means nothing can signal our capability + * condition variable. + */ + ASSERT(IAM_WRITER_ILL(ill)); + + while (ill->ill_capab_pending_cnt != 0 && + (ill->ill_state_flags & ILL_CONDEMNED) == 0) { + /* This may enable blocked callers of ill_capability_done(). */ + ipsq_exit(ill->ill_phyint->phyint_ipsq); + /* Pause a bit (1msec) before we re-enter the squeue. */ + delay(drv_usectohz(1000000)); + + /* + * If ipsq_enter() fails, someone set ILL_CONDEMNED + * while we dropped the squeue. Indicate such to the caller. + */ + if (!ipsq_enter(ill, B_FALSE, CUR_OP)) + return (B_FALSE); + } + + return ((ill->ill_state_flags & ILL_CONDEMNED) == 0); +} + void ill_capability_reset(ill_t *ill, boolean_t reneg) { @@ -1390,6 +1420,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; + ASSERT(ill->ill_capab_reset_mp != NULL); + ill_capability_send(ill, ill->ill_capab_reset_mp); ill->ill_capab_reset_mp = NULL; /* @@ -2109,6 +2141,49 @@ ill_capability_lso_enable(ill_t *ill) } } +/* + * Check whether or not mac will prevent us from sending with a given IP + * address. This requires having the IPCHECK capability, which we should + * always be able to successfully negotiate, but if it's somehow missing + * then we just permit the caller to use the address, since mac does the + * actual enforcement and ip is just performing a courtesy check to help + * prevent users from unwittingly setting and attempting to use blocked + * addresses. + */ +static boolean_t +ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr) +{ + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0) + return (B_TRUE); + + ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck; + ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df; + return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr)); +} + +static void +ill_capability_ipcheck_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_ipcheck_t *idi = &idc->idc_ipcheck; + dld_capab_ipcheck_t spoof; + int rc; + + ASSERT(IAM_WRITER_ILL(ill)); + + bzero(&spoof, sizeof (spoof)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + &spoof, DLD_ENABLE)) == 0) { + idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df; + idi->idi_allowed_dh = spoof.ipc_allowed_dh; + ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK; + } else { + cmn_err(CE_WARN, "warning: could not enable IPCHECK " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc); + } +} + static void ill_capability_dld_enable(ill_t *ill) { @@ -2121,6 +2196,8 @@ ill_capability_dld_enable(ill_t *ill) ill_capability_direct_enable(ill); ill_capability_poll_enable(ill); } + + ill_capability_ipcheck_enable(ill); ill_capability_lso_enable(ill); ill->ill_capabilities |= ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); @@ -2186,6 +2263,15 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) { + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL); + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL); + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + NULL, DLD_DISABLE); + } + ill->ill_capabilities &= ~ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ -9676,7 +9762,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, in6_addr_t v6addr; boolean_t need_up = B_FALSE; ill_t *ill; - int i; ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -9751,20 +9836,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); } - /* - * verify that the address being configured is permitted by the - * ill_allowed_ips[] for the interface. - */ - if (ill->ill_allowed_ips_cnt > 0) { - for (i = 0; i < ill->ill_allowed_ips_cnt; i++) { - if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], - &v6addr)) - break; - } - if (i == ill->ill_allowed_ips_cnt) { - pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr); - return (EPERM); - } + /* verify that the address being configured is permitted by mac */ + if (!ill_ipcheck_addr(ill, &v6addr)) { + return (EPERM); } /* * Even if there is no change we redo things just to rerun @@ -12704,6 +12778,12 @@ ill_dl_down(ill_t *ill) } ill->ill_unbind_mp = NULL; + + mutex_enter(&ill->ill_lock); + ill->ill_dl_up = 0; + ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); + mutex_exit(&ill->ill_lock); + if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, @@ -12726,11 +12806,13 @@ ill_dl_down(ill_t *ill) ill_capability_dld_disable(ill); ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); + + /* + * Wait for the capability reset to finish. + * In this case, it doesn't matter WHY or HOW it finished. + */ + (void) ill_capability_wait(ill); } - mutex_enter(&ill->ill_lock); - ill->ill_dl_up = 0; - ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); - mutex_exit(&ill->ill_lock); } void @@ -12852,6 +12934,7 @@ void ill_capability_done(ill_t *ill) { ASSERT(ill->ill_capab_pending_cnt != 0); + ASSERT(IAM_WRITER_ILL(ill)); ill_dlpi_done(ill, DL_CAPABILITY_REQ); @@ -14480,7 +14563,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) * address/netmask etc cause a down/up dance, but * does not cause an unbind (DL_UNBIND) with the driver */ - return (ill_dl_up(ill, ipif, mp, q)); + if ((err = ill_dl_up(ill, ipif)) != 0) { + return (err); + } + } + + /* Reject bringing up interfaces with unusable IP addresses */ + if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) { + return (EPERM); } /* @@ -14593,24 +14683,22 @@ ill_delete_ires(ill_t *ill) /* * Perform a bind for the physical device. - * When the routine returns EINPROGRESS then mp has been consumed and - * the ioctl will be acked from ip_rput_dlpi. - * Allocate an unbind message and save it until ipif_down. + * + * When the routine returns successfully then dlpi has been bound and + * capabilities negotiated. An unbind message will have been allocated + * for later use in ipif_down. */ static int -ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) +ill_dl_up(ill_t *ill, ipif_t *ipif) { mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; - conn_t *connp; - boolean_t success; int err; DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(mp != NULL); /* * Make sure we have an IRE_MULTICAST in case we immediately @@ -14645,19 +14733,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) if (unbind_mp == NULL) goto bad; } - /* - * Record state needed to complete this operation when the - * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. - */ - connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; - ASSERT(connp != NULL || !CONN_Q(q)); - GRAB_CONN_LOCK(q); - mutex_enter(&ipif->ipif_ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); - mutex_exit(&ipif->ipif_ill->ill_lock); - RELEASE_CONN_LOCK(q); - if (!success) - goto bad; /* * Save the unbind message for ill_dl_down(); it will be consumed when @@ -14669,6 +14744,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) ill_dlpi_send(ill, bind_mp); /* Send down link-layer capabilities probe if not already done. */ ill_capability_probe(ill); + /* + * Wait for DLPI to be bound and the capability probe to finish. + * The call drops-and-reacquires the squeue. If it couldn't because + * ILL_CONDEMNED got set, bail. + */ + if (!ill_capability_wait(ill)) + return (ENXIO); + + /* DLPI failed to bind. Return the saved error */ + if (!ill->ill_dl_up) { + return (ill->ill_dl_bind_err); + } /* * Sysid used to rely on the fact that netboots set domainname @@ -14686,11 +14773,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) cmn_err(CE_WARN, "no cached dhcp response"); } - /* - * This operation will complete in ip_rput_dlpi with either - * a DL_BIND_ACK or DL_ERROR_ACK. - */ - return (EINPROGRESS); + return (0); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 13e961333c..b6565d9c1f 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -153,7 +153,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(pri); + sqp = squeue_create(pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 34832d56e5..d47997a4aa 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 Joyent, Inc. */ @@ -871,67 +872,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. + */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk though entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1037,9 +1062,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1208,9 +1233,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1222,9 +1247,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1274,9 +1299,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1286,9 +1311,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1419,9 +1444,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1507,9 +1532,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -2095,6 +2120,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_flags = IPCL_RAWIPCONN; connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL); rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); if (connp->conn_ixa == NULL) @@ -2119,6 +2145,7 @@ rawip_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); rw_destroy(&connp->conn_ilg_lock); + rw_destroy(&icmp->icmp_bpf_lock); /* Can be NULL if constructor failed */ if (connp->conn_ixa != NULL) { |