summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet')
-rw-r--r--usr/src/uts/common/inet/ilb/ilb_conn.c8
-rw-r--r--usr/src/uts/common/inet/ip/ip_attr.c5
-rw-r--r--usr/src/uts/common/inet/ip/ip_if.c169
-rw-r--r--usr/src/uts/common/inet/ip/ip_squeue.c2
-rw-r--r--usr/src/uts/common/inet/ip/ip_tunables.c1
-rw-r--r--usr/src/uts/common/inet/ip/ipsecesp.c3
-rw-r--r--usr/src/uts/common/inet/ipf/ip_fil_solaris.c124
-rw-r--r--usr/src/uts/common/inet/ipf/ipf.conf5
-rw-r--r--usr/src/uts/common/inet/ipf/netinet/ipf_stack.h10
-rw-r--r--usr/src/uts/common/inet/ipf/solaris.c1
-rw-r--r--usr/src/uts/common/inet/squeue.c100
-rw-r--r--usr/src/uts/common/inet/tcp_impl.h6
12 files changed, 318 insertions, 116 deletions
diff --git a/usr/src/uts/common/inet/ilb/ilb_conn.c b/usr/src/uts/common/inet/ilb/ilb_conn.c
index 5029552f19..7f79d41dd6 100644
--- a/usr/src/uts/common/inet/ilb/ilb_conn.c
+++ b/usr/src/uts/common/inet/ilb/ilb_conn.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2014 Joyent, Inc. All rights reserved.
*/
#include <sys/sysmacros.h>
@@ -365,6 +366,7 @@ ilb_conn_hash_fini(ilb_stack_t *ilbs)
{
uint32_t i;
ilb_conn_t *connp;
+ ilb_conn_hash_t *hash;
if (ilbs->ilbs_c2s_conn_hash == NULL) {
ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
@@ -388,10 +390,10 @@ ilb_conn_hash_fini(ilb_stack_t *ilbs)
ilbs->ilbs_conn_taskq = NULL;
/* Then remove all the conns. */
+ hash = ilbs->ilbs_s2c_conn_hash;
for (i = 0; i < ilbs->ilbs_conn_hash_size; i++) {
- while ((connp = ilbs->ilbs_s2c_conn_hash->ilb_connp) != NULL) {
- ilbs->ilbs_s2c_conn_hash->ilb_connp =
- connp->conn_s2c_next;
+ while ((connp = hash[i].ilb_connp) != NULL) {
+ hash[i].ilb_connp = connp->conn_s2c_next;
ILB_SERVER_REFRELE(connp->conn_server);
if (connp->conn_rule_cache.topo == ILB_TOPO_IMPL_NAT) {
ilb_nat_src_entry_t *ent;
diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c
index 85ee142dfc..c350d67c2d 100644
--- a/usr/src/uts/common/inet/ip/ip_attr.c
+++ b/usr/src/uts/common/inet/ip/ip_attr.c
@@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
*/
if (ixa->ixa_free_flags & IXA_FREE_CRED)
crhold(ixa->ixa_cred);
+
+ /*
+ * There is no cleanup in progress on this new copy.
+ */
+ ixa->ixa_tcpcleanup = IXATC_IDLE;
}
/*
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index d15d86d248..e31f96ebb8 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -24,6 +24,9 @@
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
/*
* This file contains the interface control functions for IP.
@@ -224,6 +227,8 @@ static void ipif_trace_cleanup(const ipif_t *);
static void ill_dlpi_clear_deferred(ill_t *ill);
+static void phyint_flags_init(phyint_t *, t_uscalar_t);
+
/*
* if we go over the memory footprint limit more than once in this msec
* interval, we'll start pruning aggressively.
@@ -282,7 +287,6 @@ static ip_m_t ip_m_tbl[] = {
ip_nodef_v6intfid }
};
-static ill_t ill_null; /* Empty ILL for init. */
char ipif_loopback_name[] = "lo0";
/* These are used by all IP network modules. */
@@ -3331,50 +3335,42 @@ ipsq_init(ill_t *ill, boolean_t enter)
}
/*
- * ill_init is called by ip_open when a device control stream is opened.
- * It does a few initializations, and shoots a DL_INFO_REQ message down
- * to the driver. The response is later picked up in ip_rput_dlpi and
- * used to set up default mechanisms for talking to the driver. (Always
- * called as writer.)
- *
- * If this function returns error, ip_open will call ip_close which in
- * turn will call ill_delete to clean up any memory allocated here that
- * is not yet freed.
+ * Here we perform initialisation of the ill_t common to both regular
+ * interface ILLs and the special loopback ILL created by ill_lookup_on_name.
*/
-int
-ill_init(queue_t *q, ill_t *ill)
+static int
+ill_init_common(ill_t *ill, queue_t *q, boolean_t isv6, boolean_t is_loopback,
+ boolean_t ipsq_enter)
{
- int count;
- dl_info_req_t *dlir;
- mblk_t *info_mp;
+ int count;
uchar_t *frag_ptr;
- /*
- * The ill is initialized to zero by mi_alloc*(). In addition
- * some fields already contain valid values, initialized in
- * ip_open(), before we reach here.
- */
mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0);
mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL);
ill->ill_saved_ire_cnt = 0;
- ill->ill_rq = q;
- ill->ill_wq = WR(q);
+ if (is_loopback) {
+ ill->ill_max_frag = isv6 ? ip_loopback_mtu_v6plus :
+ ip_loopback_mtuplus;
+ /*
+ * No resolver here.
+ */
+ ill->ill_net_type = IRE_LOOPBACK;
+ } else {
+ ill->ill_rq = q;
+ ill->ill_wq = WR(q);
+ ill->ill_ppa = UINT_MAX;
+ }
- info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
- BPRI_HI);
- if (info_mp == NULL)
- return (ENOMEM);
+ ill->ill_isv6 = isv6;
/*
* Allocate sufficient space to contain our fragment hash table and
* the device name.
*/
frag_ptr = (uchar_t *)mi_zalloc(ILL_FRAG_HASH_TBL_SIZE + 2 * LIFNAMSIZ);
- if (frag_ptr == NULL) {
- freemsg(info_mp);
+ if (frag_ptr == NULL)
return (ENOMEM);
- }
ill->ill_frag_ptr = frag_ptr;
ill->ill_frag_free_num_pkts = 0;
ill->ill_last_frag_clean_time = 0;
@@ -3387,35 +3383,30 @@ ill_init(queue_t *q, ill_t *ill)
ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
if (ill->ill_phyint == NULL) {
- freemsg(info_mp);
mi_free(frag_ptr);
return (ENOMEM);
}
mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
- /*
- * For now pretend this is a v4 ill. We need to set phyint_ill*
- * at this point because of the following reason. If we can't
- * enter the ipsq at some point and cv_wait, the writer that
- * wakes us up tries to locate us using the list of all phyints
- * in an ipsq and the ills from the phyint thru the phyint_ill*.
- * If we don't set it now, we risk a missed wakeup.
- */
- ill->ill_phyint->phyint_illv4 = ill;
- ill->ill_ppa = UINT_MAX;
+ if (isv6) {
+ ill->ill_phyint->phyint_illv6 = ill;
+ } else {
+ ill->ill_phyint->phyint_illv4 = ill;
+ }
+ if (is_loopback) {
+ phyint_flags_init(ill->ill_phyint, DL_LOOP);
+ }
+
list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
ill_set_inputfn(ill);
- if (!ipsq_init(ill, B_TRUE)) {
- freemsg(info_mp);
+ if (!ipsq_init(ill, ipsq_enter)) {
mi_free(frag_ptr);
mi_free(ill->ill_phyint);
return (ENOMEM);
}
- ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
-
/* Frag queue limit stuff */
ill->ill_frag_count = 0;
ill->ill_ipf_gen = 0;
@@ -3440,6 +3431,53 @@ ill_init(queue_t *q, ill_t *ill)
ill->ill_max_buf = ND_MAX_Q;
ill->ill_refcnt = 0;
+ return (0);
+}
+
+/*
+ * ill_init is called by ip_open when a device control stream is opened.
+ * It does a few initializations, and shoots a DL_INFO_REQ message down
+ * to the driver. The response is later picked up in ip_rput_dlpi and
+ * used to set up default mechanisms for talking to the driver. (Always
+ * called as writer.)
+ *
+ * If this function returns error, ip_open will call ip_close which in
+ * turn will call ill_delete to clean up any memory allocated here that
+ * is not yet freed.
+ *
+ * Note: ill_ipst and ill_zoneid must be set before calling ill_init.
+ */
+int
+ill_init(queue_t *q, ill_t *ill)
+{
+ int ret;
+ dl_info_req_t *dlir;
+ mblk_t *info_mp;
+
+ info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+ BPRI_HI);
+ if (info_mp == NULL)
+ return (ENOMEM);
+
+ /*
+ * The ill is initialized to zero by mi_alloc*(). In addition
+ * some fields already contain valid values, initialized in
+ * ip_open(), before we reach here.
+ *
+ * For now pretend this is a v4 ill. We need to set phyint_ill*
+ * at this point because of the following reason. If we can't
+ * enter the ipsq at some point and cv_wait, the writer that
+ * wakes us up tries to locate us using the list of all phyints
+ * in an ipsq and the ills from the phyint thru the phyint_ill*.
+ * If we don't set it now, we risk a missed wakeup.
+ */
+ if ((ret = ill_init_common(ill, q, B_FALSE, B_FALSE, B_TRUE)) != 0) {
+ freemsg(info_mp);
+ return (ret);
+ }
+
+ ill->ill_state_flags |= ILL_LL_SUBNET_PENDING;
+
/* Send down the Info Request to the driver. */
info_mp->b_datap->db_type = M_PCPROTO;
dlir = (dl_info_req_t *)info_mp->b_rptr;
@@ -3687,10 +3725,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
if (ill == NULL)
goto done;
- *ill = ill_null;
- mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL);
+ bzero(ill, sizeof (*ill));
ill->ill_ipst = ipst;
- list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node));
netstack_hold(ipst->ips_netstack);
/*
* For exclusive stacks we set the zoneid to zero
@@ -3698,25 +3734,12 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
*/
ill->ill_zoneid = GLOBAL_ZONEID;
- ill->ill_phyint = (phyint_t *)mi_zalloc(sizeof (phyint_t));
- if (ill->ill_phyint == NULL)
+ if (ill_init_common(ill, NULL, isv6, B_TRUE, B_FALSE) != 0)
goto done;
- if (isv6)
- ill->ill_phyint->phyint_illv6 = ill;
- else
- ill->ill_phyint->phyint_illv4 = ill;
- mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0);
- phyint_flags_init(ill->ill_phyint, DL_LOOP);
-
- if (isv6) {
- ill->ill_isv6 = B_TRUE;
- ill->ill_max_frag = ip_loopback_mtu_v6plus;
- } else {
- ill->ill_max_frag = ip_loopback_mtuplus;
- }
if (!ill_allocate_mibs(ill))
goto done;
+
ill->ill_current_frag = ill->ill_max_frag;
ill->ill_mtu = ill->ill_max_frag; /* Initial value */
ill->ill_mc_mtu = ill->ill_mtu;
@@ -3732,21 +3755,6 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
/* Set ill_dlpi_pending for ipsq_current_finish() to work properly */
ill->ill_dlpi_pending = DL_PRIM_INVAL;
- rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL);
- mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL);
- ill->ill_global_timer = INFINITY;
- ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0;
- ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0;
- ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
- ill->ill_mcast_qi = MCAST_DEF_QUERY_INTERVAL;
-
- /* No resolver here. */
- ill->ill_net_type = IRE_LOOPBACK;
-
- /* Initialize the ipsq */
- if (!ipsq_init(ill, B_FALSE))
- goto done;
-
ipif = ipif_allocate(ill, 0L, IRE_LOOPBACK, B_TRUE, B_TRUE, NULL);
if (ipif == NULL)
goto done;
@@ -3775,17 +3783,10 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6,
* Chain us in at the end of the ill list. hold the ill
* before we make it globally visible. 1 for the lookup.
*/
- ill->ill_refcnt = 0;
ill_refhold(ill);
- ill->ill_frag_count = 0;
- ill->ill_frag_free_num_pkts = 0;
- ill->ill_last_frag_clean_time = 0;
-
ipsq = ill->ill_phyint->phyint_ipsq;
- ill_set_inputfn(ill);
-
if (ill_glist_insert(ill, "lo", isv6) != 0)
cmn_err(CE_PANIC, "cannot insert loopback interface");
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 33a2fa5935..dedb4dadcc 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri)
{
squeue_t *sqp;
- sqp = squeue_create(ip_squeue_worker_wait, pri);
+ sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE);
ASSERT(sqp != NULL);
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ip/ip_tunables.c b/usr/src/uts/common/inet/ip/ip_tunables.c
index 58e3b59ff3..dda05a316d 100644
--- a/usr/src/uts/common/inet/ip/ip_tunables.c
+++ b/usr/src/uts/common/inet/ip/ip_tunables.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c
index c325e8dc26..2ca770ebe9 100644
--- a/usr/src/uts/common/inet/ip/ipsecesp.c
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c
@@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
{
espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
"net", KSTAT_TYPE_NAMED,
- sizeof (esp_kstats_t) / sizeof (kstat_named_t),
- KSTAT_FLAG_PERSISTENT, stackid);
+ sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
return (B_FALSE);
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index 20dc18b588..0c3bb327ba 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
*/
#if !defined(lint)
@@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+ void *));
extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *));
@@ -152,6 +160,16 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
char *hook6_loop_out = "ipfilter_hook6_loop_out";
char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in";
+char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz";
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in";
+char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz";
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out";
+char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz";
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out";
+char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz";
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -248,6 +266,31 @@ ipf_stack_t *ifs;
ifs->ifs_ipf_ipv4 = NULL;
}
+ /*
+ * Remove VND hooks
+ */
+ if (ifs->ifs_ipf_vndl3v4 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v4 = NULL;
+ }
+
+ if (ifs->ifs_ipf_vndl3v6 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v6 = NULL;
+ }
+
#undef UNDO_HOOK
#ifdef IPFDEBUG
@@ -445,6 +488,48 @@ ipf_stack_t *ifs;
}
/*
+ * Add VND INET hooks
+ */
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+ if (ifs->ifs_ipf_vndl3v4 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+ hook4_vnd_in, hook4_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+ hook4_vnd_out, hook4_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_out)
+ goto hookup_failed;
+
+
+ /*
+ * VND INET6 hooks
+ */
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+ if (ifs->ifs_ipf_vndl3v6 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+ hook6_vnd_in, hook6_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+ hook6_vnd_out, hook6_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_out)
+ goto hookup_failed;
+ /*
* Reacquire ipf_global, now it is safe.
*/
WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -1011,7 +1096,6 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
-
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
*/
@@ -2015,6 +2099,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
}
/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookvndl3_in */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The vnd hooks are private hooks to ON. They represents a layer 2 */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the */
+/* traditional packet hook flags. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
/* Function: ipf_hook4_loop_in */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
/* Parameters: event(I) - pointer to event */
diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf
index 6b36f9fdbf..f49e024a72 100644
--- a/usr/src/uts/common/inet/ipf/ipf.conf
+++ b/usr/src/uts/common/inet/ipf/ipf.conf
@@ -1,3 +1,8 @@
#
#
name="ipf" parent="pseudo" instance=0;
+
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize,
+# and both should be prime numbers
+fr_statesize=151007;
+fr_statemax=113279;
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index a239f1c1ca..9aa2478c6a 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -125,6 +125,10 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_in;
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookvndl3v4_in;
+ hook_t *ifs_ipfhookvndl3v6_in;
+ hook_t *ifs_ipfhookvndl3v4_out;
+ hook_t *ifs_ipfhookvndl3v6_out;
/* flags to indicate whether hooks are registered. */
boolean_t ifs_hook4_physical_in;
@@ -137,10 +141,16 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookvndl3v4_physical_in;
+ boolean_t ifs_hookvndl3v6_physical_in;
+ boolean_t ifs_hookvndl3v4_physical_out;
+ boolean_t ifs_hookvndl3v6_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_vndl3v4;
+ net_handle_t ifs_ipf_vndl3v6;
/* ip_auth.c */
int ifs_fr_authsize;
diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c
index c541f4dddc..5d56debc31 100644
--- a/usr/src/uts/common/inet/ipf/solaris.c
+++ b/usr/src/uts/common/inet/ipf/solaris.c
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
/*
* Destroy things for ipf for one stack.
*/
-/* ARGSUSED */
static void
ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
{
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 2e08dc359b..1009f0700f 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,7 +23,7 @@
*/
/*
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
*/
/*
@@ -61,6 +61,10 @@
* connection are processed on that squeue. The connection ("conn") to
* squeue mapping is stored in "conn_t" member "conn_sqp".
*
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and swtiching squeues will not be used.
+ *
* Since the processing of the connection cuts across multiple layers
* but still allows packets for different connnection to be processed on
* other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -244,7 +248,7 @@ squeue_init(void)
/* ARGSUSED */
squeue_t *
-squeue_create(clock_t wait, pri_t pri)
+squeue_create(clock_t wait, pri_t pri, boolean_t isip)
{
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
@@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri)
sqp->sq_enter = squeue_enter;
sqp->sq_drain = squeue_drain;
+ sqp->sq_isip = isip;
return (sqp);
}
/*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+ kt_did_t worker, poll;
+ mutex_enter(&sqp->sq_lock);
+ VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+ worker = sqp->sq_worker->t_did;
+ poll = sqp->sq_poll_thr->t_did;
+ sqp->sq_state |= SQS_EXIT;
+ cv_signal(&sqp->sq_poll_cv);
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+
+ thread_join(poll);
+ thread_join(worker);
+ kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
* Bind squeue worker thread to the specified CPU, given by CPU id.
* If the CPU id value is -1, bind the worker thread to the value
* specified in sq_bind field. If a thread is already bound to a
@@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
SQUEUE_DBG_CLEAR(sqp);
- CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
return;
}
} else {
- if (ira != NULL) {
+ if (sqp->sq_isip == B_TRUE && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
if (!(sqp->sq_state & SQS_REENTER) &&
(process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
(sqp->sq_run == curthread) && (cnt == 1) &&
- (connp->conn_on_sqp == B_FALSE)) {
+ (sqp->sq_isip == B_FALSE ||
+ connp->conn_on_sqp == B_FALSE)) {
sqp->sq_state |= SQS_REENTER;
mutex_exit(&sqp->sq_lock);
@@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ tag);
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
#ifdef DEBUG
mp->b_tag = tag;
#endif
- if (ira != NULL) {
+ if (sqp->sq_isip && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -779,7 +818,7 @@ again:
mp->b_prev = NULL;
/* Is there an ip_recv_attr_t to handle? */
- if (ip_recv_attr_is_mblk(mp)) {
+ if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) {
mblk_t *attrmp = mp;
ASSERT(attrmp->b_cont != NULL);
@@ -804,20 +843,25 @@ again:
/*
- * Handle squeue switching. More details in the
- * block comment at the top of the file
+ * Handle squeue switching. More details in the block comment at
+ * the top of the file. non-IP squeues cannot switch, as there
+ * is no conn_t.
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
mp->b_tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp)
cv_wait(async, lock);
CALLB_CPR_SAFE_END(&cprinfo, lock);
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
SQS_POLL_THR_QUIESCED);
if (ctl_state != 0) {
@@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp)
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
+ /* Only IP related squeues should reach this point */
+ VERIFY(sqp->sq_isip == B_TRUE);
+
poll_again:
sq_rx_ring = sqp->sq_rx_ring;
sq_get_pkts = sq_rx_ring->rr_rx;
@@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp)
ill_rx_ring_t *rx_ring;
ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ VERIFY(sqp->sq_isip == B_TRUE);
if (sqp->sq_state & SQS_POLL_RESTART) {
/* Restart implies a previous quiesce. */
@@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp)
for (;;) {
for (;;) {
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
/*
* If the poll thread has handed control to us
* we need to break out of the wait.
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp)
again:
sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1487,6 +1546,7 @@ void
squeue_synch_exit(conn_t *connp)
{
squeue_t *sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_run == curthread) {
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 1b20e40aca..3488630f2c 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent Inc. All rights reserved.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
@@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls;
* by setting it to 0.
*/
#define TCP_XMIT_LOWATER 4096
-#define TCP_XMIT_HIWATER 49152
+#define TCP_XMIT_HIWATER 128000
#define TCP_RECV_LOWATER 2048
-#define TCP_RECV_HIWATER 128000
+#define TCP_RECV_HIWATER 1048576
/*
* Bind hash list size and has function. It has to be a power of 2 for