summaryrefslogtreecommitdiff
path: root/usr/src/uts
diff options
context:
space:
mode:
author: priyanka <none@none> 2005-12-12 15:24:29 -0800
committer: priyanka <none@none> 2005-12-12 15:24:29 -0800
commit: 43d18f1c320355e93c47399bea0b2e022fe06364 (patch)
tree: b34f2864b862c5ac66d6014d591939f3e9f6704d /usr/src/uts
parent: 8fbd927ce8f563deec0dfab8fbb461dd1bfff20c (diff)
download: illumos-joyent-43d18f1c320355e93c47399bea0b2e022fe06364.tar.gz
PSARC 2005/603 IP_NEXTHOP socket option
6264845 Need Policy Based Routing support in Solaris
Diffstat (limited to 'usr/src/uts')
-rw-r--r-- usr/src/uts/common/inet/ip.h 9
-rw-r--r-- usr/src/uts/common/inet/ip/ip.c 273
-rw-r--r-- usr/src/uts/common/inet/ip/ip_ire.c 19
-rw-r--r-- usr/src/uts/common/inet/ip_impl.h 1
-rw-r--r-- usr/src/uts/common/inet/ip_ire.h 4
-rw-r--r-- usr/src/uts/common/inet/ipclassifier.h 5
-rw-r--r-- usr/src/uts/common/inet/ipsec_info.h 5
-rw-r--r-- usr/src/uts/common/inet/sctp/sctp_opt_data.c 41
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp.c 264
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_opt_data.c 3
-rw-r--r-- usr/src/uts/common/inet/udp/udp.c 5
-rw-r--r-- usr/src/uts/common/inet/udp/udp_opt_data.c 3
-rw-r--r-- usr/src/uts/common/netinet/in.h 3
13 files changed, 448 insertions, 187 deletions
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 507dfad5d6..f286253080 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -618,6 +618,15 @@ typedef struct ip_m_s {
*/
#define IRE_MARK_USESRC_CHECK 0x0020
+/*
+ * IRE_MARK_PRIVATE_ADDR is used for IP_NEXTHOP. When IP_NEXTHOP is set, the
+ * routing table lookup for the destination is bypassed and the packet is
+ * sent directly to the specified nexthop. The associated IRE_CACHE entries
+ * should be marked with IRE_MARK_PRIVATE_ADDR flag so that they don't show up
+ * in regular ire cache lookups.
+ */
+#define IRE_MARK_PRIVATE_ADDR 0x0040
+
/* Flags with ire_expire routine */
#define FLUSH_ARP_TIME 0x0001 /* ARP info potentially stale timer */
#define FLUSH_REDIRECT_TIME 0x0002 /* Redirects potentially stale */
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index a988b67cbb..17884e9d59 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -4113,23 +4113,30 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
MATCH_IRE_RJ_BHOLE));
} else {
/*
- * If conn_dontroute is set, and onlink ipif is not found
- * set ENETUNREACH error
+ * If conn_dontroute is set or if conn_nexthop_set is set,
+ * and onlink ipif is not found set ENETUNREACH error.
*/
- if (connp->conn_dontroute) {
+ if (connp->conn_dontroute || connp->conn_nexthop_set) {
ipif_t *ipif;
- ipif = ipif_lookup_onlink_addr(dst_addr, zoneid);
+ ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ?
+ dst_addr : connp->conn_nexthop_v4, zoneid);
if (ipif == NULL) {
error = ENETUNREACH;
goto bad_addr;
}
ipif_refrele(ipif);
}
- dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, &sire,
- zoneid,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
- MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE));
+
+ if (connp->conn_nexthop_set) {
+ dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0,
+ 0, 0, NULL, NULL, zoneid, 0);
+ } else {
+ dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL,
+ &sire, zoneid,
+ (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
+ MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE));
+ }
}
/*
* dst_ire can't be a broadcast when not ire_requested.
@@ -6691,6 +6698,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
ire_t *ire = NULL;
mblk_t *res_mp;
ipaddr_t *addrp;
+ ipaddr_t nexthop_addr;
ipif_t *src_ipif = NULL;
ill_t *dst_ill = NULL;
ipha_t *ipha;
@@ -6712,6 +6720,7 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
boolean_t multirt_is_resolvable;
boolean_t multirt_resolve_next;
boolean_t do_attach_ill = B_FALSE;
+ boolean_t ip_nexthop = B_FALSE;
zoneid_t zoneid;
if (ip_debug > 2) {
@@ -6760,6 +6769,10 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
if (ill_is_probeonly(attach_ill))
ire_marks = IRE_MARK_HIDDEN;
}
+ if (mctl_present && io->ipsec_out_ip_nexthop) {
+ ip_nexthop = B_TRUE;
+ nexthop_addr = io->ipsec_out_nexthop_addr;
+ }
/*
* If this IRE is created for forwarding or it is not for
* traffic for congestion controlled protocols, mark it as temporary.
@@ -6788,6 +6801,28 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
if (in_ill != NULL) {
ire = ire_srcif_table_lookup(dst, IRE_IF_RESOLVER, NULL,
in_ill, MATCH_IRE_TYPE);
+ } else if (ip_nexthop) {
+ /*
+ * The first time we come here, we look for an IRE_INTERFACE
+ * entry for the specified nexthop, set the dst to be the
+ * nexthop address and create an IRE_CACHE entry for the
+ * nexthop. The next time around, we are able to find an
+ * IRE_CACHE entry for the nexthop, set the gateway to be the
+ * nexthop address and create an IRE_CACHE entry for the
+ * destination address via the specified nexthop.
+ */
+ ire = ire_cache_lookup(nexthop_addr, zoneid);
+ if (ire != NULL) {
+ gw = nexthop_addr;
+ ire_marks |= IRE_MARK_PRIVATE_ADDR;
+ } else {
+ ire = ire_ftable_lookup(nexthop_addr, 0, 0,
+ IRE_INTERFACE, NULL, NULL, zoneid, 0,
+ MATCH_IRE_TYPE);
+ if (ire != NULL) {
+ dst = nexthop_addr;
+ }
+ }
} else if (attach_ill == NULL) {
ire = ire_ftable_lookup(dst, 0, 0, 0,
NULL, &sire, zoneid, 0,
@@ -7211,7 +7246,6 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
ire_t *ipif_ire;
mblk_t *ire_fp_mp;
- ASSERT(sire != NULL);
if (gw == 0)
gw = ire->ire_gateway_addr;
/*
@@ -7219,7 +7253,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
* off-link destination from the cache ire of the
* gateway.
*
- * 1. The prefix ire 'sire'
+ * 1. The prefix ire 'sire' (Note that this does
+ * not apply to the conn_nexthop_set case)
* 2. The cache ire of the gateway 'ire'
* 3. The interface ire 'ipif_ire'
*
@@ -7227,9 +7262,14 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
*
* If there is no interface route to the gateway,
* it is a race condition, where we found the cache
- * but the inteface route has been deleted.
+ * but the interface route has been deleted.
*/
- ipif_ire = ire_ihandle_lookup_offlink(ire, sire);
+ if (ip_nexthop) {
+ ipif_ire = ire_ihandle_lookup_onlink(ire);
+ } else {
+ ipif_ire =
+ ire_ihandle_lookup_offlink(ire, sire);
+ }
if (ipif_ire == NULL) {
ip1dbg(("ip_newroute: "
"ire_ihandle_lookup_offlink failed\n"));
@@ -7268,19 +7308,21 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
save_ire->ire_dlureq_mp,
src_ipif,
in_ill, /* incoming ill */
- sire->ire_mask, /* Parent mask */
- sire->ire_phandle, /* Parent handle */
+ (sire != NULL) ?
+ sire->ire_mask : 0, /* Parent mask */
+ (sire != NULL) ?
+ sire->ire_phandle : 0, /* Parent handle */
ipif_ire->ire_ihandle, /* Interface handle */
- sire->ire_flags &
- (RTF_SETSRC | RTF_MULTIRT), /* flags if any */
- &(sire->ire_uinfo));
+ (sire != NULL) ? (sire->ire_flags &
+ (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */
+ (sire != NULL) ?
+ &(sire->ire_uinfo) : &(save_ire->ire_uinfo));
if (ire == NULL) {
ire_refrele(ipif_ire);
ire_refrele(save_ire);
break;
}
-
ire->ire_marks |= ire_marks;
/*
@@ -7288,20 +7330,23 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
* The newly created ire is tied to both of them via
* the phandle and ihandle respectively.
*/
- IRB_REFHOLD(sire->ire_bucket);
- /* Has it been removed already ? */
- if (sire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(sire->ire_bucket);
- ire_refrele(ipif_ire);
- ire_refrele(save_ire);
- break;
+ if (sire != NULL) {
+ IRB_REFHOLD(sire->ire_bucket);
+ /* Has it been removed already ? */
+ if (sire->ire_marks & IRE_MARK_CONDEMNED) {
+ IRB_REFRELE(sire->ire_bucket);
+ ire_refrele(ipif_ire);
+ ire_refrele(save_ire);
+ break;
+ }
}
IRB_REFHOLD(ipif_ire->ire_bucket);
/* Has it been removed already ? */
if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) {
IRB_REFRELE(ipif_ire->ire_bucket);
- IRB_REFRELE(sire->ire_bucket);
+ if (sire != NULL)
+ IRB_REFRELE(sire->ire_bucket);
ire_refrele(ipif_ire);
ire_refrele(save_ire);
break;
@@ -7325,8 +7370,10 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
ire_refrele(save_ire);
/* Assert that sire is not deleted yet. */
- ASSERT(sire->ire_ptpn != NULL);
- IRB_REFRELE(sire->ire_bucket);
+ if (sire != NULL) {
+ ASSERT(sire->ire_ptpn != NULL);
+ IRB_REFRELE(sire->ire_bucket);
+ }
/* Assert that ipif_ire is not deleted yet. */
ASSERT(ipif_ire->ire_ptpn != NULL);
@@ -7349,8 +7396,8 @@ ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, ill_t *in_ill, conn_t *connp)
multirt_resolve_next = B_TRUE;
continue;
}
-
- ire_refrele(sire);
+ if (sire != NULL)
+ ire_refrele(sire);
ipif_refrele(src_ipif);
ill_refrele(dst_ill);
return;
@@ -9086,12 +9133,19 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option,
if (addr != INADDR_ANY || checkonly) {
ASSERT(connp != NULL);
- ipif = ipif_lookup_addr(addr, NULL, connp->conn_zoneid,
- CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, &error);
+ if (option == IP_NEXTHOP) {
+ ipif =
+ ipif_lookup_onlink_addr(addr, connp->conn_zoneid);
+ } else {
+ ipif = ipif_lookup_addr(addr, NULL, connp->conn_zoneid,
+ CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt,
+ &error);
+ }
if (ipif == NULL) {
if (error == EINPROGRESS)
return (error);
- else if (option == IP_MULTICAST_IF)
+ else if ((option == IP_MULTICAST_IF) ||
+ (option == IP_NEXTHOP))
return (EHOSTUNREACH);
else
return (EINVAL);
@@ -9156,6 +9210,10 @@ ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option,
case IP_MULTICAST_IF:
connp->conn_multicast_ipif = ipif;
break;
+ case IP_NEXTHOP:
+ connp->conn_nexthop_v4 = addr;
+ connp->conn_nexthop_set = B_TRUE;
+ break;
}
if (ipif != NULL) {
@@ -9472,6 +9530,7 @@ ip_opt_set(queue_t *q, uint_t optset_context, int level, int name,
break;
case IPPROTO_IP:
switch (name) {
+ case IP_NEXTHOP:
case IP_MULTICAST_IF:
case IP_DONTFAILOVER_IF: {
ipaddr_t addr = *i1;
@@ -10268,6 +10327,12 @@ ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
return (sizeof (int));
case IP_SEC_OPT:
return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4));
+ case IP_NEXTHOP:
+ if (connp->conn_nexthop_set) {
+ *(ipaddr_t *)ptr = connp->conn_nexthop_v4;
+ return (sizeof (ipaddr_t));
+ } else
+ return (0);
default:
break;
}
@@ -17748,6 +17813,9 @@ ip_output(void *arg, mblk_t *mp, void *arg2, int caller)
zoneid_t zoneid;
boolean_t need_decref = B_FALSE;
boolean_t ignore_dontroute = B_FALSE;
+ boolean_t ignore_nexthop = B_FALSE;
+ boolean_t ip_nexthop = B_FALSE;
+ ipaddr_t nexthop_addr;
#ifdef _BIG_ENDIAN
#define V_HLEN (v_hlen_tos_len >> 24)
@@ -17850,20 +17918,21 @@ ip_output(void *arg, mblk_t *mp, void *arg2, int caller)
if (CLASSD(dst))
goto multicast;
- if ((connp->conn_dontroute) || (connp->conn_xmit_if_ill != NULL)) {
+ if ((connp->conn_dontroute) || (connp->conn_xmit_if_ill != NULL) ||
+ (connp->conn_nexthop_set)) {
/*
* If the destination is a broadcast or a loopback
- * address, both SO_DONTROUTE and IP_XMIT_IF go
+ * address, SO_DONTROUTE, IP_XMIT_IF and IP_NEXTHOP go
* through the standard path. But in the case of local
- * destination only SO_DONTROUTE goes through the
- * standard path not IP_XMIT_IF.
+ * destination only SO_DONTROUTE and IP_NEXTHOP go through
+ * the standard path not IP_XMIT_IF.
*/
ire = ire_cache_lookup(dst, zoneid);
if ((ire == NULL) || ((ire->ire_type != IRE_BROADCAST) &&
(ire->ire_type != IRE_LOOPBACK))) {
-
- if ((connp->conn_dontroute) && (ire != NULL) &&
- (ire->ire_type == IRE_LOCAL))
+ if ((connp->conn_dontroute ||
+ connp->conn_nexthop_set) && (ire != NULL) &&
+ (ire->ire_type == IRE_LOCAL))
goto standard_path;
if (ire != NULL) {
@@ -17875,8 +17944,13 @@ ip_output(void *arg, mblk_t *mp, void *arg2, int caller)
* bypass routing checks and go directly to
* interface.
*/
- if (connp->conn_dontroute)
+ if (connp->conn_dontroute) {
goto dontroute;
+ } else if (connp->conn_nexthop_set) {
+ ip_nexthop = B_TRUE;
+ nexthop_addr = connp->conn_nexthop_v4;
+ goto send_from_ill;
+ }
/*
* If IP_XMIT_IF socket option is set,
@@ -18227,52 +18301,69 @@ qnext:
io = (ipsec_out_t *)first_mp->b_rptr;
if (io->ipsec_out_attach_if ||
- io->ipsec_out_xmit_if) {
+ io->ipsec_out_xmit_if ||
+ io->ipsec_out_ip_nexthop) {
ill_t *ill;
- ASSERT(io->ipsec_out_ill_index != 0);
- ifindex = io->ipsec_out_ill_index;
- ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
- NULL, NULL, NULL, NULL);
/*
- * ipsec_out_xmit_if bit is used to tell
- * ip_wput to use the ill to send outgoing data
- * as we have no conn when data comes from ICMP
- * error msg routines. Currently this feature is
- * only used by ip_mrtun_forward routine.
+ * We may have lost the conn context if we are
+ * coming here from ip_newroute(). Copy the
+ * nexthop information.
*/
- if (io->ipsec_out_xmit_if) {
- xmit_ill = ill;
- if (xmit_ill == NULL) {
- ip1dbg(("ip_wput: bad ifindex for"
- "xmit_ill %d\n", ifindex));
- freemsg(first_mp);
- BUMP_MIB(&ip_mib, ipOutDiscards);
- ASSERT(!need_decref);
- return;
- }
- /* Free up the ipsec_out_t mblk */
- ASSERT(first_mp->b_cont == mp);
- first_mp->b_cont = NULL;
- freeb(first_mp);
- /* Just send the IP header+ICMP+data */
- first_mp = mp;
+ if (io->ipsec_out_ip_nexthop) {
+ ip_nexthop = B_TRUE;
+ nexthop_addr = io->ipsec_out_nexthop_addr;
+
ipha = (ipha_t *)mp->b_rptr;
dst = ipha->ipha_dst;
goto send_from_ill;
-
} else {
- attach_ill = ill;
- }
+ ASSERT(io->ipsec_out_ill_index != 0);
+ ifindex = io->ipsec_out_ill_index;
+ ill = ill_lookup_on_ifindex(ifindex, B_FALSE,
+ NULL, NULL, NULL, NULL);
+ /*
+ * ipsec_out_xmit_if bit is used to tell
+ * ip_wput to use the ill to send outgoing data
+ * as we have no conn when data comes from ICMP
+ * error msg routines. Currently this feature is
+ * only used by ip_mrtun_forward routine.
+ */
+ if (io->ipsec_out_xmit_if) {
+ xmit_ill = ill;
+ if (xmit_ill == NULL) {
+ ip1dbg(("ip_output:bad ifindex "
+ "for xmit_ill %d\n",
+ ifindex));
+ freemsg(first_mp);
+ BUMP_MIB(&ip_mib,
+ ipOutDiscards);
+ ASSERT(!need_decref);
+ return;
+ }
+ /* Free up the ipsec_out_t mblk */
+ ASSERT(first_mp->b_cont == mp);
+ first_mp->b_cont = NULL;
+ freeb(first_mp);
+ /* Just send the IP header+ICMP+data */
+ first_mp = mp;
+ ipha = (ipha_t *)mp->b_rptr;
+ dst = ipha->ipha_dst;
+ goto send_from_ill;
+ } else {
+ attach_ill = ill;
+ }
- if (attach_ill == NULL) {
- ASSERT(xmit_ill == NULL);
- ip1dbg(("ip_wput : bad ifindex for "
- "(BIND TO IPIF_NOFAILOVER) %d\n", ifindex));
- freemsg(first_mp);
- BUMP_MIB(&ip_mib, ipOutDiscards);
- ASSERT(!need_decref);
- return;
+ if (attach_ill == NULL) {
+ ASSERT(xmit_ill == NULL);
+ ip1dbg(("ip_output: bad ifindex for "
+ "(BIND TO IPIF_NOFAILOVER) %d\n",
+ ifindex));
+ freemsg(first_mp);
+ BUMP_MIB(&ip_mib, ipOutDiscards);
+ ASSERT(!need_decref);
+ return;
+ }
}
}
}
@@ -18711,6 +18802,7 @@ qnext:
if ((ire != NULL) && (ire->ire_type &
(IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) {
ignore_dontroute = B_TRUE;
+ ignore_nexthop = B_TRUE;
}
if (ire != NULL) {
ire_refrele(ire);
@@ -18853,6 +18945,16 @@ send_from_ill:
if (need_decref)
CONN_DEC_REF(connp);
return;
+ } else if (ip_nexthop || (connp != NULL &&
+ (connp->conn_nexthop_set)) && !ignore_nexthop) {
+ if (!ip_nexthop) {
+ ip_nexthop = B_TRUE;
+ nexthop_addr = connp->conn_nexthop_v4;
+ }
+ match_flags = MATCH_IRE_MARK_PRIVATE_ADDR |
+ MATCH_IRE_GW;
+ ire = ire_ctable_lookup(dst, nexthop_addr, 0,
+ NULL, zoneid, match_flags);
} else {
ire = ire_cache_lookup(dst, zoneid);
}
@@ -18861,7 +18963,8 @@ send_from_ill:
* Make sure we don't load spread if this
* is IPIF_NOFAILOVER case.
*/
- if (attach_ill != NULL) {
+ if ((attach_ill != NULL) ||
+ (ip_nexthop && !ignore_nexthop)) {
if (mctl_present) {
io = (ipsec_out_t *)first_mp->b_rptr;
ASSERT(first_mp->b_datap->db_type ==
@@ -18890,9 +18993,15 @@ send_from_ill:
first_mp->b_cont = mp;
mctl_present = B_TRUE;
}
- io->ipsec_out_ill_index = attach_ill->
- ill_phyint->phyint_ifindex;
- io->ipsec_out_attach_if = B_TRUE;
+ if (attach_ill != NULL) {
+ io->ipsec_out_ill_index = attach_ill->
+ ill_phyint->phyint_ifindex;
+ io->ipsec_out_attach_if = B_TRUE;
+ } else {
+ io->ipsec_out_ip_nexthop = ip_nexthop;
+ io->ipsec_out_nexthop_addr =
+ nexthop_addr;
+ }
}
noirefound:
/*
diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c
index ed4c13a8e2..c1c903ff25 100644
--- a/usr/src/uts/common/inet/ip/ip_ire.c
+++ b/usr/src/uts/common/inet/ip/ip_ire.c
@@ -4393,6 +4393,18 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
(ire->ire_marks & IRE_MARK_HIDDEN))
return (B_FALSE);
+ /*
+ * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option
+ * is used. In that case the routing table is bypassed and the
+ * packets are sent directly to the specified nexthop. The
+ * IRE_CACHE entry representing this route should be marked
+ * with IRE_MARK_PRIVATE_ADDR.
+ */
+
+ if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) &&
+ (ire->ire_marks & IRE_MARK_PRIVATE_ADDR))
+ return (B_FALSE);
+
if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid) {
/*
* If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
@@ -4498,6 +4510,9 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
((!(match_flags & MATCH_IRE_MARK_HIDDEN)) ||
(ire->ire_type != IRE_CACHE ||
ire->ire_marks & IRE_MARK_HIDDEN)) &&
+ ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) ||
+ (ire->ire_type != IRE_CACHE ||
+ ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) &&
((!(match_flags & MATCH_IRE_ILL)) ||
(ire_ill == ipif_ill)) &&
((!(match_flags & MATCH_IRE_IHANDLE)) ||
@@ -5005,8 +5020,10 @@ ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid)
irb_ptr = &ip_cache_table[IRE_ADDR_HASH(addr, ip_cache_table_size)];
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_HIDDEN))
+ if (ire->ire_marks & (IRE_MARK_CONDEMNED |
+ IRE_MARK_HIDDEN | IRE_MARK_PRIVATE_ADDR)) {
continue;
+ }
if (ire->ire_addr == addr) {
if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
ire->ire_type == IRE_LOCAL) {
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index f55bb7d6ce..8a9f611fab 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -363,6 +363,7 @@ typedef struct ip_mdt_info_s {
*/
#define CONN_IS_MD_FASTPATH(connp) \
((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \
+ !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \
(connp)->conn_nofailover_ill == NULL && /* IPIF_NOFAILOVER */ \
(connp)->conn_xmit_if_ill == NULL && /* IP_XMIT_IF */ \
(connp)->conn_outgoing_pill == NULL && /* IP{V6}_BOUND_PIF */ \
diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h
index 5ed3cff368..2e4a3b99db 100644
--- a/usr/src/uts/common/inet/ip_ire.h
+++ b/usr/src/uts/common/inet/ip_ire.h
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -126,6 +126,8 @@ extern "C" {
#define MATCH_IRE_ZONEONLY 0x4000 /* Match IREs in specified zone, ie */
/* don't match IRE_LOCALs from other */
/* zones or shared IREs */
+#define MATCH_IRE_MARK_PRIVATE_ADDR 0x8000 /* Match IRE ire_marks with */
+ /* IRE_MARK_PRIVATE_ADDR. */
/* Structure for ire_cache_count() */
typedef struct {
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index a5148c57c0..85302c350b 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -174,7 +174,8 @@ struct conn_s {
conn_recvif : 1, /* IP_RECVIF option */
conn_recvslla : 1, /* IP_RECVSLLA option */
conn_mdt_ok : 1, /* MDT is permitted */
- pad_to_bit_31 : 2;
+ conn_nexthop_set : 1,
+ pad_to_bit_31 : 1;
tcp_t *conn_tcp; /* Pointer to the tcp struct */
udp_t *conn_udp; /* Pointer to the udp struct */
@@ -257,6 +258,8 @@ struct conn_s {
/* mtuinfo from IPV6_PACKET_TOO_BIG conditional on conn_pathmtu_valid */
struct ip6_mtuinfo mtuinfo;
zoneid_t conn_zoneid; /* zone connection is in */
+ in6_addr_t conn_nexthop_v6; /* nexthop IP address */
+#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6)
#ifdef CONN_DEBUG
#define CONN_TRACE_MAX 10
int conn_trace_last; /* ndx of last used tracebuf */
diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h
index 554dcdf0c1..f83f4e216b 100644
--- a/usr/src/uts/common/inet/ipsec_info.h
+++ b/usr/src/uts/common/inet/ipsec_info.h
@@ -219,7 +219,8 @@ typedef struct ipsec_out_s {
* messages are to be trusted by all receivers.
*/
ipsec_out_icmp_loopback: 1,
- ipsec_out_pad_bits : 12;
+ ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */
+ ipsec_out_pad_bits : 11;
cred_t *ipsec_out_cred;
uint32_t ipsec_out_capab_ill_index;
@@ -235,6 +236,8 @@ typedef struct ipsec_out_s {
crypto_data_t ipsec_out_crypto_mac; /* to store the MAC */
zoneid_t ipsec_out_zoneid; /* source zone for the datagram */
+ in6_addr_t ipsec_out_nexthop_v6; /* nexthop IP address */
+#define ipsec_out_nexthop_addr V4_PART_OF_V6(ipsec_out_nexthop_v6)
} ipsec_out_t;
/*
diff --git a/usr/src/uts/common/inet/sctp/sctp_opt_data.c b/usr/src/uts/common/inet/sctp/sctp_opt_data.c
index 0bbd1cf47b..8299f6cd7b 100644
--- a/usr/src/uts/common/inet/sctp/sctp_opt_data.c
+++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c
@@ -44,6 +44,7 @@
#include <netinet/ip6.h>
#include <inet/ip.h>
#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
#include <inet/ipclassifier.h>
#include <inet/ipsec_impl.h>
@@ -796,6 +797,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
int *i1 = (int *)ptr;
int retval = 0;
int buflen = *optlen;
+ conn_t *connp = sctp->sctp_connp;
ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp;
/* In most cases, the return buffer is just an int */
*optlen = sizeof (int32_t);
@@ -1041,6 +1043,14 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen)
case IP_TTL:
*i1 = (int)sctp->sctp_ipha->ipha_ttl;
break;
+ case IP_NEXTHOP:
+ if (connp->conn_nexthop_set) {
+ *(ipaddr_t *)ptr = connp->conn_nexthop_v4;
+ *optlen = sizeof (ipaddr_t);
+ } else {
+ *optlen = 0;
+ }
+ break;
default:
retval = EINVAL;
break;
@@ -1487,6 +1497,37 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp,
case IP_UNSPEC_SRC:
connp->conn_unspec_src = onoff;
break;
+ case IP_NEXTHOP: {
+ ipaddr_t addr = *i1;
+ ipif_t *ipif = NULL;
+ ill_t *ill;
+
+ if (secpolicy_net(CRED(), OP_CONFIG, B_TRUE) == 0) {
+ ipif =
+ ipif_lookup_onlink_addr(addr,
+ connp->conn_zoneid);
+ if (ipif == NULL) {
+ retval = EHOSTUNREACH;
+ break;
+ }
+ ill = ipif->ipif_ill;
+ mutex_enter(&ill->ill_lock);
+ if ((ill->ill_state_flags & ILL_CONDEMNED) ||
+ (ipif->ipif_state_flags & IPIF_CONDEMNED)) {
+ mutex_exit(&ill->ill_lock);
+ ipif_refrele(ipif);
+ retval = EHOSTUNREACH;
+ break;
+ }
+ mutex_exit(&ill->ill_lock);
+ ipif_refrele(ipif);
+ mutex_enter(&connp->conn_lock);
+ connp->conn_nexthop_v4 = addr;
+ connp->conn_nexthop_set = B_TRUE;
+ mutex_exit(&connp->conn_lock);
+ }
+ break;
+ }
default:
retval = EINVAL;
break;
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index a9ebe742ae..61495f4705 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -27,7 +27,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
-const char tcp_version[] = "%Z%%M% %I% %E% SMI";
+const char tcp_version[] = "@(#)tcp.c 1.490 05/11/29 SMI";
#include <sys/types.h>
#include <sys/stream.h>
@@ -2468,7 +2468,7 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
tcp_hsp_t *hsp;
ire_t *ire;
ire_t *sire = NULL;
- iulp_t *ire_uinfo;
+ iulp_t *ire_uinfo = NULL;
uint32_t mss_max;
uint32_t mss;
boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
@@ -2486,32 +2486,58 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
BUMP_MIB(&ip_mib, ipInDiscards);
return (0);
}
-
- ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, zoneid);
- if (ire != NULL) {
- ire_cacheable = B_TRUE;
- ire_uinfo = (ire_mp != NULL) ?
- &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
- &ire->ire_uinfo;
-
- } else {
- if (ire_mp == NULL) {
+ /*
+ * If IP_NEXTHOP is set, then look for an IRE_CACHE
+ * for the destination with the nexthop as gateway.
+ * ire_ctable_lookup() is used because this particular
+ * ire, if it exists, will be marked private.
+ * If that is not available, use the interface ire
+ * for the nexthop.
+ */
+ if (tcp->tcp_connp->conn_nexthop_set) {
+ ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
+ tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
+ MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW);
+ if (ire == NULL) {
ire = ire_ftable_lookup(
- tcp->tcp_connp->conn_rem,
- 0, 0, 0, NULL, &sire, zoneid, 0,
- (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT));
+ tcp->tcp_connp->conn_nexthop_v4,
+ 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
+ MATCH_IRE_TYPE);
if (ire == NULL)
return (0);
- ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
+ } else {
+ ire_uinfo = &ire->ire_uinfo;
+ }
+ } else {
+ ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
+ zoneid);
+ if (ire != NULL) {
+ ire_cacheable = B_TRUE;
+ ire_uinfo = (ire_mp != NULL) ?
+ &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
&ire->ire_uinfo;
+
} else {
- ire = (ire_t *)ire_mp->b_rptr;
- ire_uinfo =
- &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
+ if (ire_mp == NULL) {
+ ire = ire_ftable_lookup(
+ tcp->tcp_connp->conn_rem,
+ 0, 0, 0, NULL, &sire, zoneid, 0,
+ (MATCH_IRE_RECURSIVE |
+ MATCH_IRE_DEFAULT));
+ if (ire == NULL)
+ return (0);
+ ire_uinfo = (sire != NULL) ?
+ &sire->ire_uinfo :
+ &ire->ire_uinfo;
+ } else {
+ ire = (ire_t *)ire_mp->b_rptr;
+ ire_uinfo =
+ &((ire_t *)
+ ire_mp->b_rptr)->ire_uinfo;
+ }
}
}
ASSERT(ire != NULL);
- ASSERT(ire_uinfo != NULL);
if ((ire->ire_src_addr == INADDR_ANY) ||
(ire->ire_type & IRE_BROADCAST)) {
@@ -2550,7 +2576,19 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
tcp->tcp_ipha->ipha_fragment_offset_and_flags =
htons(IPH_DF);
}
- tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
+ /*
+ * If ire_uinfo is NULL, this is the IRE_INTERFACE case
+ * for IP_NEXTHOP. No cache ire has been found for the
+ * destination and we are working with the nexthop's
+ * interface ire. Since we need to forward all packets
+ * to the nexthop first, we "blindly" set tcp_localnet
+ * to false, even though the destination may also be
+ * onlink.
+ */
+ if (ire_uinfo == NULL)
+ tcp->tcp_localnet = 0;
+ else
+ tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
} else {
/*
* For incoming connection ire_mp = NULL
@@ -2662,94 +2700,100 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
* Make use of the cached rtt and rtt_sd values to calculate the
* initial RTO. Note that they are already initialized in
* tcp_init_values().
- */
- if (ire_uinfo->iulp_rtt != 0) {
- clock_t rto;
-
- tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
- tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5);
-
- if (rto > tcp_rexmit_interval_max) {
- tcp->tcp_rto = tcp_rexmit_interval_max;
- } else if (rto < tcp_rexmit_interval_min) {
- tcp->tcp_rto = tcp_rexmit_interval_min;
- } else {
- tcp->tcp_rto = rto;
+ * If ire_uinfo is NULL, i.e., we do not have a cache ire for
+ * IP_NEXTHOP, but instead are using the interface ire for the
+ * nexthop, then we do not use the ire_uinfo from that ire to
+ * do any initializations.
+ */
+ if (ire_uinfo != NULL) {
+ if (ire_uinfo->iulp_rtt != 0) {
+ clock_t rto;
+
+ tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
+ tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
+ rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
+ tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5);
+
+ if (rto > tcp_rexmit_interval_max) {
+ tcp->tcp_rto = tcp_rexmit_interval_max;
+ } else if (rto < tcp_rexmit_interval_min) {
+ tcp->tcp_rto = tcp_rexmit_interval_min;
+ } else {
+ tcp->tcp_rto = rto;
+ }
+ }
+ if (ire_uinfo->iulp_ssthresh != 0)
+ tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
+ else
+ tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
+ if (ire_uinfo->iulp_spipe > 0) {
+ tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
+ tcp_max_buf);
+ if (tcp_snd_lowat_fraction != 0)
+ tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
+ tcp_snd_lowat_fraction;
+ (void) tcp_maxpsz_set(tcp, B_TRUE);
}
- }
- if (ire_uinfo->iulp_ssthresh != 0)
- tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
- else
- tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
- if (ire_uinfo->iulp_spipe > 0) {
- tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
- tcp_max_buf);
- if (tcp_snd_lowat_fraction != 0)
- tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
- tcp_snd_lowat_fraction;
- (void) tcp_maxpsz_set(tcp, B_TRUE);
- }
- /*
- * Note that up till now, acceptor always inherits receive
- * window from the listener. But if there is a metrics associated
- * with a host, we should use that instead of inheriting it from
- * listener. Thus we need to pass this info back to the caller.
- */
- if (ire_uinfo->iulp_rpipe > 0) {
- tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf);
- } else {
/*
- * For passive open, set tcp_rwnd to 0 so that the caller
- * knows that there is no rpipe metric for this connection.
+ * Note that up till now, acceptor always inherits receive
+ * window from the listener. But if there are metrics
+ * associated with a host, we should use those instead of
+ * inheriting it from listener. Thus we need to pass this
+ * info back to the caller.
*/
- if (tcp_detached)
- tcp->tcp_rwnd = 0;
- }
- if (ire_uinfo->iulp_rtomax > 0) {
- tcp->tcp_second_timer_threshold = ire_uinfo->iulp_rtomax;
- }
+ if (ire_uinfo->iulp_rpipe > 0) {
+ tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf);
+ }
+
+ if (ire_uinfo->iulp_rtomax > 0) {
+ tcp->tcp_second_timer_threshold =
+ ire_uinfo->iulp_rtomax;
+ }
- /*
- * Use the metric option settings, iulp_tstamp_ok and iulp_wscale_ok,
- * only for active open. What this means is that if the other side
- * uses timestamp or window scale option, TCP will also use those
- * options. That is for passive open. If the application sets a
- * large window, window scale is enabled regardless of the value in
- * iulp_wscale_ok. This is the behavior since 2.6. So we keep it.
- * The only case left in passive open processing is the check for SACK.
- *
- * For ECN, it should probably be like SACK. But the current
- * value is binary, so we treat it like the other cases. The
- * metric only controls active open. For passive open, the ndd
- * param, tcp_ecn_permitted, controls the behavior.
- */
- if (!tcp_detached) {
- /*
- * The if check means that the following can only be turned
- * on by the metrics only IRE, but not off.
- */
- if (ire_uinfo->iulp_tstamp_ok)
- tcp->tcp_snd_ts_ok = B_TRUE;
- if (ire_uinfo->iulp_wscale_ok)
- tcp->tcp_snd_ws_ok = B_TRUE;
- if (ire_uinfo->iulp_sack == 2)
- tcp->tcp_snd_sack_ok = B_TRUE;
- if (ire_uinfo->iulp_ecn_ok)
- tcp->tcp_ecn_ok = B_TRUE;
- } else {
/*
- * Passive open.
- *
- * As above, the if check means that SACK can only be
- * turned on by the metric only IRE.
+ * Use the metric option settings, iulp_tstamp_ok and
+ * iulp_wscale_ok, only for active open. What this means
+ * is that if the other side uses timestamp or window
+ * scale option, TCP will also use those options. That
+ * is for passive open. If the application sets a
+ * large window, window scale is enabled regardless of
+ * the value in iulp_wscale_ok. This is the behavior
+ * since 2.6. So we keep it.
+ * The only case left in passive open processing is the
+ * check for SACK.
+ * For ECN, it should probably be like SACK. But the
+ * current value is binary, so we treat it like the other
+ * cases. The metric only controls active open. For passive
+ * open, the ndd param, tcp_ecn_permitted, controls the
+ * behavior.
*/
- if (ire_uinfo->iulp_sack > 0) {
- tcp->tcp_snd_sack_ok = B_TRUE;
+ if (!tcp_detached) {
+ /*
+ * The if check means that the following can only
+ * be turned on by the metrics only IRE, but not off.
+ */
+ if (ire_uinfo->iulp_tstamp_ok)
+ tcp->tcp_snd_ts_ok = B_TRUE;
+ if (ire_uinfo->iulp_wscale_ok)
+ tcp->tcp_snd_ws_ok = B_TRUE;
+ if (ire_uinfo->iulp_sack == 2)
+ tcp->tcp_snd_sack_ok = B_TRUE;
+ if (ire_uinfo->iulp_ecn_ok)
+ tcp->tcp_ecn_ok = B_TRUE;
+ } else {
+ /*
+ * Passive open.
+ *
+ * As above, the if check means that SACK can only be
+ * turned on by the metric only IRE.
+ */
+ if (ire_uinfo->iulp_sack > 0) {
+ tcp->tcp_snd_sack_ok = B_TRUE;
+ }
}
}
+
/*
* XXX: Note that currently, ire_max_frag can be as small as 68
* because of PMTUd. So tcp_mss may go to negative if combined
@@ -5308,11 +5352,24 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
tcp->tcp_second_ctimer_threshold;
/*
+ * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics.
+ * If it does not, the eager's receive window will be set to the
+ * listener's receive window later in this function.
+ */
+ eager->tcp_rwnd = 0;
+
+ /*
* Zones: tcp_adapt_ire() and tcp_send_data() both need the
* zone id before the accept is completed in tcp_wput_accept().
*/
econnp->conn_zoneid = connp->conn_zoneid;
+ /* Copy nexthop information from listener to eager */
+ if (connp->conn_nexthop_set) {
+ econnp->conn_nexthop_set = connp->conn_nexthop_set;
+ econnp->conn_nexthop_v4 = connp->conn_nexthop_v4;
+ }
+
eager->tcp_hard_binding = B_TRUE;
tcp_bind_hash_insert(&tcp_bind_fanout[
@@ -7662,6 +7719,7 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_ms_we_have_waited = 0;
tcp->tcp_last_recv_time = lbolt;
tcp->tcp_cwnd_max = tcp_cwnd_max_;
+ tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
tcp->tcp_snd_burst = TCP_CWND_INFINITE;
tcp->tcp_maxpsz = tcp_maxpsz_multiplier;
@@ -9362,6 +9420,9 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
case IP_TTL:
*i1 = (int)tcp->tcp_ipha->ipha_ttl;
break;
+ case IP_NEXTHOP:
+ /* Handled at IP level */
+ return (-EINVAL);
default:
return (-1);
}
@@ -9912,6 +9973,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
}
break;
case IP_BOUND_IF:
+ case IP_NEXTHOP:
/* Handled at the IP level */
return (-EINVAL);
case IP_SEC_OPT:
@@ -17725,6 +17787,7 @@ tcp_zcopy_check(tcp_t *tcp)
IPCL_IS_CONNECTED(connp) &&
(connp->conn_flags & IPCL_CHECK_POLICY) == 0 &&
connp->conn_dontroute == 0 &&
+ !connp->conn_nexthop_set &&
connp->conn_xmit_if_ill == NULL &&
connp->conn_nofailover_ill == NULL &&
do_tcpzcopy == 1) {
@@ -17895,6 +17958,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
!IPCL_IS_CONNECTED(connp) ||
(connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
connp->conn_dontroute ||
+ connp->conn_nexthop_set ||
connp->conn_xmit_if_ill != NULL ||
connp->conn_nofailover_ill != NULL ||
ipha->ipha_ident == IP_HDR_INCLUDED ||
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index d0ab3d8130..2dd6db079e 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -140,6 +140,9 @@ opdes_t tcp_opt_arr[] = {
{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
+{ IP_NEXTHOP, IPPROTO_IP, OA_RW, OA_RW, OP_CONFIG, OP_PASSNEXT,
+ sizeof (in_addr_t), -1 /* not initialized */ },
+
{ IPV6_BOUND_PIF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT,
sizeof (int), 0 /* no ifindex */ },
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index d804018911..c13d7c485f 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -2989,6 +2989,9 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr)
case IP_TTL:
*i1 = (int)udp->udp_ttl;
break; /* goto sizeof (int) option return */
+ case IP_NEXTHOP:
+ /* Handled at IP level */
+ return (-EINVAL);
case IP_MULTICAST_IF:
/* 0 address if not set */
*(ipaddr_t *)ptr = udp->udp_multicast_if_addr;
@@ -3418,6 +3421,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level,
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case IP_SEC_OPT:
+ case IP_NEXTHOP:
/*
* "soft" error (negative)
* option not handled at this level
@@ -6054,6 +6058,7 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
(ire->ire_flags & RTF_MULTIRT) || ire->ire_stq == NULL ||
ire->ire_max_frag < ntohs(ipha->ipha_length) ||
(ire_fp_mp = ire->ire_fp_mp) == NULL ||
+ (connp->conn_nexthop_set) ||
(ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) {
if (ipif != NULL)
ipif_refrele(ipif);
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index 33429cc59a..328f2cb44b 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -137,6 +137,9 @@ opdes_t udp_opt_arr[] = {
{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT,
sizeof (int), 0 },
+{ IP_NEXTHOP, IPPROTO_IP, OA_RW, OA_RW, OP_CONFIG, OP_PASSNEXT,
+ sizeof (in_addr_t), -1 /* not initialized */ },
+
{ MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP,
(OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req),
-1 /* not initialized */ },
diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h
index 6e7f4066bb..7bed8be4b9 100644
--- a/usr/src/uts/common/netinet/in.h
+++ b/usr/src/uts/common/netinet/in.h
@@ -807,7 +807,8 @@ struct sockaddr_in6 {
#define IP_BLOCK_SOURCE 0x15 /* block mcast pkts from source */
#define IP_UNBLOCK_SOURCE 0x16 /* unblock mcast pkts from source */
#define IP_ADD_SOURCE_MEMBERSHIP 0x17 /* add mcast group/source pair */
-#define IP_DROP_SOURCE_MEMBERSHIP 0x18 /* drop mcast gruop/source pair */
+#define IP_DROP_SOURCE_MEMBERSHIP 0x18 /* drop mcast group/source pair */
+#define IP_NEXTHOP 0x19 /* send directly to next hop */
#if !defined(_XPG4_2) || defined(__EXTENSIONS__)
/*